// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

static struct bio_set btrfs_bioset;

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);

	if (!profile)
		return BTRFS_RAID_SINGLE;

	return BTRFS_BG_FLAG_TO_INDEX(profile);
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

int btrfs_nr_parity_stripes(u64 type)
{
	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);

	return btrfs_raid_array[index].nparity;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including the terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a sufficiently
	 * large buffer.
	 */
out_overflow:;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op, u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     struct btrfs_io_stripe *smap,
			     int *mirror_num_ret, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files.
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount, either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by e.g.
 * the scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

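/*
 * Free every device still on @fs_devices and then the fs_devices structure
 * itself. Must only be called on an fs_devices that is not opened.
 */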
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

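/*
 * Look up a registered fs_devices by @fsid; when @metadata_fsid is given,
 * both UUIDs must match.
 */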
static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}

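/*
 * Open the block device at @device_path, optionally flush it, set the btrfs
 * blocksize and read the primary superblock into @disk_super. On failure
 * *bdev is reset to NULL and an error is returned.
 */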
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		sync_blockdev(*bdev);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

/*
 * Search and remove all stale devices (which are not mounted). When both
 * inputs are NULL, it will search and release all stale devices.
 *
 * @devt:	 Optional. When provided, it will release all unmounted devices
 *		 matching this devt only.
 * @skip_device: Optional. Will skip this device when searching for the stale
 *		 devices.
 *
 * Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	if (devt)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && devt != device->devt)
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (devt && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	if (bdev_max_discard_sectors(bdev))
		fs_devices->discardable = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it. Meaning our fsid will be different from theirs.
	 * We need to handle two subcases:
	 * 1 - The fs still continues to have different METADATA/FSID uuids.
	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
	 *     are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}


static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	error = lookup_bdev(path, &path_devt);
	if (error)
		return ERR_PTR(error);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
			       BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);
		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *     a. The same device disappeared and reappeared with
		 *        different name. or
		 *     b. The missing-disk-which-was-replaced, has
		 *        reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

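/*
 * Create a deep copy of @orig: a new fs_devices with a freshly allocated
 * btrfs_device (devid, uuid, name and zone info) for each device on the
 * original list. Requires uuid_mutex to be held.
 */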
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
						 GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		if (orig_dev->zone_info) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = btrfs_clone_dev_zone_info(orig_dev);
			if (!zone_info) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			device->zone_info = zone_info;
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of
		 * BTRFS_DEV_REPLACE_DEVID in btrfs_init_dev_replace() so just
		 * continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

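/* Sync and invalidate a writeable device before dropping our bdev reference. */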
static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

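/*
 * Close the block device and reset all in-memory state of @device so the
 * structure is pristine for a future mount.
 */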
static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be further
	 * committed. However that error might be transient and if we unmount the
	 * filesystem and mount it again, we should allow the mount to succeed
	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
	 * filesystem again we still get flush errors, then we will again abort
	 * any transaction and set the error state, guaranteeing no commits of
	 * unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
}

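/*
 * Drop one open reference on @fs_devices; when the last reference is dropped,
 * close every device and clear the opened/seeding state.
 */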
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

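/*
 * Open all devices of @fs_devices and remember the one with the highest
 * generation as latest_dev. A device that fails to open with -ENODATA is
 * removed from the list and freed.
 */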
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
			   fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

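/* list_sort() comparator: order devices by ascending devid. */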
static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

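/* Drop the page reference taken when the superblock was read via the page cache. */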
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

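/*
 * Unregister any unmounted device matching @devt; a @devt of 0 releases all
 * stale (unmounted) devices.
 */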
int btrfs_forget_devices(dev_t devt)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added)
		btrfs_free_stale_devices(device->devt, device);

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {
		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

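/*
 * Return the lowest offset at or above @start where a new dev extent may be
 * placed, depending on the chunk allocation policy of the device.
 */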
static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}

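/*
 * Zoned variant of the hole check: shrink or advance the hole until it covers
 * only allocatable, empty zones. Returns true if the hole was modified.
 */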
static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

0b86a832 1533/*
499f377f
JM
1534 * find_free_dev_extent_start - find free space in the specified device
1535 * @device: the device which we search the free space in
1536 * @num_bytes: the size of the free space that we need
1537 * @search_start: the position from which to begin the search
1538 * @start: store the start of the free space.
1539 * @len: the size of the free space. that we find, or the size
1540 * of the max free space if we don't find suitable free space
7bfc837d 1541 *
0b86a832
CM
1542 * this uses a pretty simple search, the expectation is that it is
1543 * called very infrequently and that a given device has a small number
1544 * of extents
7bfc837d
MX
1545 *
1546 * @start is used to store the start of the free space if we find. But if we
1547 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find. But if we
 * don't find suitable free space, it is used to store the size of the max
 * free space.
 *
 * NOTE: This function searches the *commit* root of the device tree and does
 * an extra check to ensure dev extents are not double allocated. This makes
 * it safe to use for allocating new dev extents, but it may not report the
 * correct usable device space, as a device extent freed in the current
 * transaction is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				      u64 num_bytes, u64 search_start, u64 *start,
				      u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need, it
			 * must be the largest free space that we have found
			 * so far, so max_hole_start must point to the start
			 * of this free space and the length of this free
			 * space is stored in max_hole_size.  Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of allocated dev
	 * extents, and when shrinking the device, search_end may be smaller
	 * than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
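
/*
 * Example (illustrative sketch, not from the original file): a caller that
 * needs a 1 GiB hole on @device would use the helper above roughly like:
 *
 *	u64 start, len;
 *	int ret = find_free_dev_extent(device, SZ_1G, &start, &len);
 *
 * On success, [start, start + SZ_1G) is free as seen by the commit root; on
 * -ENOSPC, len holds the size of the largest hole that was found instead.
 */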

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}
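
/*
 * Note: because the next chunk offset is taken from the end of the last
 * extent map above (rb_last), logical chunk addresses grow monotonically;
 * holes left behind by removed chunks are not reused by this helper.
 */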

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
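
/*
 * The lookup above relies on a common btree pattern: search for the largest
 * possible key (offset == (u64)-1), then step back one item with
 * btrfs_previous_item() to land on the highest existing devid; the next free
 * devid is then found_key.offset + 1, or 1 when no device item exists yet.
 */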

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, true);
	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	btrfs_trans_release_chunk_metadata(trans);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probing, e.g. by libblkid.
 *
 * We don't care about errors here, this is just to be kind to userspace.
 */
static void update_dev_time(const char *device_path)
{
	struct path path;
	struct timespec64 now;
	int ret;

	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
	if (ret)
		return;

	now = current_time(d_inode(path.dentry));
	inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
	path_put(&path);
}
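
/*
 * Example (illustrative): both the device add and device remove paths call
 * update_dev_time(device_path) after the on-disk super blocks change, so
 * that ctime/mtime based scanners such as libblkid notice the device has
 * been modified and re-probe it.
 */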

static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	btrfs_trans_release_chunk_metadata(trans);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding, e.g.,
 * device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
					u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}
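
/*
 * Example (illustrative): on a filesystem whose metadata uses RAID1
 * (devs_min == 2 in btrfs_raid_array), calling this with num_devices == 1,
 * e.g. when removing a device from a two-device array, returns
 * BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET instead of 0.
 */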

static struct btrfs_device *btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_dev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * available.
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
					    struct btrfs_device *next_device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;

	if (!next_device)
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
							    device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
	    (fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
		fs_info->fs_devices->latest_dev = next_device;
}

/*
 * Return btrfs_fs_devices::num_devices excluding the device that's being
 * currently replaced.
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = fs_info->fs_devices->num_devices;

	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		ASSERT(num_devices > 1);
		num_devices--;
	}
	up_read(&fs_info->dev_replace.rwsem);

	return num_devices;
}
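
/*
 * The dev_replace rwsem is taken for reading above only to get a stable
 * answer to "is a replace running": the replace target is counted in
 * num_devices but must not count towards RAID profile constraints, hence
 * the decrement while a replace is ongoing.
 */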

void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
			       struct block_device *bdev,
			       const char *device_path)
{
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
		struct page *page;
		int ret;

		disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
		if (IS_ERR(disk_super))
			continue;

		if (bdev_is_zoned(bdev)) {
			btrfs_reset_sb_log_zones(bdev, copy_num);
			continue;
		}

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));

		page = virt_to_page(disk_super);
		set_page_dirty(page);
		lock_page(page);
		/* write_one_page() unlocks the page */
		ret = write_one_page(page);
		if (ret)
			btrfs_warn(fs_info,
				   "error clearing superblock number %d (%d)",
				   copy_num, ret);
		btrfs_release_disk_super(disk_super);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}
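
/*
 * Example (illustrative): this is called from btrfs_rm_device() and
 * btrfs_destroy_dev_replace_tgtdev() once a device has been detached, so
 * that a later scan of the device does not find stale btrfs super blocks.
 * Only the magic of each super block copy is cleared; the rest of the
 * structure is left in place.
 */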

int btrfs_rm_device(struct btrfs_fs_info *fs_info,
		    struct btrfs_dev_lookup_args *args,
		    struct block_device **bdev, fmode_t *mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
		return -EINVAL;
	}

	/*
	 * The device list in fs_devices is accessed without locks (neither
	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
	 * filesystem and another device rm cannot run.
	 */
	num_devices = btrfs_num_devices(fs_info);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		return ret;

	device = btrfs_find_device(fs_info->fs_devices, args);
	if (!device) {
		if (args->missing)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = -ENOENT;
		return ret;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		return -ETXTBSY;
	}

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return BTRFS_ERROR_DEV_TGT_REPLACE;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1)
		return BTRFS_ERROR_DEV_ONLY_WRITABLE;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_undo;
	}

	ret = btrfs_rm_dev_item(trans, device);
	if (ret) {
		/* Any error in dev item removal is critical */
		btrfs_crit(fs_info,
			   "failed to remove device item for devid %llu: %d",
			   device->devid, ret);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * The device list mutex makes sure that we don't change the device
	 * list while someone else is writing out all the device supers.
	 * Whoever is writing all supers should lock the device list mutex
	 * before getting the number of devices in the super block
	 * (super_copy). Conversely, whoever updates the number of devices in
	 * the super block (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases cur_devices == fs_devices. But when deleting a seed
	 * device, cur_devices points to the seed's own fs_devices listed
	 * under fs_devices->seed_list.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * At this point, the device is zero sized and detached from the
	 * devices list.  All that's left is to zero out the old supers and
	 * free the device.
	 *
	 * We cannot call btrfs_close_bdev() here because we're holding the sb
	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
	 * block device and its dependencies.  Instead just flush the device
	 * and let the caller do the final blkdev_put.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}

	*bdev = device->bdev;
	*mode = device->mode;
	synchronize_rcu();
	btrfs_free_device(device);

	/*
	 * This can happen if cur_devices is the private seed devices list. We
	 * cannot call close_fs_devices() here because it expects the uuid_mutex
	 * to be held, but in fact we don't need that for the private
	 * seed_devices, we can simply decrement cur_devices->opened and then
	 * remove it from our list and free the fs_devices.
	 */
	if (cur_devices->num_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		ASSERT(cur_devices->opened == 1);
		cur_devices->opened--;
		free_fs_devices(cur_devices);
	}

	ret = btrfs_commit_transaction(trans);

	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of a filesystem with no seed, srcdev->fs_devices points to
	 * the fs_devices of fs_info. However, when the device being replaced
	 * is a seed device, it points to the seed's local fs_devices. In
	 * short, srcdev has its correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* If there are no more devices, delete the fs_devices. */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * devices left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		list_del_init(&fs_devices->seed_list);
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}

/**
 * btrfs_get_dev_args_from_path - populate args from the device at @path
 *
 * @fs_info:	the filesystem
 * @args:	the args to populate
 * @path:	the path to the device
 *
 * This will read the super block of the device at @path and populate @args
 * with the devid, fsid, and uuid. This is meant to be used for ioctls that
 * need to lookup a device to operate on, but need to do it before we take any
 * locks. This properly handles the special case of "missing" that a user may
 * pass in, and does some basic sanity checks. The caller must make sure that
 * @path is properly NUL terminated before calling in, and must call
 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
 * uuid buffers.
 *
 * Return: 0 for success, -errno for failure
 */
int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
				 struct btrfs_dev_lookup_args *args,
				 const char *path)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	int ret;

	if (!path || !path[0])
		return -EINVAL;
	if (!strcmp(path, "missing")) {
		args->missing = true;
		return 0;
	}

	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
	if (!args->uuid || !args->fsid) {
		btrfs_put_dev_args_from_path(args);
		return -ENOMEM;
	}

	ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
				    &bdev, &disk_super);
	if (ret) {
		btrfs_put_dev_args_from_path(args);
		return ret;
	}

	args->devid = btrfs_stack_device_id(&disk_super->dev_item);
	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
	else
		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, FMODE_READ);
	return 0;
}
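
/*
 * Example (illustrative): the expected calling pattern, mirroring what
 * btrfs_find_device_by_devspec() below does:
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *	ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
 *	if (ret)
 *		return ERR_PTR(ret);
 *	device = btrfs_find_device(fs_info->fs_devices, &args);
 *	btrfs_put_dev_args_from_path(&args);
 */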

/*
 * Only use this jointly with btrfs_get_dev_args_from_path(), because that
 * helper allocates the ->uuid and ->fsid pointers; everybody else uses local
 * variables that don't need to be freed.
 */
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
{
	kfree(args->uuid);
	kfree(args->fsid);
	args->uuid = NULL;
	args->fsid = NULL;
}

struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret;

	if (devid) {
		args.devid = devid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
		if (!device)
			return ERR_PTR(-ENOENT);
		return device;
	}

	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
	if (ret)
		return ERR_PTR(ret);
	device = btrfs_find_device(fs_info->fs_devices, &args);
	btrfs_put_dev_args_from_path(&args);
	if (!device)
		return ERR_PTR(-ENOENT);
	return device;
}

static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return ERR_PTR(-EINVAL);

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return seed_devices;

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple seed filesystems.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return old_devices;
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	return seed_devices;
}

/*
 * Splice seed devices into the sprout fs_devices.
 * Generate a new fsid for the sprouted read-write filesystem.
 */
static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *seed_devices)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	/*
	 * We are updating the fsid, the thread leading to device_list_add()
	 * could race, so uuid_mutex is needed.
	 */
	lockdep_assert_held(&uuid_mutex);

	/*
	 * The threads listed below may traverse dev_list but can do that without
	 * device_list_mutex:
	 * - All device ops and balance - as we are in btrfs_exclop_start.
	 * - Various dev_list readers - are using RCU.
	 * - btrfs_ioctl_fitrim() - is using RCU.
	 *
	 * For-read threads as below are using device_list_mutex:
	 * - Readonly scrub btrfs_scrub_dev()
	 * - Readonly scrub btrfs_scrub_progress()
	 * - btrfs_get_dev_stats()
	 */
	lockdep_assert_held(&fs_devices->device_list_mutex);

	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);
}
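
/*
 * Taken together, the seeding flow used by btrfs_init_new_device() below is:
 * btrfs_init_sprout() builds the private seed fs_devices copy,
 * btrfs_setup_sprout() splices the seed devices onto it and generates the new
 * fsid, and btrfs_finish_sprout() then records the expected generation in the
 * seed device items.
 */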

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		btrfs_reserve_chunk_metadata(trans, false);
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		btrfs_trans_release_chunk_metadata(trans);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		args.devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		args.uuid = dev_uuid;
		args.fsid = fs_uuid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *seed_devices;
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
	int ret = 0;
	bool seeding_dev = false;
	bool locked = false;

	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
		ret = -EINVAL;
		goto error;
	}

	if (fs_devices->seeding) {
		seeding_dev = true;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	device->fs_info = fs_info;
	device->bdev = bdev;
	ret = lookup_bdev(device_path, &device->devt);
	if (ret)
		goto error_free_device;

	ret = btrfs_get_dev_zone_info(device, false);
	if (ret)
		goto error_free_device;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_zone;
	}

	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes =
		round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		btrfs_clear_sb_rdonly(sb);

		/* GFP_KERNEL allocation must not be under device_list_mutex */
		seed_devices = btrfs_init_sprout(fs_info);
		if (IS_ERR(seed_devices)) {
			ret = PTR_ERR(seed_devices);
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	mutex_lock(&fs_devices->device_list_mutex);
	if (seeding_dev) {
		btrfs_setup_sprout(fs_info, seed_devices);
		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
						device);
	}

	device->fs_devices = fs_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * We've got more storage, clear any full flags on the space infos.
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_setup_sprout().
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device->devt);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(device_path);

	return ret;

error_sysfs:
	btrfs_sysfs_remove_device(device);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (seeding_dev)
		btrfs_set_sb_rdonly(sb);
	if (trans)
		btrfs_end_transaction(trans);
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}

static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total;
	u64 diff;
	int ret;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
	mutex_unlock(&fs_info->chunk_mutex);

	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
}
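
/*
 * Example (illustrative): growing a device to 100 GiB inside an open
 * transaction; new_size is rounded down to the sector size internally:
 *
 *	ret = btrfs_grow_device(trans, device, 100ULL * SZ_1G);
 *
 * The in-memory and super block sizes are updated under chunk_mutex, and the
 * device item itself is updated in the chunk tree before returning.
 */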

static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	lockdep_assert_held(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}
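
/*
 * The sys_chunk_array parsed above is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk including its stripes) pairs,
 * so each entry's length is sizeof(*disk_key) plus
 * btrfs_chunk_item_size(num_stripes), and deleting an entry is a simple
 * memmove() of the tail of the array over it.
 */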

/*
 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
 * @logical: Logical block offset in bytes.
 * @length: Length of extent in bytes.
 *
 * Return: Chunk mapping or ERR_PTR.
 */
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
				       u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* Callers are responsible for dropping em's ref. */
	return em;
}
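
/*
 * Example (illustrative): every caller must balance a successful lookup with
 * a ref drop, as btrfs_remove_chunk() below does:
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);
 */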

static int remove_chunk_item(struct btrfs_trans_handle *trans,
			     struct map_lookup *map, u64 chunk_offset)
{
	int i;

	/*
	 * Removing chunk items and updating the device items in the chunks btree
	 * requires holding the chunk_mutex.
	 * See the comment at btrfs_chunk_alloc() for the details.
	 */
	lockdep_assert_held(&trans->fs_info->chunk_mutex);

	for (i = 0; i < map->num_stripes; i++) {
		int ret;

		ret = btrfs_update_device(trans, map->stripes[i].dev);
		if (ret)
			return ret;
	}

	return btrfs_free_chunk(trans, chunk_offset);
}

int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;

	/*
	 * First delete the device extent items from the devices btree.
	 * We take the device_list_mutex to avoid racing with the finishing phase
	 * of a device replace operation. See the comment below before acquiring
	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
	 * because that can result in a deadlock when deleting the device extent
	 * items from the devices btree - COWing an extent buffer from the btree
	 * may result in allocating a new metadata chunk, which would attempt to
	 * lock again fs_info->chunk_mutex.
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 * reserve system space, do all chunk btree updates and deletions, and
	 * update the system chunk array in the superblock while holding this
	 * mutex. This is for similar reasons as explained on the comment at
	 * the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 * that replaces the device object associated with the map's stripes,
	 * because the device object's id can change at any time during that
	 * final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 * replaced device and then see it with an ID of
	 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 * the device item, which does not exist on the chunk btree.
	 * The finishing phase of device replace acquires both the
	 * device_list_mutex and the chunk_mutex, in that order, so we are
	 * safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_chunk() above, a scrub turned the only system block
	 * group with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_create_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = remove_chunk_item(trans, map, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	mutex_unlock(&fs_info->chunk_mutex);
	trans->removing_chunk = false;

	/*
	 * We are done with chunk btree updates and deletions, so release the
	 * system space we previously reserved (with check_system_chunk()).
	 */
	btrfs_trans_release_chunk_metadata(trans);

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	if (trans->removing_chunk) {
		mutex_unlock(&fs_info->chunk_mutex);
		trans->removing_chunk = false;
	}
	/* once for us */
	free_extent_map(em);
	return ret;
}
2b82032c 3232
18bb8bbf 3233int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
47ab2a6c 3234{
5b4aacef 3235 struct btrfs_root *root = fs_info->chunk_root;
19c4d2f9 3236 struct btrfs_trans_handle *trans;
b0643e59 3237 struct btrfs_block_group *block_group;
01e86008 3238 u64 length;
47ab2a6c 3239 int ret;
2b82032c 3240
4b349253
JB
3241 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3242 btrfs_err(fs_info,
3243 "relocate: not supported on extent tree v2 yet");
3244 return -EINVAL;
3245 }
3246
67c5e7d4
FM
3247 /*
3248 * Prevent races with automatic removal of unused block groups.
3249 * After we relocate and before we remove the chunk with offset
3250 * chunk_offset, automatic removal of the block group can kick in,
3251 * resulting in a failure when calling btrfs_remove_chunk() below.
3252 *
3253 * Make sure to acquire this mutex before doing a tree search (dev
3254 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3255 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3256 * we release the path used to search the chunk/dev tree and before
3257 * the current task acquires this mutex and calls us.
3258 */
f3372065 3259 lockdep_assert_held(&fs_info->reclaim_bgs_lock);
67c5e7d4 3260
47ab2a6c 3261 /* step one, relocate all the extents inside this chunk */
2ff7e61e 3262 btrfs_scrub_pause(fs_info);
0b246afa 3263 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2ff7e61e 3264 btrfs_scrub_continue(fs_info);
47ab2a6c
JB
3265 if (ret)
3266 return ret;
3267
b0643e59
DZ
3268 block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3269 if (!block_group)
3270 return -ENOENT;
3271 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
01e86008 3272 length = block_group->length;
b0643e59
DZ
3273 btrfs_put_block_group(block_group);
3274
01e86008
JT
3275 /*
3276 * On a zoned file system, discard the whole block group; this will
3277 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3278 * resetting the zone fails, don't treat it as a fatal problem from the
3279 * filesystem's point of view.
3280 */
3281 if (btrfs_is_zoned(fs_info)) {
3282 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3283 if (ret)
3284 btrfs_info(fs_info,
3285 "failed to reset zone %llu after relocation",
3286 chunk_offset);
3287 }
3288
19c4d2f9
CM
3289 trans = btrfs_start_trans_remove_block_group(root->fs_info,
3290 chunk_offset);
3291 if (IS_ERR(trans)) {
3292 ret = PTR_ERR(trans);
3293 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3294 return ret;
3295 }
3296
47ab2a6c 3297 /*
19c4d2f9
CM
3298 * step two, delete the device extents and the
3299 * chunk tree entries
47ab2a6c 3300 */
97aff912 3301 ret = btrfs_remove_chunk(trans, chunk_offset);
3a45bb20 3302 btrfs_end_transaction(trans);
19c4d2f9 3303 return ret;
2b82032c
YZ
3304}
3305
2ff7e61e 3306static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2b82032c 3307{
0b246afa 3308 struct btrfs_root *chunk_root = fs_info->chunk_root;
2b82032c
YZ
3309 struct btrfs_path *path;
3310 struct extent_buffer *leaf;
3311 struct btrfs_chunk *chunk;
3312 struct btrfs_key key;
3313 struct btrfs_key found_key;
2b82032c 3314 u64 chunk_type;
ba1bf481
JB
3315 bool retried = false;
3316 int failed = 0;
2b82032c
YZ
3317 int ret;
3318
3319 path = btrfs_alloc_path();
3320 if (!path)
3321 return -ENOMEM;
3322
ba1bf481 3323again:
2b82032c
YZ
3324 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3325 key.offset = (u64)-1;
3326 key.type = BTRFS_CHUNK_ITEM_KEY;
3327
3328 while (1) {
f3372065 3329 mutex_lock(&fs_info->reclaim_bgs_lock);
2b82032c 3330 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
67c5e7d4 3331 if (ret < 0) {
f3372065 3332 mutex_unlock(&fs_info->reclaim_bgs_lock);
2b82032c 3333 goto error;
67c5e7d4 3334 }
79787eaa 3335 BUG_ON(ret == 0); /* Corruption */
2b82032c
YZ
3336
3337 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3338 key.type);
67c5e7d4 3339 if (ret)
f3372065 3340 mutex_unlock(&fs_info->reclaim_bgs_lock);
2b82032c
YZ
3341 if (ret < 0)
3342 goto error;
3343 if (ret > 0)
3344 break;
1a40e23b 3345
2b82032c
YZ
3346 leaf = path->nodes[0];
3347 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1a40e23b 3348
2b82032c
YZ
3349 chunk = btrfs_item_ptr(leaf, path->slots[0],
3350 struct btrfs_chunk);
3351 chunk_type = btrfs_chunk_type(leaf, chunk);
b3b4aa74 3352 btrfs_release_path(path);
8f18cf13 3353
2b82032c 3354 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
0b246afa 3355 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
ba1bf481
JB
3356 if (ret == -ENOSPC)
3357 failed++;
14586651
HS
3358 else
3359 BUG_ON(ret);
2b82032c 3360 }
f3372065 3361 mutex_unlock(&fs_info->reclaim_bgs_lock);
8f18cf13 3362
2b82032c
YZ
3363 if (found_key.offset == 0)
3364 break;
3365 key.offset = found_key.offset - 1;
3366 }
3367 ret = 0;
ba1bf481
JB
3368 if (failed && !retried) {
3369 failed = 0;
3370 retried = true;
3371 goto again;
fae7f21c 3372 } else if (WARN_ON(failed && retried)) {
ba1bf481
JB
3373 ret = -ENOSPC;
3374 }
2b82032c
YZ
3375error:
3376 btrfs_free_path(path);
3377 return ret;
8f18cf13
CM
3378}
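/*
 * Illustrative sketch, not part of the original file: the failed/retried
 * pair above implements a single full retry pass, because an -ENOSPC on
 * one system chunk may be resolved by space freed while relocating a
 * later one. The control flow reduces to:
 *
 *    failed = 0; retried = false;
 * again:
 *    for each system chunk:
 *        if (btrfs_relocate_chunk() == -ENOSPC) failed++;
 *    if (failed && !retried) { failed = 0; retried = true; goto again; }
 *    else if (failed) ret = -ENOSPC;
 */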
3379
a6f93c71
LB
3380/*
3381 * return 1 : allocated a data chunk successfully,
3382 * return <0: error while allocating a data chunk,
3383 * return 0 : no need to allocate a data chunk.
3384 */
3385static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3386 u64 chunk_offset)
3387{
32da5386 3388 struct btrfs_block_group *cache;
a6f93c71
LB
3389 u64 bytes_used;
3390 u64 chunk_type;
3391
3392 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3393 ASSERT(cache);
3394 chunk_type = cache->flags;
3395 btrfs_put_block_group(cache);
3396
5ae21692
JT
3397 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3398 return 0;
3399
3400 spin_lock(&fs_info->data_sinfo->lock);
3401 bytes_used = fs_info->data_sinfo->bytes_used;
3402 spin_unlock(&fs_info->data_sinfo->lock);
3403
3404 if (!bytes_used) {
3405 struct btrfs_trans_handle *trans;
3406 int ret;
3407
3408 trans = btrfs_join_transaction(fs_info->tree_root);
3409 if (IS_ERR(trans))
3410 return PTR_ERR(trans);
3411
3412 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3413 btrfs_end_transaction(trans);
3414 if (ret < 0)
3415 return ret;
3416 return 1;
a6f93c71 3417 }
5ae21692 3418
a6f93c71
LB
3419 return 0;
3420}
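/*
 * Illustrative usage, inferred from the callers later in this file
 * (__btrfs_balance() and btrfs_shrink_device()), not a new API: the
 * helper is called right before relocating a chunk so that the last
 * data chunk's raid profile is preserved:
 *
 *    ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
 *    if (ret < 0)
 *        goto error;
 *    ret = btrfs_relocate_chunk(fs_info, chunk_offset);
 */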
3421
6bccf3ab 3422static int insert_balance_item(struct btrfs_fs_info *fs_info,
0940ebf6
ID
3423 struct btrfs_balance_control *bctl)
3424{
6bccf3ab 3425 struct btrfs_root *root = fs_info->tree_root;
0940ebf6
ID
3426 struct btrfs_trans_handle *trans;
3427 struct btrfs_balance_item *item;
3428 struct btrfs_disk_balance_args disk_bargs;
3429 struct btrfs_path *path;
3430 struct extent_buffer *leaf;
3431 struct btrfs_key key;
3432 int ret, err;
3433
3434 path = btrfs_alloc_path();
3435 if (!path)
3436 return -ENOMEM;
3437
3438 trans = btrfs_start_transaction(root, 0);
3439 if (IS_ERR(trans)) {
3440 btrfs_free_path(path);
3441 return PTR_ERR(trans);
3442 }
3443
3444 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3445 key.type = BTRFS_TEMPORARY_ITEM_KEY;
0940ebf6
ID
3446 key.offset = 0;
3447
3448 ret = btrfs_insert_empty_item(trans, root, path, &key,
3449 sizeof(*item));
3450 if (ret)
3451 goto out;
3452
3453 leaf = path->nodes[0];
3454 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3455
b159fa28 3456 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
0940ebf6
ID
3457
3458 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3459 btrfs_set_balance_data(leaf, item, &disk_bargs);
3460 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3461 btrfs_set_balance_meta(leaf, item, &disk_bargs);
3462 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3463 btrfs_set_balance_sys(leaf, item, &disk_bargs);
3464
3465 btrfs_set_balance_flags(leaf, item, bctl->flags);
3466
3467 btrfs_mark_buffer_dirty(leaf);
3468out:
3469 btrfs_free_path(path);
3a45bb20 3470 err = btrfs_commit_transaction(trans);
0940ebf6
ID
3471 if (err && !ret)
3472 ret = err;
3473 return ret;
3474}
3475
6bccf3ab 3476static int del_balance_item(struct btrfs_fs_info *fs_info)
0940ebf6 3477{
6bccf3ab 3478 struct btrfs_root *root = fs_info->tree_root;
0940ebf6
ID
3479 struct btrfs_trans_handle *trans;
3480 struct btrfs_path *path;
3481 struct btrfs_key key;
3482 int ret, err;
3483
3484 path = btrfs_alloc_path();
3485 if (!path)
3486 return -ENOMEM;
3487
3502a8c0 3488 trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
0940ebf6
ID
3489 if (IS_ERR(trans)) {
3490 btrfs_free_path(path);
3491 return PTR_ERR(trans);
3492 }
3493
3494 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 3495 key.type = BTRFS_TEMPORARY_ITEM_KEY;
0940ebf6
ID
3496 key.offset = 0;
3497
3498 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3499 if (ret < 0)
3500 goto out;
3501 if (ret > 0) {
3502 ret = -ENOENT;
3503 goto out;
3504 }
3505
3506 ret = btrfs_del_item(trans, root, path);
3507out:
3508 btrfs_free_path(path);
3a45bb20 3509 err = btrfs_commit_transaction(trans);
0940ebf6
ID
3510 if (err && !ret)
3511 ret = err;
3512 return ret;
3513}
3514
59641015
ID
3515/*
3516 * This is a heuristic used to reduce the number of chunks balanced on
3517 * resume after balance was interrupted.
3518 */
3519static void update_balance_args(struct btrfs_balance_control *bctl)
3520{
3521 /*
3522 * Turn on soft mode for chunk types that were being converted.
3523 */
3524 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3525 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3526 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3527 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3528 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3529 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3530
3531 /*
3532 * Turn on usage filter if it is not already used. The idea is
3533 * that chunks that we have already balanced should be
3534 * reasonably full. Don't do it for chunks that are being
3535 * converted - that will keep us from relocating unconverted
3536 * (albeit full) chunks.
3537 */
3538 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3539 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3540 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3541 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3542 bctl->data.usage = 90;
3543 }
3544 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3545 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3546 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3547 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3548 bctl->sys.usage = 90;
3549 }
3550 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
bc309467 3551 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
59641015
ID
3552 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3553 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3554 bctl->meta.usage = 90;
3555 }
3556}
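/*
 * Worked example (illustrative): a balance interrupted while running
 * with "-dconvert=raid1 -m" resumes as if it had been started with
 * "-dconvert=raid1,soft -musage=90". The converting data args keep
 * BTRFS_BALANCE_ARGS_CONVERT and gain _SOFT, while the non-converting
 * metadata args gain usage=90 so chunks the first run already filled
 * are skipped.
 */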
3557
149196a2
DS
3558/*
3559 * Clear the balance status in fs_info and delete the balance item from disk.
3560 */
3561static void reset_balance_state(struct btrfs_fs_info *fs_info)
c9e9f97b
ID
3562{
3563 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
149196a2 3564 int ret;
c9e9f97b
ID
3565
3566 BUG_ON(!fs_info->balance_ctl);
3567
3568 spin_lock(&fs_info->balance_lock);
3569 fs_info->balance_ctl = NULL;
3570 spin_unlock(&fs_info->balance_lock);
3571
3572 kfree(bctl);
149196a2
DS
3573 ret = del_balance_item(fs_info);
3574 if (ret)
3575 btrfs_handle_fs_error(fs_info, ret, NULL);
c9e9f97b
ID
3576}
3577
ed25e9b2
ID
3578/*
3579 * Balance filters. Return 1 if chunk should be filtered out
3580 * (should not be balanced).
3581 */
899c81ea 3582static int chunk_profiles_filter(u64 chunk_type,
ed25e9b2
ID
3583 struct btrfs_balance_args *bargs)
3584{
899c81ea
ID
3585 chunk_type = chunk_to_extended(chunk_type) &
3586 BTRFS_EXTENDED_PROFILE_MASK;
ed25e9b2 3587
899c81ea 3588 if (bargs->profiles & chunk_type)
ed25e9b2
ID
3589 return 0;
3590
3591 return 1;
3592}
3593
dba72cb3 3594static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
5ce5b3c0 3595 struct btrfs_balance_args *bargs)
bc309467 3596{
32da5386 3597 struct btrfs_block_group *cache;
bc309467
DS
3598 u64 chunk_used;
3599 u64 user_thresh_min;
3600 u64 user_thresh_max;
3601 int ret = 1;
3602
3603 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
bf38be65 3604 chunk_used = cache->used;
bc309467
DS
3605
3606 if (bargs->usage_min == 0)
3607 user_thresh_min = 0;
3608 else
b3470b5d
DS
3609 user_thresh_min = div_factor_fine(cache->length,
3610 bargs->usage_min);
bc309467
DS
3611
3612 if (bargs->usage_max == 0)
3613 user_thresh_max = 1;
3614 else if (bargs->usage_max > 100)
b3470b5d 3615 user_thresh_max = cache->length;
bc309467 3616 else
b3470b5d
DS
3617 user_thresh_max = div_factor_fine(cache->length,
3618 bargs->usage_max);
bc309467
DS
3619
3620 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3621 ret = 0;
3622
3623 btrfs_put_block_group(cache);
3624 return ret;
3625}
3626
dba72cb3 3627static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
bc309467 3628 u64 chunk_offset, struct btrfs_balance_args *bargs)
5ce5b3c0 3629{
32da5386 3630 struct btrfs_block_group *cache;
5ce5b3c0
ID
3631 u64 chunk_used, user_thresh;
3632 int ret = 1;
3633
3634 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
bf38be65 3635 chunk_used = cache->used;
5ce5b3c0 3636
bc309467 3637 if (bargs->usage_min == 0)
3e39cea6 3638 user_thresh = 1;
a105bb88 3639 else if (bargs->usage > 100)
b3470b5d 3640 user_thresh = cache->length;
a105bb88 3641 else
b3470b5d 3642 user_thresh = div_factor_fine(cache->length, bargs->usage);
a105bb88 3643
5ce5b3c0
ID
3644 if (chunk_used < user_thresh)
3645 ret = 0;
3646
3647 btrfs_put_block_group(cache);
3648 return ret;
3649}
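/*
 * The threshold math above as a standalone sketch. Assumption stated
 * here rather than taken from this hunk: div_factor_fine(num, factor)
 * computes num * factor / 100. With a 1 GiB block group and usage=50
 * the threshold is 512 MiB, so a chunk with 200 MiB used returns 0
 * (relocate it) and one with 700 MiB used returns 1 (filter it out).
 */
static inline u64 usage_threshold_sketch(u64 length, u64 usage)
{
    if (usage == 0)
        return 1;                     /* matches the usage_min == 0 case */
    if (usage > 100)
        return length;                /* whole block group */
    return length * usage / 100;      /* div_factor_fine() equivalent */
}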
3650
409d404b
ID
3651static int chunk_devid_filter(struct extent_buffer *leaf,
3652 struct btrfs_chunk *chunk,
3653 struct btrfs_balance_args *bargs)
3654{
3655 struct btrfs_stripe *stripe;
3656 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3657 int i;
3658
3659 for (i = 0; i < num_stripes; i++) {
3660 stripe = btrfs_stripe_nr(chunk, i);
3661 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3662 return 0;
3663 }
3664
3665 return 1;
3666}
3667
946c9256
DS
3668static u64 calc_data_stripes(u64 type, int num_stripes)
3669{
3670 const int index = btrfs_bg_flags_to_raid_index(type);
3671 const int ncopies = btrfs_raid_array[index].ncopies;
3672 const int nparity = btrfs_raid_array[index].nparity;
3673
d58ede8d 3674 return (num_stripes - nparity) / ncopies;
946c9256
DS
3675}
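/*
 * Worked examples: per btrfs_raid_array at the top of this file, RAID10
 * has ncopies=2 and nparity=0, so a 4-stripe chunk carries
 * (4 - 0) / 2 = 2 stripes of data. For the parity profiles (values
 * assumed from btrfs_raid_array, not shown in this hunk): RAID5 with
 * ncopies=1, nparity=1 turns 4 stripes into 3 data stripes, and RAID6
 * with nparity=2 into 2.
 */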
3676
94e60d5a
ID
3677/* [pstart, pend) */
3678static int chunk_drange_filter(struct extent_buffer *leaf,
3679 struct btrfs_chunk *chunk,
94e60d5a
ID
3680 struct btrfs_balance_args *bargs)
3681{
3682 struct btrfs_stripe *stripe;
3683 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3684 u64 stripe_offset;
3685 u64 stripe_length;
946c9256 3686 u64 type;
94e60d5a
ID
3687 int factor;
3688 int i;
3689
3690 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3691 return 0;
3692
946c9256
DS
3693 type = btrfs_chunk_type(leaf, chunk);
3694 factor = calc_data_stripes(type, num_stripes);
94e60d5a
ID
3695
3696 for (i = 0; i < num_stripes; i++) {
3697 stripe = btrfs_stripe_nr(chunk, i);
3698 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3699 continue;
3700
3701 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3702 stripe_length = btrfs_chunk_length(leaf, chunk);
b8b93add 3703 stripe_length = div_u64(stripe_length, factor);
94e60d5a
ID
3704
3705 if (stripe_offset < bargs->pend &&
3706 stripe_offset + stripe_length > bargs->pstart)
3707 return 0;
3708 }
3709
3710 return 1;
3711}
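/*
 * Worked example (illustrative): a 2 GiB RAID0 chunk striped over two
 * devices has factor = 2 data stripes, so each stripe covers
 * 2 GiB / 2 = 1 GiB of physical bytes on its device. A stripe at device
 * offset 5 GiB occupies [5 GiB, 6 GiB) and therefore overlaps a drange
 * of pstart=5.5 GiB, pend=8 GiB, so the chunk is kept for balancing.
 */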
3712
ea67176a
ID
3713/* [vstart, vend) */
3714static int chunk_vrange_filter(struct extent_buffer *leaf,
3715 struct btrfs_chunk *chunk,
3716 u64 chunk_offset,
3717 struct btrfs_balance_args *bargs)
3718{
3719 if (chunk_offset < bargs->vend &&
3720 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3721 /* at least part of the chunk is inside this vrange */
3722 return 0;
3723
3724 return 1;
3725}
3726
dee32d0a
GAP
3727static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3728 struct btrfs_chunk *chunk,
3729 struct btrfs_balance_args *bargs)
3730{
3731 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3732
3733 if (bargs->stripes_min <= num_stripes
3734 && num_stripes <= bargs->stripes_max)
3735 return 0;
3736
3737 return 1;
3738}
3739
899c81ea 3740static int chunk_soft_convert_filter(u64 chunk_type,
cfa4c961
ID
3741 struct btrfs_balance_args *bargs)
3742{
3743 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3744 return 0;
3745
899c81ea
ID
3746 chunk_type = chunk_to_extended(chunk_type) &
3747 BTRFS_EXTENDED_PROFILE_MASK;
cfa4c961 3748
899c81ea 3749 if (bargs->target == chunk_type)
cfa4c961
ID
3750 return 1;
3751
3752 return 0;
3753}
3754
6ec0896c 3755static int should_balance_chunk(struct extent_buffer *leaf,
f43ffb60
ID
3756 struct btrfs_chunk *chunk, u64 chunk_offset)
3757{
6ec0896c 3758 struct btrfs_fs_info *fs_info = leaf->fs_info;
0b246afa 3759 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
f43ffb60
ID
3760 struct btrfs_balance_args *bargs = NULL;
3761 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3762
3763 /* type filter */
3764 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3765 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3766 return 0;
3767 }
3768
3769 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3770 bargs = &bctl->data;
3771 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3772 bargs = &bctl->sys;
3773 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3774 bargs = &bctl->meta;
3775
ed25e9b2
ID
3776 /* profiles filter */
3777 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3778 chunk_profiles_filter(chunk_type, bargs)) {
3779 return 0;
5ce5b3c0
ID
3780 }
3781
3782 /* usage filter */
3783 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
0b246afa 3784 chunk_usage_filter(fs_info, chunk_offset, bargs)) {
5ce5b3c0 3785 return 0;
bc309467 3786 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
0b246afa 3787 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
bc309467 3788 return 0;
409d404b
ID
3789 }
3790
3791 /* devid filter */
3792 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3793 chunk_devid_filter(leaf, chunk, bargs)) {
3794 return 0;
94e60d5a
ID
3795 }
3796
3797 /* drange filter, makes sense only with devid filter */
3798 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
e4ff5fb5 3799 chunk_drange_filter(leaf, chunk, bargs)) {
94e60d5a 3800 return 0;
ea67176a
ID
3801 }
3802
3803 /* vrange filter */
3804 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3805 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3806 return 0;
ed25e9b2
ID
3807 }
3808
dee32d0a
GAP
3809 /* stripes filter */
3810 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3811 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3812 return 0;
3813 }
3814
cfa4c961
ID
3815 /* soft profile changing mode */
3816 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3817 chunk_soft_convert_filter(chunk_type, bargs)) {
3818 return 0;
3819 }
3820
7d824b6f
DS
3821 /*
3822 * limited by count, must be the last filter
3823 */
3824 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3825 if (bargs->limit == 0)
3826 return 0;
3827 else
3828 bargs->limit--;
12907fc7
DS
3829 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3830 /*
3831 * Same logic as the 'limit' filter; the minimum cannot be
01327610 3832 * determined here because we do not have the global information
12907fc7
DS
3833 * about the count of all chunks that satisfy the filters.
3834 */
3835 if (bargs->limit_max == 0)
3836 return 0;
3837 else
3838 bargs->limit_max--;
7d824b6f
DS
3839 }
3840
f43ffb60
ID
3841 return 1;
3842}
3843
c9e9f97b 3844static int __btrfs_balance(struct btrfs_fs_info *fs_info)
ec44a35c 3845{
19a39dce 3846 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
c9e9f97b 3847 struct btrfs_root *chunk_root = fs_info->chunk_root;
12907fc7 3848 u64 chunk_type;
f43ffb60 3849 struct btrfs_chunk *chunk;
5a488b9d 3850 struct btrfs_path *path = NULL;
ec44a35c 3851 struct btrfs_key key;
ec44a35c 3852 struct btrfs_key found_key;
f43ffb60
ID
3853 struct extent_buffer *leaf;
3854 int slot;
c9e9f97b
ID
3855 int ret;
3856 int enospc_errors = 0;
19a39dce 3857 bool counting = true;
12907fc7 3858 /* The single value limit and min/max limits use the same bytes in the union */
7d824b6f
DS
3859 u64 limit_data = bctl->data.limit;
3860 u64 limit_meta = bctl->meta.limit;
3861 u64 limit_sys = bctl->sys.limit;
12907fc7
DS
3862 u32 count_data = 0;
3863 u32 count_meta = 0;
3864 u32 count_sys = 0;
2c9fe835 3865 int chunk_reserved = 0;
ec44a35c 3866
ec44a35c 3867 path = btrfs_alloc_path();
17e9f796
MF
3868 if (!path) {
3869 ret = -ENOMEM;
3870 goto error;
3871 }
19a39dce
ID
3872
3873 /* zero out stat counters */
3874 spin_lock(&fs_info->balance_lock);
3875 memset(&bctl->stat, 0, sizeof(bctl->stat));
3876 spin_unlock(&fs_info->balance_lock);
3877again:
7d824b6f 3878 if (!counting) {
12907fc7
DS
3879 /*
3880 * The single value limit and min/max limits use the same bytes
3881 * in the union, so restore them before the real balance pass
3882 */
7d824b6f
DS
3883 bctl->data.limit = limit_data;
3884 bctl->meta.limit = limit_meta;
3885 bctl->sys.limit = limit_sys;
3886 }
ec44a35c
CM
3887 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3888 key.offset = (u64)-1;
3889 key.type = BTRFS_CHUNK_ITEM_KEY;
3890
d397712b 3891 while (1) {
19a39dce 3892 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
a7e99c69 3893 atomic_read(&fs_info->balance_cancel_req)) {
837d5b6e
ID
3894 ret = -ECANCELED;
3895 goto error;
3896 }
3897
f3372065 3898 mutex_lock(&fs_info->reclaim_bgs_lock);
ec44a35c 3899 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
67c5e7d4 3900 if (ret < 0) {
f3372065 3901 mutex_unlock(&fs_info->reclaim_bgs_lock);
ec44a35c 3902 goto error;
67c5e7d4 3903 }
ec44a35c
CM
3904
3905 /*
3906 * this shouldn't happen, it means the last relocate
3907 * failed
3908 */
3909 if (ret == 0)
c9e9f97b 3910 BUG(); /* FIXME break ? */
ec44a35c
CM
3911
3912 ret = btrfs_previous_item(chunk_root, path, 0,
3913 BTRFS_CHUNK_ITEM_KEY);
c9e9f97b 3914 if (ret) {
f3372065 3915 mutex_unlock(&fs_info->reclaim_bgs_lock);
c9e9f97b 3916 ret = 0;
ec44a35c 3917 break;
c9e9f97b 3918 }
7d9eb12c 3919
f43ffb60
ID
3920 leaf = path->nodes[0];
3921 slot = path->slots[0];
3922 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7d9eb12c 3923
67c5e7d4 3924 if (found_key.objectid != key.objectid) {
f3372065 3925 mutex_unlock(&fs_info->reclaim_bgs_lock);
ec44a35c 3926 break;
67c5e7d4 3927 }
7d9eb12c 3928
f43ffb60 3929 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
12907fc7 3930 chunk_type = btrfs_chunk_type(leaf, chunk);
f43ffb60 3931
19a39dce
ID
3932 if (!counting) {
3933 spin_lock(&fs_info->balance_lock);
3934 bctl->stat.considered++;
3935 spin_unlock(&fs_info->balance_lock);
3936 }
3937
6ec0896c 3938 ret = should_balance_chunk(leaf, chunk, found_key.offset);
2c9fe835 3939
b3b4aa74 3940 btrfs_release_path(path);
67c5e7d4 3941 if (!ret) {
f3372065 3942 mutex_unlock(&fs_info->reclaim_bgs_lock);
f43ffb60 3943 goto loop;
67c5e7d4 3944 }
f43ffb60 3945
19a39dce 3946 if (counting) {
f3372065 3947 mutex_unlock(&fs_info->reclaim_bgs_lock);
19a39dce
ID
3948 spin_lock(&fs_info->balance_lock);
3949 bctl->stat.expected++;
3950 spin_unlock(&fs_info->balance_lock);
12907fc7
DS
3951
3952 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3953 count_data++;
3954 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3955 count_sys++;
3956 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3957 count_meta++;
3958
3959 goto loop;
3960 }
3961
3962 /*
3963 * Apply limit_min filter, no need to check if the LIMITS
3964 * filter is used, limit_min is 0 by default
3965 */
3966 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3967 count_data < bctl->data.limit_min)
3968 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3969 count_meta < bctl->meta.limit_min)
3970 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3971 count_sys < bctl->sys.limit_min)) {
f3372065 3972 mutex_unlock(&fs_info->reclaim_bgs_lock);
19a39dce
ID
3973 goto loop;
3974 }
3975
a6f93c71
LB
3976 if (!chunk_reserved) {
3977 /*
3978 * We may be relocating the only data chunk we have,
3979 * which could potentially end up losing the data
3980 * raid profile, so let's allocate an empty one in
3981 * advance.
3982 */
3983 ret = btrfs_may_alloc_data_chunk(fs_info,
3984 found_key.offset);
2c9fe835 3985 if (ret < 0) {
f3372065 3986 mutex_unlock(&fs_info->reclaim_bgs_lock);
2c9fe835 3987 goto error;
a6f93c71
LB
3988 } else if (ret == 1) {
3989 chunk_reserved = 1;
2c9fe835 3990 }
2c9fe835
ZL
3991 }
3992
5b4aacef 3993 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
f3372065 3994 mutex_unlock(&fs_info->reclaim_bgs_lock);
19a39dce 3995 if (ret == -ENOSPC) {
c9e9f97b 3996 enospc_errors++;
eede2bf3
OS
3997 } else if (ret == -ETXTBSY) {
3998 btrfs_info(fs_info,
3999 "skipping relocation of block group %llu due to active swapfile",
4000 found_key.offset);
4001 ret = 0;
4002 } else if (ret) {
4003 goto error;
19a39dce
ID
4004 } else {
4005 spin_lock(&fs_info->balance_lock);
4006 bctl->stat.completed++;
4007 spin_unlock(&fs_info->balance_lock);
4008 }
f43ffb60 4009loop:
795a3321
ID
4010 if (found_key.offset == 0)
4011 break;
ba1bf481 4012 key.offset = found_key.offset - 1;
ec44a35c 4013 }
c9e9f97b 4014
19a39dce
ID
4015 if (counting) {
4016 btrfs_release_path(path);
4017 counting = false;
4018 goto again;
4019 }
ec44a35c
CM
4020error:
4021 btrfs_free_path(path);
c9e9f97b 4022 if (enospc_errors) {
efe120a0 4023 btrfs_info(fs_info, "%d enospc errors during balance",
5d163e0e 4024 enospc_errors);
c9e9f97b
ID
4025 if (!ret)
4026 ret = -ENOSPC;
4027 }
4028
ec44a35c
CM
4029 return ret;
4030}
4031
0c460c0d
ID
4032/**
4033 * alloc_profile_is_valid - see if a given profile is valid and reduced
4034 * @flags: profile to validate
4035 * @extended: if true @flags is treated as an extended profile
4036 */
4037static int alloc_profile_is_valid(u64 flags, int extended)
4038{
4039 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4040 BTRFS_BLOCK_GROUP_PROFILE_MASK);
4041
4042 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4043
4044 /* 1) check that all other bits are zeroed */
4045 if (flags & ~mask)
4046 return 0;
4047
4048 /* 2) see if profile is reduced */
4049 if (flags == 0)
4050 return !extended; /* "0" is valid for usual profiles */
4051
c1499166 4052 return has_single_bit_set(flags);
0c460c0d
ID
4053}
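/*
 * Examples, derived directly from the checks above: RAID1 alone is
 * valid and reduced; RAID1 | RAID10 has two profile bits set, fails
 * has_single_bit_set() and is rejected; flags == 0 (the implicit
 * SINGLE profile) is valid only when @extended is false.
 */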
4054
837d5b6e
ID
4055static inline int balance_need_close(struct btrfs_fs_info *fs_info)
4056{
a7e99c69
ID
4057 /* cancel requested || normal exit path */
4058 return atomic_read(&fs_info->balance_cancel_req) ||
4059 (atomic_read(&fs_info->balance_pause_req) == 0 &&
4060 atomic_read(&fs_info->balance_cancel_req) == 0);
837d5b6e
ID
4061}
4062
5ba366c3
DS
4063/*
4064 * Validate target profile against allowed profiles and return true if it's OK.
4065 * Otherwise print the error message and return false.
4066 */
4067static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4068 const struct btrfs_balance_args *bargs,
4069 u64 allowed, const char *type)
bdcd3c97 4070{
5ba366c3
DS
4071 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4072 return true;
4073
4074 /* Profile is valid and does not have bits outside of the allowed set */
4075 if (alloc_profile_is_valid(bargs->target, 1) &&
4076 (bargs->target & ~allowed) == 0)
4077 return true;
4078
4079 btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4080 type, btrfs_bg_type_to_raid_name(bargs->target));
4081 return false;
bdcd3c97
AM
4082}
4083
56fc37d9
AJ
4084/*
4085 * Fill @buf with textual description of balance filter flags @bargs, up to
4086 * @size_buf including the terminating null. The output may be trimmed if it
4087 * does not fit into the provided buffer.
4088 */
4089static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4090 u32 size_buf)
4091{
4092 int ret;
4093 u32 size_bp = size_buf;
4094 char *bp = buf;
4095 u64 flags = bargs->flags;
4096 char tmp_buf[128] = {'\0'};
4097
4098 if (!flags)
4099 return;
4100
4101#define CHECK_APPEND_NOARG(a) \
4102 do { \
4103 ret = snprintf(bp, size_bp, (a)); \
4104 if (ret < 0 || ret >= size_bp) \
4105 goto out_overflow; \
4106 size_bp -= ret; \
4107 bp += ret; \
4108 } while (0)
4109
4110#define CHECK_APPEND_1ARG(a, v1) \
4111 do { \
4112 ret = snprintf(bp, size_bp, (a), (v1)); \
4113 if (ret < 0 || ret >= size_bp) \
4114 goto out_overflow; \
4115 size_bp -= ret; \
4116 bp += ret; \
4117 } while (0)
4118
4119#define CHECK_APPEND_2ARG(a, v1, v2) \
4120 do { \
4121 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
4122 if (ret < 0 || ret >= size_bp) \
4123 goto out_overflow; \
4124 size_bp -= ret; \
4125 bp += ret; \
4126 } while (0)
4127
158da513
DS
4128 if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4129 CHECK_APPEND_1ARG("convert=%s,",
4130 btrfs_bg_type_to_raid_name(bargs->target));
56fc37d9
AJ
4131
4132 if (flags & BTRFS_BALANCE_ARGS_SOFT)
4133 CHECK_APPEND_NOARG("soft,");
4134
4135 if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4136 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4137 sizeof(tmp_buf));
4138 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4139 }
4140
4141 if (flags & BTRFS_BALANCE_ARGS_USAGE)
4142 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4143
4144 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4145 CHECK_APPEND_2ARG("usage=%u..%u,",
4146 bargs->usage_min, bargs->usage_max);
4147
4148 if (flags & BTRFS_BALANCE_ARGS_DEVID)
4149 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4150
4151 if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4152 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4153 bargs->pstart, bargs->pend);
4154
4155 if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4156 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4157 bargs->vstart, bargs->vend);
4158
4159 if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4160 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4161
4162 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4163 CHECK_APPEND_2ARG("limit=%u..%u,",
4164 bargs->limit_min, bargs->limit_max);
4165
4166 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4167 CHECK_APPEND_2ARG("stripes=%u..%u,",
4168 bargs->stripes_min, bargs->stripes_max);
4169
4170#undef CHECK_APPEND_2ARG
4171#undef CHECK_APPEND_1ARG
4172#undef CHECK_APPEND_NOARG
4173
4174out_overflow:
4175
4176 if (size_bp < size_buf)
4177 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4178 else
4179 buf[0] = '\0';
4180}
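/*
 * Example output (illustrative): with CONVERT to raid1, SOFT and
 * USAGE=90 set, @buf receives "convert=raid1,soft,usage=90"; the
 * trailing comma is trimmed at out_overflow. If @size_buf is too small
 * the description is cut short instead of overflowing.
 */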
4181
4182static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4183{
4184 u32 size_buf = 1024;
4185 char tmp_buf[192] = {'\0'};
4186 char *buf;
4187 char *bp;
4188 u32 size_bp = size_buf;
4189 int ret;
4190 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4191
4192 buf = kzalloc(size_buf, GFP_KERNEL);
4193 if (!buf)
4194 return;
4195
4196 bp = buf;
4197
4198#define CHECK_APPEND_1ARG(a, v1) \
4199 do { \
4200 ret = snprintf(bp, size_bp, (a), (v1)); \
4201 if (ret < 0 || ret >= size_bp) \
4202 goto out_overflow; \
4203 size_bp -= ret; \
4204 bp += ret; \
4205 } while (0)
4206
4207 if (bctl->flags & BTRFS_BALANCE_FORCE)
4208 CHECK_APPEND_1ARG("%s", "-f ");
4209
4210 if (bctl->flags & BTRFS_BALANCE_DATA) {
4211 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4212 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4213 }
4214
4215 if (bctl->flags & BTRFS_BALANCE_METADATA) {
4216 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4217 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4218 }
4219
4220 if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4221 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4222 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4223 }
4224
4225#undef CHECK_APPEND_1ARG
4226
4227out_overflow:
4228
4229 if (size_bp < size_buf)
4230 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4231 btrfs_info(fs_info, "balance: %s %s",
4232 (bctl->flags & BTRFS_BALANCE_RESUME) ?
4233 "resume" : "start", buf);
4234
4235 kfree(buf);
4236}
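/*
 * Example (illustrative; the exact prefix depends on the printk
 * format): resuming the balance described above would log a line like
 * "balance: resume -dconvert=raid1,soft -musage=90 -susage=90".
 */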
4237
c9e9f97b 4238/*
dccdb07b 4239 * Should be called with the balance mutex held
c9e9f97b 4240 */
6fcf6e2b
DS
4241int btrfs_balance(struct btrfs_fs_info *fs_info,
4242 struct btrfs_balance_control *bctl,
c9e9f97b
ID
4243 struct btrfs_ioctl_balance_args *bargs)
4244{
14506127 4245 u64 meta_target, data_target;
f43ffb60 4246 u64 allowed;
e4837f8f 4247 int mixed = 0;
c9e9f97b 4248 int ret;
8dabb742 4249 u64 num_devices;
de98ced9 4250 unsigned seq;
e62869be 4251 bool reducing_redundancy;
081db89b 4252 int i;
c9e9f97b 4253
837d5b6e 4254 if (btrfs_fs_closing(fs_info) ||
a7e99c69 4255 atomic_read(&fs_info->balance_pause_req) ||
726a3421 4256 btrfs_should_cancel_balance(fs_info)) {
c9e9f97b
ID
4257 ret = -EINVAL;
4258 goto out;
4259 }
4260
e4837f8f
ID
4261 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4262 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4263 mixed = 1;
4264
f43ffb60
ID
4265 /*
4266 * In case of mixed groups both data and meta should be picked,
4267 * and identical options should be given for both of them.
4268 */
e4837f8f
ID
4269 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4270 if (mixed && (bctl->flags & allowed)) {
f43ffb60
ID
4271 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4272 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4273 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
5d163e0e 4274 btrfs_err(fs_info,
6dac13f8 4275 "balance: mixed groups data and metadata options must be the same");
f43ffb60
ID
4276 ret = -EINVAL;
4277 goto out;
4278 }
4279 }
4280
b35cf1f0
JB
4281 /*
4282 * rw_devices will not change at the moment, device add/delete/replace
c3e1f96c 4283 * are exclusive
b35cf1f0
JB
4284 */
4285 num_devices = fs_info->fs_devices->rw_devices;
fab27359
QW
4286
4287 /*
4288 * SINGLE profile on-disk has no profile bit, but in-memory we have a
4289 * special bit for it, to make it easier to distinguish. Thus we need
4290 * to set it manually, or balance would refuse the profile.
4291 */
4292 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
081db89b
DS
4293 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4294 if (num_devices >= btrfs_raid_array[i].devs_min)
4295 allowed |= btrfs_raid_array[i].bg_flag;
1da73967 4296
5ba366c3
DS
4297 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4298 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4299 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
e4d8ec0f
ID
4300 ret = -EINVAL;
4301 goto out;
4302 }
4303
6079e12c
DS
4304 /*
4305 * Allow to reduce metadata or system integrity only if force set for
4306 * profiles with redundancy (copies, parity)
4307 */
4308 allowed = 0;
4309 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4310 if (btrfs_raid_array[i].ncopies >= 2 ||
4311 btrfs_raid_array[i].tolerated_failures >= 1)
4312 allowed |= btrfs_raid_array[i].bg_flag;
4313 }
de98ced9
MX
4314 do {
4315 seq = read_seqbegin(&fs_info->profiles_lock);
4316
4317 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4318 (fs_info->avail_system_alloc_bits & allowed) &&
4319 !(bctl->sys.target & allowed)) ||
4320 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4321 (fs_info->avail_metadata_alloc_bits & allowed) &&
5a8067c0 4322 !(bctl->meta.target & allowed)))
e62869be 4323 reducing_redundancy = true;
5a8067c0 4324 else
e62869be 4325 reducing_redundancy = false;
5a8067c0
FM
4326
4327 /* if we're not converting, the target field is uninitialized */
4328 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4329 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4330 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4331 bctl->data.target : fs_info->avail_data_alloc_bits;
de98ced9 4332 } while (read_seqretry(&fs_info->profiles_lock, seq));
e4d8ec0f 4333
e62869be 4334 if (reducing_redundancy) {
5a8067c0
FM
4335 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4336 btrfs_info(fs_info,
e62869be 4337 "balance: force reducing metadata redundancy");
5a8067c0
FM
4338 } else {
4339 btrfs_err(fs_info,
e62869be 4340 "balance: reduces metadata redundancy, use --force if you want this");
5a8067c0
FM
4341 ret = -EINVAL;
4342 goto out;
4343 }
4344 }
4345
14506127
AB
4346 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4347 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
ee592d07 4348 btrfs_warn(fs_info,
6dac13f8 4349 "balance: metadata profile %s has lower redundancy than data profile %s",
158da513
DS
4350 btrfs_bg_type_to_raid_name(meta_target),
4351 btrfs_bg_type_to_raid_name(data_target));
ee592d07
ST
4352 }
4353
6bccf3ab 4354 ret = insert_balance_item(fs_info, bctl);
59641015 4355 if (ret && ret != -EEXIST)
0940ebf6
ID
4356 goto out;
4357
59641015
ID
4358 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4359 BUG_ON(ret == -EEXIST);
833aae18
DS
4360 BUG_ON(fs_info->balance_ctl);
4361 spin_lock(&fs_info->balance_lock);
4362 fs_info->balance_ctl = bctl;
4363 spin_unlock(&fs_info->balance_lock);
59641015
ID
4364 } else {
4365 BUG_ON(ret != -EEXIST);
4366 spin_lock(&fs_info->balance_lock);
4367 update_balance_args(bctl);
4368 spin_unlock(&fs_info->balance_lock);
4369 }
c9e9f97b 4370
3009a62f
DS
4371 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4372 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
56fc37d9 4373 describe_balance_start_or_resume(fs_info);
c9e9f97b
ID
4374 mutex_unlock(&fs_info->balance_mutex);
4375
4376 ret = __btrfs_balance(fs_info);
4377
4378 mutex_lock(&fs_info->balance_mutex);
efc0e69c 4379 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
7333bd02 4380 btrfs_info(fs_info, "balance: paused");
efc0e69c
NB
4381 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
4382 }
44d354ab
QW
4383 /*
4384 * Balance can be canceled by:
4385 *
4386 * - Regular cancel request
4387 * Then ret == -ECANCELED and balance_cancel_req > 0
4388 *
4389 * - Fatal signal to "btrfs" process
4390 * Either the signal caught by wait_reserve_ticket() and callers
4391 * got -EINTR, or caught by btrfs_should_cancel_balance() and
4392 * got -ECANCELED.
4393 * Either way, in this case balance_cancel_req = 0, and
4394 * ret == -EINTR or ret == -ECANCELED.
4395 *
4396 * So here we only check the return value to catch canceled balance.
4397 */
4398 else if (ret == -ECANCELED || ret == -EINTR)
7333bd02
AJ
4399 btrfs_info(fs_info, "balance: canceled");
4400 else
4401 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4402
3009a62f 4403 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
c9e9f97b
ID
4404
4405 if (bargs) {
4406 memset(bargs, 0, sizeof(*bargs));
008ef096 4407 btrfs_update_ioctl_balance_args(fs_info, bargs);
c9e9f97b
ID
4408 }
4409
3a01aa7a
ID
4410 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4411 balance_need_close(fs_info)) {
149196a2 4412 reset_balance_state(fs_info);
c3e1f96c 4413 btrfs_exclop_finish(fs_info);
3a01aa7a
ID
4414 }
4415
837d5b6e 4416 wake_up(&fs_info->balance_wait_q);
c9e9f97b
ID
4417
4418 return ret;
4419out:
59641015 4420 if (bctl->flags & BTRFS_BALANCE_RESUME)
149196a2 4421 reset_balance_state(fs_info);
a17c95df 4422 else
59641015 4423 kfree(bctl);
c3e1f96c 4424 btrfs_exclop_finish(fs_info);
a17c95df 4425
59641015
ID
4426 return ret;
4427}
4428
4429static int balance_kthread(void *data)
4430{
2b6ba629 4431 struct btrfs_fs_info *fs_info = data;
9555c6c1 4432 int ret = 0;
59641015 4433
a690e5f2 4434 sb_start_write(fs_info->sb);
59641015 4435 mutex_lock(&fs_info->balance_mutex);
56fc37d9 4436 if (fs_info->balance_ctl)
6fcf6e2b 4437 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
59641015 4438 mutex_unlock(&fs_info->balance_mutex);
a690e5f2 4439 sb_end_write(fs_info->sb);
2b6ba629 4440
59641015
ID
4441 return ret;
4442}
4443
2b6ba629
ID
4444int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4445{
4446 struct task_struct *tsk;
4447
1354e1a1 4448 mutex_lock(&fs_info->balance_mutex);
2b6ba629 4449 if (!fs_info->balance_ctl) {
1354e1a1 4450 mutex_unlock(&fs_info->balance_mutex);
2b6ba629
ID
4451 return 0;
4452 }
1354e1a1 4453 mutex_unlock(&fs_info->balance_mutex);
2b6ba629 4454
3cdde224 4455 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
6dac13f8 4456 btrfs_info(fs_info, "balance: resume skipped");
2b6ba629
ID
4457 return 0;
4458 }
4459
efc0e69c
NB
4460 spin_lock(&fs_info->super_lock);
4461 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4462 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4463 spin_unlock(&fs_info->super_lock);
02ee654d
AJ
4464 /*
4465 * A ro->rw remount sequence should continue with the paused balance
4466 * regardless of who paused it, the system or the user, so set
4467 * the resume flag.
4468 */
4469 spin_lock(&fs_info->balance_lock);
4470 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4471 spin_unlock(&fs_info->balance_lock);
4472
2b6ba629 4473 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
cd633972 4474 return PTR_ERR_OR_ZERO(tsk);
2b6ba629
ID
4475}
4476
68310a5e 4477int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
59641015 4478{
59641015
ID
4479 struct btrfs_balance_control *bctl;
4480 struct btrfs_balance_item *item;
4481 struct btrfs_disk_balance_args disk_bargs;
4482 struct btrfs_path *path;
4483 struct extent_buffer *leaf;
4484 struct btrfs_key key;
4485 int ret;
4486
4487 path = btrfs_alloc_path();
4488 if (!path)
4489 return -ENOMEM;
4490
59641015 4491 key.objectid = BTRFS_BALANCE_OBJECTID;
c479cb4f 4492 key.type = BTRFS_TEMPORARY_ITEM_KEY;
59641015
ID
4493 key.offset = 0;
4494
68310a5e 4495 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
59641015 4496 if (ret < 0)
68310a5e 4497 goto out;
59641015
ID
4498 if (ret > 0) { /* ret = -ENOENT; */
4499 ret = 0;
68310a5e
ID
4500 goto out;
4501 }
4502
4503 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4504 if (!bctl) {
4505 ret = -ENOMEM;
4506 goto out;
59641015
ID
4507 }
4508
4509 leaf = path->nodes[0];
4510 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4511
68310a5e
ID
4512 bctl->flags = btrfs_balance_flags(leaf, item);
4513 bctl->flags |= BTRFS_BALANCE_RESUME;
59641015
ID
4514
4515 btrfs_balance_data(leaf, item, &disk_bargs);
4516 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4517 btrfs_balance_meta(leaf, item, &disk_bargs);
4518 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4519 btrfs_balance_sys(leaf, item, &disk_bargs);
4520 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4521
eee95e3f
DS
4522 /*
4523 * This should never happen, as the paused balance state is recovered
4524 * during mount, without any chance for other exclusive ops to collide.
4525 *
4526 * This gives the exclusive op status to balance and keeps it in paused
4527 * state until user intervention (cancel or umount). If the ownership
4528 * cannot be assigned, show a message but do not fail. The balance
4529 * is in a paused state and must have fs_info::balance_ctl properly
4530 * set up.
4531 */
efc0e69c 4532 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
eee95e3f 4533 btrfs_warn(fs_info,
6dac13f8 4534 "balance: cannot set exclusive op status, resume manually");
ed0fb78f 4535
fb286100
JB
4536 btrfs_release_path(path);
4537
68310a5e 4538 mutex_lock(&fs_info->balance_mutex);
833aae18
DS
4539 BUG_ON(fs_info->balance_ctl);
4540 spin_lock(&fs_info->balance_lock);
4541 fs_info->balance_ctl = bctl;
4542 spin_unlock(&fs_info->balance_lock);
68310a5e 4543 mutex_unlock(&fs_info->balance_mutex);
59641015
ID
4544out:
4545 btrfs_free_path(path);
ec44a35c
CM
4546 return ret;
4547}
4548
837d5b6e
ID
4549int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4550{
4551 int ret = 0;
4552
4553 mutex_lock(&fs_info->balance_mutex);
4554 if (!fs_info->balance_ctl) {
4555 mutex_unlock(&fs_info->balance_mutex);
4556 return -ENOTCONN;
4557 }
4558
3009a62f 4559 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
837d5b6e
ID
4560 atomic_inc(&fs_info->balance_pause_req);
4561 mutex_unlock(&fs_info->balance_mutex);
4562
4563 wait_event(fs_info->balance_wait_q,
3009a62f 4564 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
837d5b6e
ID
4565
4566 mutex_lock(&fs_info->balance_mutex);
4567 /* we are good with balance_ctl ripped off from under us */
3009a62f 4568 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
837d5b6e
ID
4569 atomic_dec(&fs_info->balance_pause_req);
4570 } else {
4571 ret = -ENOTCONN;
4572 }
4573
4574 mutex_unlock(&fs_info->balance_mutex);
4575 return ret;
4576}
4577
a7e99c69
ID
4578int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4579{
4580 mutex_lock(&fs_info->balance_mutex);
4581 if (!fs_info->balance_ctl) {
4582 mutex_unlock(&fs_info->balance_mutex);
4583 return -ENOTCONN;
4584 }
4585
cf7d20f4
DS
4586 /*
4587 * A paused balance with the item stored on disk can be resumed at
4588 * mount time if the mount is read-write. Otherwise it's still paused
4589 * and we must not allow cancelling as it deletes the item.
4590 */
4591 if (sb_rdonly(fs_info->sb)) {
4592 mutex_unlock(&fs_info->balance_mutex);
4593 return -EROFS;
4594 }
4595
a7e99c69
ID
4596 atomic_inc(&fs_info->balance_cancel_req);
4597 /*
4598 * If balance is running, just wait and return; the balance item
4599 * is deleted in btrfs_balance() in this case.
4600 */
3009a62f 4601 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
a7e99c69
ID
4602 mutex_unlock(&fs_info->balance_mutex);
4603 wait_event(fs_info->balance_wait_q,
3009a62f 4604 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
a7e99c69
ID
4605 mutex_lock(&fs_info->balance_mutex);
4606 } else {
a7e99c69 4607 mutex_unlock(&fs_info->balance_mutex);
dccdb07b
DS
4608 /*
4609 * Lock released to allow other waiters to continue, we'll
4610 * reexamine the status again.
4611 */
a7e99c69
ID
4612 mutex_lock(&fs_info->balance_mutex);
4613
a17c95df 4614 if (fs_info->balance_ctl) {
149196a2 4615 reset_balance_state(fs_info);
c3e1f96c 4616 btrfs_exclop_finish(fs_info);
6dac13f8 4617 btrfs_info(fs_info, "balance: canceled");
a17c95df 4618 }
a7e99c69
ID
4619 }
4620
3009a62f
DS
4621 BUG_ON(fs_info->balance_ctl ||
4622 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
a7e99c69
ID
4623 atomic_dec(&fs_info->balance_cancel_req);
4624 mutex_unlock(&fs_info->balance_mutex);
4625 return 0;
4626}
4627
97f4dd09 4628int btrfs_uuid_scan_kthread(void *data)
803b2f54
SB
4629{
4630 struct btrfs_fs_info *fs_info = data;
4631 struct btrfs_root *root = fs_info->tree_root;
4632 struct btrfs_key key;
803b2f54
SB
4633 struct btrfs_path *path = NULL;
4634 int ret = 0;
4635 struct extent_buffer *eb;
4636 int slot;
4637 struct btrfs_root_item root_item;
4638 u32 item_size;
f45388f3 4639 struct btrfs_trans_handle *trans = NULL;
c94bec2c 4640 bool closing = false;
803b2f54
SB
4641
4642 path = btrfs_alloc_path();
4643 if (!path) {
4644 ret = -ENOMEM;
4645 goto out;
4646 }
4647
4648 key.objectid = 0;
4649 key.type = BTRFS_ROOT_ITEM_KEY;
4650 key.offset = 0;
4651
803b2f54 4652 while (1) {
c94bec2c
JB
4653 if (btrfs_fs_closing(fs_info)) {
4654 closing = true;
4655 break;
4656 }
7c829b72
AJ
4657 ret = btrfs_search_forward(root, &key, path,
4658 BTRFS_OLDEST_GENERATION);
803b2f54
SB
4659 if (ret) {
4660 if (ret > 0)
4661 ret = 0;
4662 break;
4663 }
4664
4665 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4666 (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4667 key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4668 key.objectid > BTRFS_LAST_FREE_OBJECTID)
4669 goto skip;
4670
4671 eb = path->nodes[0];
4672 slot = path->slots[0];
3212fa14 4673 item_size = btrfs_item_size(eb, slot);
803b2f54
SB
4674 if (item_size < sizeof(root_item))
4675 goto skip;
4676
803b2f54
SB
4677 read_extent_buffer(eb, &root_item,
4678 btrfs_item_ptr_offset(eb, slot),
4679 (int)sizeof(root_item));
4680 if (btrfs_root_refs(&root_item) == 0)
4681 goto skip;
f45388f3
FDBM
4682
4683 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4684 !btrfs_is_empty_uuid(root_item.received_uuid)) {
4685 if (trans)
4686 goto update_tree;
4687
4688 btrfs_release_path(path);
803b2f54
SB
4689 /*
4690 * 1 - subvol uuid item
4691 * 1 - received_subvol uuid item
4692 */
4693 trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4694 if (IS_ERR(trans)) {
4695 ret = PTR_ERR(trans);
4696 break;
4697 }
f45388f3
FDBM
4698 continue;
4699 } else {
4700 goto skip;
4701 }
4702update_tree:
9771a5cf 4703 btrfs_release_path(path);
f45388f3 4704 if (!btrfs_is_empty_uuid(root_item.uuid)) {
cdb345a8 4705 ret = btrfs_uuid_tree_add(trans, root_item.uuid,
803b2f54
SB
4706 BTRFS_UUID_KEY_SUBVOL,
4707 key.objectid);
4708 if (ret < 0) {
efe120a0 4709 btrfs_warn(fs_info, "uuid_tree_add failed %d",
803b2f54 4710 ret);
803b2f54
SB
4711 break;
4712 }
4713 }
4714
4715 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
cdb345a8 4716 ret = btrfs_uuid_tree_add(trans,
803b2f54
SB
4717 root_item.received_uuid,
4718 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4719 key.objectid);
4720 if (ret < 0) {
efe120a0 4721 btrfs_warn(fs_info, "uuid_tree_add failed %d",
803b2f54 4722 ret);
803b2f54
SB
4723 break;
4724 }
4725 }
4726
f45388f3 4727skip:
9771a5cf 4728 btrfs_release_path(path);
803b2f54 4729 if (trans) {
3a45bb20 4730 ret = btrfs_end_transaction(trans);
f45388f3 4731 trans = NULL;
803b2f54
SB
4732 if (ret)
4733 break;
4734 }
4735
803b2f54
SB
4736 if (key.offset < (u64)-1) {
4737 key.offset++;
4738 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4739 key.offset = 0;
4740 key.type = BTRFS_ROOT_ITEM_KEY;
4741 } else if (key.objectid < (u64)-1) {
4742 key.offset = 0;
4743 key.type = BTRFS_ROOT_ITEM_KEY;
4744 key.objectid++;
4745 } else {
4746 break;
4747 }
4748 cond_resched();
4749 }
4750
4751out:
4752 btrfs_free_path(path);
f45388f3 4753 if (trans && !IS_ERR(trans))
3a45bb20 4754 btrfs_end_transaction(trans);
803b2f54 4755 if (ret)
efe120a0 4756 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
c94bec2c 4757 else if (!closing)
afcdd129 4758 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
803b2f54
SB
4759 up(&fs_info->uuid_tree_rescan_sem);
4760 return 0;
4761}
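/*
 * Illustrative note on the key advance at the bottom of the loop: it
 * walks the whole root-tree keyspace in (objectid, type, offset) order,
 * e.g. (256, ROOT_ITEM, 0) -> (256, ROOT_ITEM, 1) -> ... and, once
 * offset and type are exhausted, (257, ROOT_ITEM, 0), restarting the
 * search with btrfs_search_forward() on each iteration.
 */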
4762
f7a81ea4
SB
4763int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4764{
4765 struct btrfs_trans_handle *trans;
4766 struct btrfs_root *tree_root = fs_info->tree_root;
4767 struct btrfs_root *uuid_root;
803b2f54
SB
4768 struct task_struct *task;
4769 int ret;
f7a81ea4
SB
4770
4771 /*
4772 * 1 - root node
4773 * 1 - root item
4774 */
4775 trans = btrfs_start_transaction(tree_root, 2);
4776 if (IS_ERR(trans))
4777 return PTR_ERR(trans);
4778
9b7a2440 4779 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
f7a81ea4 4780 if (IS_ERR(uuid_root)) {
6d13f549 4781 ret = PTR_ERR(uuid_root);
66642832 4782 btrfs_abort_transaction(trans, ret);
3a45bb20 4783 btrfs_end_transaction(trans);
6d13f549 4784 return ret;
f7a81ea4
SB
4785 }
4786
4787 fs_info->uuid_root = uuid_root;
4788
3a45bb20 4789 ret = btrfs_commit_transaction(trans);
803b2f54
SB
4790 if (ret)
4791 return ret;
4792
4793 down(&fs_info->uuid_tree_rescan_sem);
4794 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4795 if (IS_ERR(task)) {
70f80175 4796 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
efe120a0 4797 btrfs_warn(fs_info, "failed to start uuid_scan task");
803b2f54
SB
4798 up(&fs_info->uuid_tree_rescan_sem);
4799 return PTR_ERR(task);
4800 }
4801
4802 return 0;
f7a81ea4 4803}
803b2f54 4804
8f18cf13
CM
4805/*
4806 * shrinking a device means finding all of the device extents past
4807 * the new size, and then following the back refs to the chunks.
4808 * The chunk relocation code actually frees the device extent.
4809 */
4810int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4811{
0b246afa
JM
4812 struct btrfs_fs_info *fs_info = device->fs_info;
4813 struct btrfs_root *root = fs_info->dev_root;
8f18cf13 4814 struct btrfs_trans_handle *trans;
8f18cf13
CM
4815 struct btrfs_dev_extent *dev_extent = NULL;
4816 struct btrfs_path *path;
4817 u64 length;
8f18cf13
CM
4818 u64 chunk_offset;
4819 int ret;
4820 int slot;
ba1bf481
JB
4821 int failed = 0;
4822 bool retried = false;
8f18cf13
CM
4823 struct extent_buffer *l;
4824 struct btrfs_key key;
0b246afa 4825 struct btrfs_super_block *super_copy = fs_info->super_copy;
8f18cf13 4826 u64 old_total = btrfs_super_total_bytes(super_copy);
7cc8e58d 4827 u64 old_size = btrfs_device_get_total_bytes(device);
7dfb8be1 4828 u64 diff;
61d0d0d2 4829 u64 start;
7dfb8be1
NB
4830
4831 new_size = round_down(new_size, fs_info->sectorsize);
61d0d0d2 4832 start = new_size;
0e4324a4 4833 diff = round_down(old_size - new_size, fs_info->sectorsize);
8f18cf13 4834
401e29c1 4835 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
63a212ab
SB
4836 return -EINVAL;
4837
8f18cf13
CM
4838 path = btrfs_alloc_path();
4839 if (!path)
4840 return -ENOMEM;
4841
0338dff6 4842 path->reada = READA_BACK;
8f18cf13 4843
61d0d0d2
NB
4844 trans = btrfs_start_transaction(root, 0);
4845 if (IS_ERR(trans)) {
4846 btrfs_free_path(path);
4847 return PTR_ERR(trans);
4848 }
4849
34441361 4850 mutex_lock(&fs_info->chunk_mutex);
7d9eb12c 4851
7cc8e58d 4852 btrfs_device_set_total_bytes(device, new_size);
ebbede42 4853 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2b82032c 4854 device->fs_devices->total_rw_bytes -= diff;
a5ed45f8 4855 atomic64_sub(diff, &fs_info->free_chunk_space);
2bf64758 4856 }
61d0d0d2
NB
4857
4858 /*
4859 * Once the device's size has been set to the new size, ensure all
4860 * in-memory chunks are synced to disk so that the loop below sees them
4861 * and relocates them accordingly.
4862 */
1c11b63e 4863 if (contains_pending_extent(device, &start, diff)) {
61d0d0d2
NB
4864 mutex_unlock(&fs_info->chunk_mutex);
4865 ret = btrfs_commit_transaction(trans);
4866 if (ret)
4867 goto done;
4868 } else {
4869 mutex_unlock(&fs_info->chunk_mutex);
4870 btrfs_end_transaction(trans);
4871 }
8f18cf13 4872
ba1bf481 4873again:
8f18cf13
CM
4874 key.objectid = device->devid;
4875 key.offset = (u64)-1;
4876 key.type = BTRFS_DEV_EXTENT_KEY;
4877
213e64da 4878 do {
f3372065 4879 mutex_lock(&fs_info->reclaim_bgs_lock);
8f18cf13 4880 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
67c5e7d4 4881 if (ret < 0) {
f3372065 4882 mutex_unlock(&fs_info->reclaim_bgs_lock);
8f18cf13 4883 goto done;
67c5e7d4 4884 }
8f18cf13
CM
4885
4886 ret = btrfs_previous_item(root, path, 0, key.type);
8f18cf13 4887 if (ret) {
f3372065 4888 mutex_unlock(&fs_info->reclaim_bgs_lock);
7056bf69
NB
4889 if (ret < 0)
4890 goto done;
8f18cf13 4891 ret = 0;
b3b4aa74 4892 btrfs_release_path(path);
bf1fb512 4893 break;
8f18cf13
CM
4894 }
4895
4896 l = path->nodes[0];
4897 slot = path->slots[0];
4898 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4899
ba1bf481 4900 if (key.objectid != device->devid) {
f3372065 4901 mutex_unlock(&fs_info->reclaim_bgs_lock);
b3b4aa74 4902 btrfs_release_path(path);
bf1fb512 4903 break;
ba1bf481 4904 }
8f18cf13
CM
4905
4906 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4907 length = btrfs_dev_extent_length(l, dev_extent);
4908
ba1bf481 4909 if (key.offset + length <= new_size) {
f3372065 4910 mutex_unlock(&fs_info->reclaim_bgs_lock);
b3b4aa74 4911 btrfs_release_path(path);
d6397bae 4912 break;
ba1bf481 4913 }
8f18cf13 4914
8f18cf13 4915 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
b3b4aa74 4916 btrfs_release_path(path);
8f18cf13 4917
a6f93c71
LB
4918 /*
4919 * We may be relocating the only data chunk we have,
4920 * which could potentially end up losing the data
4921 * raid profile, so let's allocate an empty one in
4922 * advance.
4923 */
4924 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4925 if (ret < 0) {
f3372065 4926 mutex_unlock(&fs_info->reclaim_bgs_lock);
a6f93c71
LB
4927 goto done;
4928 }
4929
0b246afa 4930 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
f3372065 4931 mutex_unlock(&fs_info->reclaim_bgs_lock);
eede2bf3 4932 if (ret == -ENOSPC) {
ba1bf481 4933 failed++;
eede2bf3
OS
4934 } else if (ret) {
4935 if (ret == -ETXTBSY) {
4936 btrfs_warn(fs_info,
4937 "could not shrink block group %llu due to active swapfile",
4938 chunk_offset);
4939 }
4940 goto done;
4941 }
213e64da 4942 } while (key.offset-- > 0);
ba1bf481
JB
4943
4944 if (failed && !retried) {
4945 failed = 0;
4946 retried = true;
4947 goto again;
4948 } else if (failed && retried) {
4949 ret = -ENOSPC;
ba1bf481 4950 goto done;
8f18cf13
CM
4951 }
4952
d6397bae 4953 /* Shrinking succeeded, else we would be at "done". */
a22285a6 4954 trans = btrfs_start_transaction(root, 0);
98d5dc13
TI
4955 if (IS_ERR(trans)) {
4956 ret = PTR_ERR(trans);
4957 goto done;
4958 }
4959
34441361 4960 mutex_lock(&fs_info->chunk_mutex);
c57dd1f2
QW
4961 /* Clear all state bits beyond the shrunk device size */
4962 clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4963 CHUNK_STATE_MASK);
4964
7cc8e58d 4965 btrfs_device_set_disk_total_bytes(device, new_size);
bbbf7243
NB
4966 if (list_empty(&device->post_commit_list))
4967 list_add_tail(&device->post_commit_list,
4968 &trans->transaction->dev_update_list);
d6397bae 4969
d6397bae 4970 WARN_ON(diff > old_total);
7dfb8be1
NB
4971 btrfs_set_super_total_bytes(super_copy,
4972 round_down(old_total - diff, fs_info->sectorsize));
34441361 4973 mutex_unlock(&fs_info->chunk_mutex);
2196d6e8 4974
2bb2e00e 4975 btrfs_reserve_chunk_metadata(trans, false);
2196d6e8
MX
4976 /* Now btrfs_update_device() will change the on-disk size. */
4977 ret = btrfs_update_device(trans, device);
2bb2e00e 4978 btrfs_trans_release_chunk_metadata(trans);
801660b0
AJ
4979 if (ret < 0) {
4980 btrfs_abort_transaction(trans, ret);
4981 btrfs_end_transaction(trans);
4982 } else {
4983 ret = btrfs_commit_transaction(trans);
4984 }
8f18cf13
CM
4985done:
4986 btrfs_free_path(path);
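 	/* On failure, restore the in-memory device size and space accounting. */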
53e489bc 4987 if (ret) {
34441361 4988 mutex_lock(&fs_info->chunk_mutex);
53e489bc 4989 btrfs_device_set_total_bytes(device, old_size);
ebbede42 4990 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
53e489bc 4991 device->fs_devices->total_rw_bytes += diff;
a5ed45f8 4992 atomic64_add(diff, &fs_info->free_chunk_space);
34441361 4993 mutex_unlock(&fs_info->chunk_mutex);
53e489bc 4994 }
8f18cf13
CM
4995 return ret;
4996}
4997
2ff7e61e 4998static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
0b86a832
CM
4999 struct btrfs_key *key,
5000 struct btrfs_chunk *chunk, int item_size)
5001{
0b246afa 5002 struct btrfs_super_block *super_copy = fs_info->super_copy;
0b86a832
CM
5003 struct btrfs_disk_key disk_key;
5004 u32 array_size;
5005 u8 *ptr;
5006
79bd3712
FM
5007 lockdep_assert_held(&fs_info->chunk_mutex);
5008
0b86a832 5009 array_size = btrfs_super_sys_array_size(super_copy);
5f43f86e 5010 if (array_size + item_size + sizeof(disk_key)
79bd3712 5011 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
0b86a832
CM
5012 return -EFBIG;
5013
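 	/* Append the new (disk key, chunk item) pair at the end of the array. */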
5014 ptr = super_copy->sys_chunk_array + array_size;
5015 btrfs_cpu_key_to_disk(&disk_key, key);
5016 memcpy(ptr, &disk_key, sizeof(disk_key));
5017 ptr += sizeof(disk_key);
5018 memcpy(ptr, chunk, item_size);
5019 item_size += sizeof(disk_key);
5020 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
fe48a5c0 5021
0b86a832
CM
5022 return 0;
5023}
5024
73c5de00
AJ
5025/*
5026 * sort the devices in descending order by max_avail, total_avail
5027 */
5028static int btrfs_cmp_device_info(const void *a, const void *b)
9b3f68b9 5029{
73c5de00
AJ
5030 const struct btrfs_device_info *di_a = a;
5031 const struct btrfs_device_info *di_b = b;
9b3f68b9 5032
73c5de00 5033 if (di_a->max_avail > di_b->max_avail)
b2117a39 5034 return -1;
73c5de00 5035 if (di_a->max_avail < di_b->max_avail)
b2117a39 5036 return 1;
73c5de00
AJ
5037 if (di_a->total_avail > di_b->total_avail)
5038 return -1;
5039 if (di_a->total_avail < di_b->total_avail)
5040 return 1;
5041 return 0;
b2117a39 5042}
0b86a832 5043
53b381b3
DW
5044static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5045{
ffe2d203 5046 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
53b381b3
DW
5047 return;
5048
ceda0864 5049 btrfs_set_fs_incompat(info, RAID56);
53b381b3
DW
5050}
5051
cfbb825c
DS
5052static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5053{
5054 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
5055 return;
5056
5057 btrfs_set_fs_incompat(info, RAID1C34);
5058}
5059
4f2bafe8 5060/*
f6f39f7a 5061 * Structure used internally for btrfs_create_chunk() function.
4f2bafe8
NA
5062 * Wraps needed parameters.
5063 */
5064struct alloc_chunk_ctl {
5065 u64 start;
5066 u64 type;
5067 /* Total number of stripes to allocate */
5068 int num_stripes;
5069 /* sub_stripes info for map */
5070 int sub_stripes;
5071 /* Stripes per device */
5072 int dev_stripes;
5073 /* Maximum number of devices to use */
5074 int devs_max;
5075 /* Minimum number of devices to use */
5076 int devs_min;
5077 /* ndevs has to be a multiple of this */
5078 int devs_increment;
5079 /* Number of copies */
5080 int ncopies;
5081 /* Number of stripes worth of bytes to store parity information */
5082 int nparity;
5083 u64 max_stripe_size;
5084 u64 max_chunk_size;
6aafb303 5085 u64 dev_extent_min;
4f2bafe8
NA
5086 u64 stripe_size;
5087 u64 chunk_size;
5088 int ndevs;
5089};
5090
27c314d5
NA
5091static void init_alloc_chunk_ctl_policy_regular(
5092 struct btrfs_fs_devices *fs_devices,
5093 struct alloc_chunk_ctl *ctl)
5094{
f6fca391 5095 struct btrfs_space_info *space_info;
27c314d5 5096
f6fca391
SR
5097 space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
5098 ASSERT(space_info);
5099
5100 ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
5101 ctl->max_stripe_size = ctl->max_chunk_size;
5102
5103 if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
5104 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
27c314d5
NA
5105
5106 /* We don't want a chunk larger than 10% of writable space */
5107 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
5108 ctl->max_chunk_size);
6aafb303 5109 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
27c314d5
NA
5110}
5111
1cd6121f
NA
5112static void init_alloc_chunk_ctl_policy_zoned(
5113 struct btrfs_fs_devices *fs_devices,
5114 struct alloc_chunk_ctl *ctl)
5115{
5116 u64 zone_size = fs_devices->fs_info->zone_size;
5117 u64 limit;
5118 int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5119 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5120 u64 min_chunk_size = min_data_stripes * zone_size;
5121 u64 type = ctl->type;
5122
5123 ctl->max_stripe_size = zone_size;
5124 if (type & BTRFS_BLOCK_GROUP_DATA) {
5125 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5126 zone_size);
5127 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5128 ctl->max_chunk_size = ctl->max_stripe_size;
5129 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5130 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5131 ctl->devs_max = min_t(int, ctl->devs_max,
5132 BTRFS_MAX_DEVS_SYS_CHUNK);
bb05b298
AB
5133 } else {
5134 BUG();
1cd6121f
NA
5135 }
5136
5137 /* We don't want a chunk larger than 10% of writable space */
5138 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
5139 zone_size),
5140 min_chunk_size);
5141 ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5142 ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5143}
5144
27c314d5
NA
5145static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5146 struct alloc_chunk_ctl *ctl)
5147{
5148 int index = btrfs_bg_flags_to_raid_index(ctl->type);
5149
5150 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5151 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5152 ctl->devs_max = btrfs_raid_array[index].devs_max;
5153 if (!ctl->devs_max)
5154 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5155 ctl->devs_min = btrfs_raid_array[index].devs_min;
5156 ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5157 ctl->ncopies = btrfs_raid_array[index].ncopies;
5158 ctl->nparity = btrfs_raid_array[index].nparity;
5159 ctl->ndevs = 0;
5160
5161 switch (fs_devices->chunk_alloc_policy) {
5162 case BTRFS_CHUNK_ALLOC_REGULAR:
5163 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5164 break;
1cd6121f
NA
5165 case BTRFS_CHUNK_ALLOC_ZONED:
5166 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5167 break;
27c314d5
NA
5168 default:
5169 BUG();
5170 }
5171}
5172
560156cb
NA
5173static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5174 struct alloc_chunk_ctl *ctl,
5175 struct btrfs_device_info *devices_info)
b2117a39 5176{
560156cb 5177 struct btrfs_fs_info *info = fs_devices->fs_info;
ebcc9301 5178 struct btrfs_device *device;
73c5de00 5179 u64 total_avail;
560156cb 5180 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
73c5de00 5181 int ret;
560156cb
NA
5182 int ndevs = 0;
5183 u64 max_avail;
5184 u64 dev_offset;
0cad8a11 5185
9f680ce0 5186 /*
73c5de00
AJ
5187 * in the first pass through the devices list, we gather information
5188 * about the available holes on each device.
9f680ce0 5189 */
ebcc9301 5190 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
ebbede42 5191 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
31b1a2bd 5192 WARN(1, KERN_ERR
efe120a0 5193 "BTRFS: read-only device in alloc_list\n");
73c5de00
AJ
5194 continue;
5195 }
b2117a39 5196
e12c9621
AJ
5197 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5198 &device->dev_state) ||
401e29c1 5199 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
73c5de00 5200 continue;
b2117a39 5201
73c5de00
AJ
5202 if (device->total_bytes > device->bytes_used)
5203 total_avail = device->total_bytes - device->bytes_used;
5204 else
5205 total_avail = 0;
38c01b96 5206
5207 /* If there is no space on this device, skip it. */
6aafb303 5208 if (total_avail < ctl->dev_extent_min)
38c01b96 5209 continue;
b2117a39 5210
560156cb
NA
5211 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5212 &max_avail);
73c5de00 5213 if (ret && ret != -ENOSPC)
560156cb 5214 return ret;
b2117a39 5215
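 		/* A return of 0 means a hole of at least dev_extent_want bytes was found. */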
73c5de00 5216 if (ret == 0)
560156cb 5217 max_avail = dev_extent_want;
b2117a39 5218
6aafb303 5219 if (max_avail < ctl->dev_extent_min) {
4117f207
QW
5220 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5221 btrfs_debug(info,
560156cb 5222 "%s: devid %llu has no free space, have=%llu want=%llu",
4117f207 5223 __func__, device->devid, max_avail,
6aafb303 5224 ctl->dev_extent_min);
73c5de00 5225 continue;
4117f207 5226 }
b2117a39 5227
063d006f
ES
5228 if (ndevs == fs_devices->rw_devices) {
5229 WARN(1, "%s: found more than %llu devices\n",
5230 __func__, fs_devices->rw_devices);
5231 break;
5232 }
73c5de00
AJ
5233 devices_info[ndevs].dev_offset = dev_offset;
5234 devices_info[ndevs].max_avail = max_avail;
5235 devices_info[ndevs].total_avail = total_avail;
5236 devices_info[ndevs].dev = device;
5237 ++ndevs;
5238 }
560156cb 5239 ctl->ndevs = ndevs;
b2117a39 5240
73c5de00
AJ
5241 /*
5242 * now sort the devices by hole size / available space
5243 */
560156cb 5244 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
73c5de00 5245 btrfs_cmp_device_info, NULL);
b2117a39 5246
560156cb
NA
5247 return 0;
5248}
5249
5badf512
NA
5250static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5251 struct btrfs_device_info *devices_info)
5252{
5253 /* Number of stripes that count for block group size */
5254 int data_stripes;
5255
5256 /*
5257 * The primary goal is to maximize the number of stripes, so use as
5258 * many devices as possible, even if the stripes are not maximum sized.
5259 *
 5260 * The DUP profile stores more than one stripe per device; the
 5261 * max_avail is the total size, so we have to adjust.
5262 */
5263 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5264 ctl->dev_stripes);
5265 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5266
5267 /* This will have to be fixed for RAID1 and RAID10 over more drives */
5268 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5269
5270 /*
5271 * Use the number of data stripes to figure out how big this chunk is
5272 * really going to be in terms of logical address space, and compare
5273 * that answer with the max chunk size. If it's higher, we try to
5274 * reduce stripe_size.
5275 */
5276 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5277 /*
5278 * Reduce stripe_size, round it up to a 16MB boundary again and
 5279 * then use it, unless it ends up being even bigger than the
 5280 * value we already had.
5281 */
5282 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5283 data_stripes), SZ_16M),
5284 ctl->stripe_size);
5285 }
5286
5da431b7
QW
5287 /* Stripe size should not go beyond 1G. */
5288 ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
5289
5badf512
NA
5290 /* Align to BTRFS_STRIPE_LEN */
5291 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
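 	/* The chunk's logical size counts data stripes only, not copies or parity. */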
5292 ctl->chunk_size = ctl->stripe_size * data_stripes;
5293
5294 return 0;
5295}
5296
1cd6121f
NA
5297static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5298 struct btrfs_device_info *devices_info)
5299{
5300 u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5301 /* Number of stripes that count for block group size */
5302 int data_stripes;
5303
5304 /*
5305 * It should hold because:
5306 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
5307 */
5308 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5309
5310 ctl->stripe_size = zone_size;
5311 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5312 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5313
 5314 /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
5315 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
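 		/*
 		 * Solve for the device count that keeps the chunk at
 		 * max_chunk_size: stripes needed = max_chunk_size * ncopies /
 		 * stripe_size plus nparity, with dev_stripes per device.
 		 */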
5316 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5317 ctl->stripe_size) + ctl->nparity,
5318 ctl->dev_stripes);
5319 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5320 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5321 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5322 }
5323
5324 ctl->chunk_size = ctl->stripe_size * data_stripes;
5325
5326 return 0;
5327}
5328
5badf512
NA
5329static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5330 struct alloc_chunk_ctl *ctl,
5331 struct btrfs_device_info *devices_info)
5332{
5333 struct btrfs_fs_info *info = fs_devices->fs_info;
5334
5335 /*
 5336 * Round down to the number of usable stripes. devs_increment can be any
 5337 * number, so we can't use round_down(), which requires a power of 2;
 5338 * rounddown() is safe here.
5339 */
5340 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5341
5342 if (ctl->ndevs < ctl->devs_min) {
5343 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5344 btrfs_debug(info,
5345 "%s: not enough devices with free space: have=%d minimum required=%d",
5346 __func__, ctl->ndevs, ctl->devs_min);
5347 }
5348 return -ENOSPC;
5349 }
5350
5351 ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5352
5353 switch (fs_devices->chunk_alloc_policy) {
5354 case BTRFS_CHUNK_ALLOC_REGULAR:
5355 return decide_stripe_size_regular(ctl, devices_info);
1cd6121f
NA
5356 case BTRFS_CHUNK_ALLOC_ZONED:
5357 return decide_stripe_size_zoned(ctl, devices_info);
5badf512
NA
5358 default:
5359 BUG();
5360 }
5361}
5362
79bd3712 5363static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
dce580ca
NA
5364 struct alloc_chunk_ctl *ctl,
5365 struct btrfs_device_info *devices_info)
560156cb
NA
5366{
5367 struct btrfs_fs_info *info = trans->fs_info;
560156cb
NA
5368 struct map_lookup *map = NULL;
5369 struct extent_map_tree *em_tree;
79bd3712 5370 struct btrfs_block_group *block_group;
560156cb 5371 struct extent_map *em;
dce580ca
NA
5372 u64 start = ctl->start;
5373 u64 type = ctl->type;
560156cb
NA
5374 int ret;
5375 int i;
5376 int j;
5377
dce580ca
NA
5378 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5379 if (!map)
79bd3712 5380 return ERR_PTR(-ENOMEM);
dce580ca 5381 map->num_stripes = ctl->num_stripes;
560156cb 5382
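 	/* Place dev_stripes consecutive stripes on each selected device. */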
dce580ca
NA
5383 for (i = 0; i < ctl->ndevs; ++i) {
5384 for (j = 0; j < ctl->dev_stripes; ++j) {
5385 int s = i * ctl->dev_stripes + j;
73c5de00
AJ
5386 map->stripes[s].dev = devices_info[i].dev;
5387 map->stripes[s].physical = devices_info[i].dev_offset +
dce580ca 5388 j * ctl->stripe_size;
6324fbf3 5389 }
6324fbf3 5390 }
500ceed8
NB
5391 map->stripe_len = BTRFS_STRIPE_LEN;
5392 map->io_align = BTRFS_STRIPE_LEN;
5393 map->io_width = BTRFS_STRIPE_LEN;
2b82032c 5394 map->type = type;
dce580ca 5395 map->sub_stripes = ctl->sub_stripes;
0b86a832 5396
dce580ca 5397 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
1abe9b8a 5398
172ddd60 5399 em = alloc_extent_map();
2b82032c 5400 if (!em) {
298a8f9c 5401 kfree(map);
79bd3712 5402 return ERR_PTR(-ENOMEM);
593060d7 5403 }
298a8f9c 5404 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
95617d69 5405 em->map_lookup = map;
2b82032c 5406 em->start = start;
dce580ca 5407 em->len = ctl->chunk_size;
2b82032c
YZ
5408 em->block_start = 0;
5409 em->block_len = em->len;
dce580ca 5410 em->orig_block_len = ctl->stripe_size;
593060d7 5411
c8bf1b67 5412 em_tree = &info->mapping_tree;
890871be 5413 write_lock(&em_tree->lock);
09a2a8f9 5414 ret = add_extent_mapping(em_tree, em, 0);
0f5d42b2 5415 if (ret) {
1efb72a3 5416 write_unlock(&em_tree->lock);
0f5d42b2 5417 free_extent_map(em);
79bd3712 5418 return ERR_PTR(ret);
0f5d42b2 5419 }
1efb72a3
NB
5420 write_unlock(&em_tree->lock);
5421
79bd3712
FM
5422 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5423 if (IS_ERR(block_group))
6df9a95e 5424 goto error_del_extent;
2b82032c 5425
bbbf7243
NB
5426 for (i = 0; i < map->num_stripes; i++) {
5427 struct btrfs_device *dev = map->stripes[i].dev;
5428
4f2bafe8 5429 btrfs_device_set_bytes_used(dev,
dce580ca 5430 dev->bytes_used + ctl->stripe_size);
bbbf7243
NB
5431 if (list_empty(&dev->post_commit_list))
5432 list_add_tail(&dev->post_commit_list,
5433 &trans->transaction->dev_update_list);
5434 }
43530c46 5435
dce580ca 5436 atomic64_sub(ctl->stripe_size * map->num_stripes,
4f2bafe8 5437 &info->free_chunk_space);
1c116187 5438
0f5d42b2 5439 free_extent_map(em);
0b246afa 5440 check_raid56_incompat_flag(info, type);
cfbb825c 5441 check_raid1c34_incompat_flag(info, type);
53b381b3 5442
79bd3712 5443 return block_group;
b2117a39 5444
6df9a95e 5445error_del_extent:
0f5d42b2
JB
5446 write_lock(&em_tree->lock);
5447 remove_extent_mapping(em_tree, em);
5448 write_unlock(&em_tree->lock);
5449
5450 /* One for our allocation */
5451 free_extent_map(em);
5452 /* One for the tree reference */
5453 free_extent_map(em);
dce580ca 5454
79bd3712 5455 return block_group;
dce580ca
NA
5456}
5457
f6f39f7a 5458struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
79bd3712 5459 u64 type)
dce580ca
NA
5460{
5461 struct btrfs_fs_info *info = trans->fs_info;
5462 struct btrfs_fs_devices *fs_devices = info->fs_devices;
5463 struct btrfs_device_info *devices_info = NULL;
5464 struct alloc_chunk_ctl ctl;
79bd3712 5465 struct btrfs_block_group *block_group;
dce580ca
NA
5466 int ret;
5467
11c67b1a
NB
5468 lockdep_assert_held(&info->chunk_mutex);
5469
dce580ca
NA
5470 if (!alloc_profile_is_valid(type, 0)) {
5471 ASSERT(0);
79bd3712 5472 return ERR_PTR(-EINVAL);
dce580ca
NA
5473 }
5474
5475 if (list_empty(&fs_devices->alloc_list)) {
5476 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5477 btrfs_debug(info, "%s: no writable device", __func__);
79bd3712 5478 return ERR_PTR(-ENOSPC);
dce580ca
NA
5479 }
5480
5481 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5482 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5483 ASSERT(0);
79bd3712 5484 return ERR_PTR(-EINVAL);
dce580ca
NA
5485 }
5486
11c67b1a 5487 ctl.start = find_next_chunk(info);
dce580ca
NA
5488 ctl.type = type;
5489 init_alloc_chunk_ctl(fs_devices, &ctl);
5490
5491 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5492 GFP_NOFS);
5493 if (!devices_info)
79bd3712 5494 return ERR_PTR(-ENOMEM);
dce580ca
NA
5495
5496 ret = gather_device_info(fs_devices, &ctl, devices_info);
79bd3712
FM
5497 if (ret < 0) {
5498 block_group = ERR_PTR(ret);
dce580ca 5499 goto out;
79bd3712 5500 }
dce580ca
NA
5501
5502 ret = decide_stripe_size(fs_devices, &ctl, devices_info);
79bd3712
FM
5503 if (ret < 0) {
5504 block_group = ERR_PTR(ret);
dce580ca 5505 goto out;
79bd3712 5506 }
dce580ca 5507
79bd3712 5508 block_group = create_chunk(trans, &ctl, devices_info);
dce580ca
NA
5509
5510out:
b2117a39 5511 kfree(devices_info);
79bd3712 5512 return block_group;
2b82032c
YZ
5513}
5514
79bd3712
FM
5515/*
 5516 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
 5517 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating
 5518 * system chunks.
5519 *
5520 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5521 * phases.
5522 */
5523int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5524 struct btrfs_block_group *bg)
5525{
5526 struct btrfs_fs_info *fs_info = trans->fs_info;
79bd3712
FM
5527 struct btrfs_root *chunk_root = fs_info->chunk_root;
5528 struct btrfs_key key;
5529 struct btrfs_chunk *chunk;
5530 struct btrfs_stripe *stripe;
5531 struct extent_map *em;
5532 struct map_lookup *map;
5533 size_t item_size;
5534 int i;
5535 int ret;
5536
5537 /*
5538 * We take the chunk_mutex for 2 reasons:
5539 *
5540 * 1) Updates and insertions in the chunk btree must be done while holding
5541 * the chunk_mutex, as well as updating the system chunk array in the
5542 * superblock. See the comment on top of btrfs_chunk_alloc() for the
5543 * details;
5544 *
5545 * 2) To prevent races with the final phase of a device replace operation
5546 * that replaces the device object associated with the map's stripes,
5547 * because the device object's id can change at any time during that
5548 * final phase of the device replace operation
5549 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5550 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5551 * which would cause a failure when updating the device item, which does
 5552 * not exist, or persisting a stripe of the chunk item with such an ID.
5553 * Here we can't use the device_list_mutex because our caller already
5554 * has locked the chunk_mutex, and the final phase of device replace
5555 * acquires both mutexes - first the device_list_mutex and then the
5556 * chunk_mutex. Using any of those two mutexes protects us from a
5557 * concurrent device replace.
5558 */
5559 lockdep_assert_held(&fs_info->chunk_mutex);
5560
5561 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5562 if (IS_ERR(em)) {
5563 ret = PTR_ERR(em);
5564 btrfs_abort_transaction(trans, ret);
5565 return ret;
5566 }
5567
5568 map = em->map_lookup;
5569 item_size = btrfs_chunk_item_size(map->num_stripes);
5570
5571 chunk = kzalloc(item_size, GFP_NOFS);
5572 if (!chunk) {
5573 ret = -ENOMEM;
5574 btrfs_abort_transaction(trans, ret);
50460e37 5575 goto out;
2b82032c
YZ
5576 }
5577
79bd3712
FM
5578 for (i = 0; i < map->num_stripes; i++) {
5579 struct btrfs_device *device = map->stripes[i].dev;
5580
5581 ret = btrfs_update_device(trans, device);
5582 if (ret)
5583 goto out;
5584 }
5585
2b82032c 5586 stripe = &chunk->stripe;
6df9a95e 5587 for (i = 0; i < map->num_stripes; i++) {
79bd3712
FM
5588 struct btrfs_device *device = map->stripes[i].dev;
5589 const u64 dev_offset = map->stripes[i].physical;
0b86a832 5590
e17cade2
CM
5591 btrfs_set_stack_stripe_devid(stripe, device->devid);
5592 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5593 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2b82032c 5594 stripe++;
0b86a832
CM
5595 }
5596
79bd3712 5597 btrfs_set_stack_chunk_length(chunk, bg->length);
fd51eb2f 5598 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
2b82032c
YZ
5599 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5600 btrfs_set_stack_chunk_type(chunk, map->type);
5601 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5602 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5603 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
0b246afa 5604 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
2b82032c 5605 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
0b86a832 5606
2b82032c
YZ
5607 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5608 key.type = BTRFS_CHUNK_ITEM_KEY;
79bd3712 5609 key.offset = bg->start;
0b86a832 5610
2b82032c 5611 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
79bd3712
FM
5612 if (ret)
5613 goto out;
5614
3349b57f 5615 set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
79bd3712
FM
5616
5617 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2ff7e61e 5618 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
79bd3712
FM
5619 if (ret)
5620 goto out;
8f18cf13 5621 }
1abe9b8a 5622
6df9a95e 5623out:
0b86a832 5624 kfree(chunk);
6df9a95e 5625 free_extent_map(em);
4ed1d16e 5626 return ret;
2b82032c 5627}
0b86a832 5628
6f8e0fc7 5629static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
2b82032c 5630{
6f8e0fc7 5631 struct btrfs_fs_info *fs_info = trans->fs_info;
2b82032c 5632 u64 alloc_profile;
79bd3712
FM
5633 struct btrfs_block_group *meta_bg;
5634 struct btrfs_block_group *sys_bg;
5635
5636 /*
5637 * When adding a new device for sprouting, the seed device is read-only
5638 * so we must first allocate a metadata and a system chunk. But before
5639 * adding the block group items to the extent, device and chunk btrees,
5640 * we must first:
5641 *
5642 * 1) Create both chunks without doing any changes to the btrees, as
5643 * otherwise we would get -ENOSPC since the block groups from the
5644 * seed device are read-only;
5645 *
5646 * 2) Add the device item for the new sprout device - finishing the setup
5647 * of a new block group requires updating the device item in the chunk
5648 * btree, so it must exist when we attempt to do it. The previous step
5649 * ensures this does not fail with -ENOSPC.
5650 *
5651 * After that we can add the block group items to their btrees:
5652 * update existing device item in the chunk btree, add a new block group
5653 * item to the extent btree, add a new chunk item to the chunk btree and
5654 * finally add the new device extent items to the devices btree.
5655 */
2b82032c 5656
1b86826d 5657 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
f6f39f7a 5658 meta_bg = btrfs_create_chunk(trans, alloc_profile);
79bd3712
FM
5659 if (IS_ERR(meta_bg))
5660 return PTR_ERR(meta_bg);
2b82032c 5661
1b86826d 5662 alloc_profile = btrfs_system_alloc_profile(fs_info);
f6f39f7a 5663 sys_bg = btrfs_create_chunk(trans, alloc_profile);
79bd3712
FM
5664 if (IS_ERR(sys_bg))
5665 return PTR_ERR(sys_bg);
5666
5667 return 0;
2b82032c
YZ
5668}
5669
d20983b4
MX
5670static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5671{
fc9a2ac7 5672 const int index = btrfs_bg_flags_to_raid_index(map->type);
2b82032c 5673
fc9a2ac7 5674 return btrfs_raid_array[index].tolerated_failures;
2b82032c
YZ
5675}
5676
a09f23c3 5677bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2b82032c
YZ
5678{
5679 struct extent_map *em;
5680 struct map_lookup *map;
d20983b4 5681 int miss_ndevs = 0;
2b82032c 5682 int i;
a09f23c3 5683 bool ret = true;
2b82032c 5684
60ca842e 5685 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
592d92ee 5686 if (IS_ERR(em))
a09f23c3 5687 return false;
2b82032c 5688
95617d69 5689 map = em->map_lookup;
2b82032c 5690 for (i = 0; i < map->num_stripes; i++) {
e6e674bd
AJ
5691 if (test_bit(BTRFS_DEV_STATE_MISSING,
5692 &map->stripes[i].dev->dev_state)) {
d20983b4
MX
5693 miss_ndevs++;
5694 continue;
5695 }
ebbede42
AJ
5696 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5697 &map->stripes[i].dev->dev_state)) {
a09f23c3 5698 ret = false;
d20983b4 5699 goto end;
2b82032c
YZ
5700 }
5701 }
d20983b4
MX
5702
5703 /*
a09f23c3
AJ
 5704 * If the number of missing devices is larger than max errors, we
 5705 * cannot write the data into that chunk successfully.
d20983b4
MX
5706 */
5707 if (miss_ndevs > btrfs_chunk_max_errors(map))
a09f23c3 5708 ret = false;
d20983b4 5709end:
0b86a832 5710 free_extent_map(em);
a09f23c3 5711 return ret;
0b86a832
CM
5712}
5713
c8bf1b67 5714void btrfs_mapping_tree_free(struct extent_map_tree *tree)
0b86a832
CM
5715{
5716 struct extent_map *em;
5717
d397712b 5718 while (1) {
c8bf1b67
DS
5719 write_lock(&tree->lock);
5720 em = lookup_extent_mapping(tree, 0, (u64)-1);
0b86a832 5721 if (em)
c8bf1b67
DS
5722 remove_extent_mapping(tree, em);
5723 write_unlock(&tree->lock);
0b86a832
CM
5724 if (!em)
5725 break;
0b86a832
CM
5726 /* once for us */
5727 free_extent_map(em);
5728 /* once for the tree */
5729 free_extent_map(em);
5730 }
5731}
5732
5d964051 5733int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
f188591e
CM
5734{
5735 struct extent_map *em;
5736 struct map_lookup *map;
6d322b48
QW
5737 enum btrfs_raid_types index;
5738 int ret = 1;
f188591e 5739
60ca842e 5740 em = btrfs_get_chunk_map(fs_info, logical, len);
592d92ee
LB
5741 if (IS_ERR(em))
5742 /*
5743 * We could return errors for these cases, but that could get
 5744 * ugly and we'd probably do the same thing anyway, which is to do
 5745 * nothing else and exit, so return 1 so the callers don't try
5746 * to use other copies.
5747 */
fb7669b5 5748 return 1;
fb7669b5 5749
95617d69 5750 map = em->map_lookup;
6d322b48
QW
5751 index = btrfs_bg_flags_to_raid_index(map->type);
5752
5753 /* Non-RAID56, use their ncopies from btrfs_raid_array. */
5754 if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5755 ret = btrfs_raid_array[index].ncopies;
53b381b3
DW
5756 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
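 		/* The original data plus one reconstruction from parity */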
5757 ret = 2;
5758 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
8810f751
LB
5759 /*
 5760 * There could be two corrupted data stripes, so we need
 5761 * to retry in a loop in order to rebuild the correct data.
e7e02096 5762 *
8810f751
LB
5763 * Fail a stripe at a time on every retry except the
5764 * stripe under reconstruction.
5765 */
5766 ret = map->num_stripes;
f188591e 5767 free_extent_map(em);
ad6d620e 5768
cb5583dd 5769 down_read(&fs_info->dev_replace.rwsem);
6fad823f
LB
5770 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5771 fs_info->dev_replace.tgtdev)
ad6d620e 5772 ret++;
cb5583dd 5773 up_read(&fs_info->dev_replace.rwsem);
ad6d620e 5774
f188591e
CM
5775 return ret;
5776}
5777
2ff7e61e 5778unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
53b381b3
DW
5779 u64 logical)
5780{
5781 struct extent_map *em;
5782 struct map_lookup *map;
0b246afa 5783 unsigned long len = fs_info->sectorsize;
53b381b3 5784
b036f479
QW
5785 if (!btrfs_fs_incompat(fs_info, RAID56))
5786 return len;
5787
60ca842e 5788 em = btrfs_get_chunk_map(fs_info, logical, len);
53b381b3 5789
69f03f13
NB
5790 if (!WARN_ON(IS_ERR(em))) {
5791 map = em->map_lookup;
5792 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5793 len = map->stripe_len * nr_data_stripes(map);
5794 free_extent_map(em);
5795 }
53b381b3
DW
5796 return len;
5797}
5798
e4ff5fb5 5799int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
53b381b3
DW
5800{
5801 struct extent_map *em;
5802 struct map_lookup *map;
53b381b3
DW
5803 int ret = 0;
5804
b036f479
QW
5805 if (!btrfs_fs_incompat(fs_info, RAID56))
5806 return 0;
5807
60ca842e 5808 em = btrfs_get_chunk_map(fs_info, logical, len);
53b381b3 5809
69f03f13
NB
 5810 if (!WARN_ON(IS_ERR(em))) {
5811 map = em->map_lookup;
5812 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5813 ret = 1;
5814 free_extent_map(em);
5815 }
53b381b3
DW
5816 return ret;
5817}
5818
30d9861f 5819static int find_live_mirror(struct btrfs_fs_info *fs_info,
99f92a7c 5820 struct map_lookup *map, int first,
8ba0ae78 5821 int dev_replace_is_ongoing)
dfe25020
CM
5822{
5823 int i;
99f92a7c 5824 int num_stripes;
8ba0ae78 5825 int preferred_mirror;
30d9861f
SB
5826 int tolerance;
5827 struct btrfs_device *srcdev;
5828
99f92a7c 5829 ASSERT((map->type &
c7369b3f 5830 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
99f92a7c
AJ
5831
5832 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5833 num_stripes = map->sub_stripes;
5834 else
5835 num_stripes = map->num_stripes;
5836
33fd2f71
AJ
5837 switch (fs_info->fs_devices->read_policy) {
5838 default:
5839 /* Shouldn't happen, just warn and use pid instead of failing */
5840 btrfs_warn_rl(fs_info,
5841 "unknown read_policy type %u, reset to pid",
5842 fs_info->fs_devices->read_policy);
5843 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5844 fallthrough;
5845 case BTRFS_READ_POLICY_PID:
5846 preferred_mirror = first + (current->pid % num_stripes);
5847 break;
5848 }
8ba0ae78 5849
30d9861f
SB
5850 if (dev_replace_is_ongoing &&
5851 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5852 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5853 srcdev = fs_info->dev_replace.srcdev;
5854 else
5855 srcdev = NULL;
5856
5857 /*
 5858 * try to avoid the drive that is the source drive for a
 5859 * dev-replace procedure; only choose it if no other non-missing
 5860 * mirror is available
5861 */
5862 for (tolerance = 0; tolerance < 2; tolerance++) {
8ba0ae78
AJ
5863 if (map->stripes[preferred_mirror].dev->bdev &&
5864 (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5865 return preferred_mirror;
99f92a7c 5866 for (i = first; i < first + num_stripes; i++) {
30d9861f
SB
5867 if (map->stripes[i].dev->bdev &&
5868 (tolerance || map->stripes[i].dev != srcdev))
5869 return i;
5870 }
dfe25020 5871 }
30d9861f 5872
dfe25020
CM
5873 /* we couldn't find one that doesn't fail. Just return something
5874 * and the io error handling code will clean up eventually
5875 */
8ba0ae78 5876 return preferred_mirror;
dfe25020
CM
5877}
5878
53b381b3 5879/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4c664611 5880static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
53b381b3 5881{
53b381b3 5882 int i;
53b381b3
DW
5883 int again = 1;
5884
5885 while (again) {
5886 again = 0;
cc7539ed 5887 for (i = 0; i < num_stripes - 1; i++) {
eeb6f172 5888 /* Swap if parity is on a smaller index */
4c664611
QW
5889 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
5890 swap(bioc->stripes[i], bioc->stripes[i + 1]);
5891 swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
53b381b3
DW
5892 again = 1;
5893 }
5894 }
5895 }
5896}
5897
731ccf15
QW
5898static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
5899 int total_stripes,
4c664611 5900 int real_stripes)
6e9606d2 5901{
4c664611
QW
5902 struct btrfs_io_context *bioc = kzalloc(
5903 /* The size of btrfs_io_context */
5904 sizeof(struct btrfs_io_context) +
5905 /* Plus the variable array for the stripes */
5906 sizeof(struct btrfs_io_stripe) * (total_stripes) +
5907 /* Plus the variable array for the tgt dev */
6e9606d2 5908 sizeof(int) * (real_stripes) +
e57cf21e 5909 /*
4c664611
QW
5910 * Plus the raid_map, which includes both the tgt dev
5911 * and the stripes.
e57cf21e
CM
5912 */
5913 sizeof(u64) * (total_stripes),
277fb5fc 5914 GFP_NOFS|__GFP_NOFAIL);
6e9606d2 5915
4c664611 5916 refcount_set(&bioc->refs, 1);
6e9606d2 5917
731ccf15 5918 bioc->fs_info = fs_info;
4c664611
QW
5919 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
5920 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
608769a4 5921
4c664611 5922 return bioc;
6e9606d2
ZL
5923}
5924
4c664611 5925void btrfs_get_bioc(struct btrfs_io_context *bioc)
6e9606d2 5926{
4c664611
QW
5927 WARN_ON(!refcount_read(&bioc->refs));
5928 refcount_inc(&bioc->refs);
6e9606d2
ZL
5929}
5930
4c664611 5931void btrfs_put_bioc(struct btrfs_io_context *bioc)
6e9606d2 5932{
4c664611 5933 if (!bioc)
6e9606d2 5934 return;
4c664611
QW
5935 if (refcount_dec_and_test(&bioc->refs))
5936 kfree(bioc);
6e9606d2
ZL
5937}
5938
0b3d4cd3
LB
5939/*
 5940 * Please note that discard won't be sent to the target device of a
 5941 * device replace.
5942 */
a4012f06
CH
5943struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
5944 u64 logical, u64 *length_ret,
5945 u32 *num_stripes)
0b3d4cd3
LB
5946{
5947 struct extent_map *em;
5948 struct map_lookup *map;
a4012f06 5949 struct btrfs_discard_stripe *stripes;
6b7faadd 5950 u64 length = *length_ret;
0b3d4cd3
LB
5951 u64 offset;
5952 u64 stripe_nr;
5953 u64 stripe_nr_end;
5954 u64 stripe_end_offset;
5955 u64 stripe_cnt;
5956 u64 stripe_len;
5957 u64 stripe_offset;
0b3d4cd3
LB
5958 u32 stripe_index;
5959 u32 factor = 0;
5960 u32 sub_stripes = 0;
5961 u64 stripes_per_dev = 0;
5962 u32 remaining_stripes = 0;
5963 u32 last_stripe = 0;
a4012f06 5964 int ret;
0b3d4cd3
LB
5965 int i;
5966
60ca842e 5967 em = btrfs_get_chunk_map(fs_info, logical, length);
0b3d4cd3 5968 if (IS_ERR(em))
a4012f06 5969 return ERR_CAST(em);
0b3d4cd3
LB
5970
5971 map = em->map_lookup;
a4012f06 5972
0b3d4cd3
LB
5973 /* we don't discard raid56 yet */
5974 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5975 ret = -EOPNOTSUPP;
a4012f06
CH
5976 goto out_free_map;
 5977 }
0b3d4cd3
LB
5978
5979 offset = logical - em->start;
2d974619 5980 length = min_t(u64, em->start + em->len - logical, length);
6b7faadd 5981 *length_ret = length;
0b3d4cd3
LB
5982
5983 stripe_len = map->stripe_len;
5984 /*
5985 * stripe_nr counts the total number of stripes we have to stride
5986 * to get to this block
5987 */
5988 stripe_nr = div64_u64(offset, stripe_len);
5989
5990 /* stripe_offset is the offset of this block in its stripe */
5991 stripe_offset = offset - stripe_nr * stripe_len;
5992
5993 stripe_nr_end = round_up(offset + length, map->stripe_len);
42c61ab6 5994 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
0b3d4cd3
LB
5995 stripe_cnt = stripe_nr_end - stripe_nr;
5996 stripe_end_offset = stripe_nr_end * map->stripe_len -
5997 (offset + length);
5998 /*
5999 * after this, stripe_nr is the number of stripes on this
6000 * device we have to walk to find the data, and stripe_index is
6001 * the number of our device in the stripe array
6002 */
a4012f06 6003 *num_stripes = 1;
0b3d4cd3
LB
6004 stripe_index = 0;
6005 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6006 BTRFS_BLOCK_GROUP_RAID10)) {
6007 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
6008 sub_stripes = 1;
6009 else
6010 sub_stripes = map->sub_stripes;
6011
6012 factor = map->num_stripes / sub_stripes;
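 		/*
 		 * factor is the number of independent stripe columns; with
 		 * RAID10 each column holds sub_stripes mirrored copies.
 		 */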
a4012f06 6013 *num_stripes = min_t(u64, map->num_stripes,
0b3d4cd3
LB
6014 sub_stripes * stripe_cnt);
6015 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6016 stripe_index *= sub_stripes;
6017 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
6018 &remaining_stripes);
6019 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
6020 last_stripe *= sub_stripes;
c7369b3f 6021 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
0b3d4cd3 6022 BTRFS_BLOCK_GROUP_DUP)) {
a4012f06 6023 *num_stripes = map->num_stripes;
0b3d4cd3
LB
6024 } else {
6025 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6026 &stripe_index);
6027 }
6028
a4012f06
CH
6029 stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
6030 if (!stripes) {
0b3d4cd3 6031 ret = -ENOMEM;
a4012f06 6032 goto out_free_map;
0b3d4cd3
LB
6033 }
6034
a4012f06
CH
6035 for (i = 0; i < *num_stripes; i++) {
6036 stripes[i].physical =
0b3d4cd3
LB
6037 map->stripes[stripe_index].physical +
6038 stripe_offset + stripe_nr * map->stripe_len;
a4012f06 6039 stripes[i].dev = map->stripes[stripe_index].dev;
0b3d4cd3
LB
6040
6041 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6042 BTRFS_BLOCK_GROUP_RAID10)) {
a4012f06 6043 stripes[i].length = stripes_per_dev * map->stripe_len;
0b3d4cd3
LB
6044
6045 if (i / sub_stripes < remaining_stripes)
a4012f06 6046 stripes[i].length += map->stripe_len;
0b3d4cd3
LB
6047
6048 /*
6049 * Special for the first stripe and
6050 * the last stripe:
6051 *
6052 * |-------|...|-------|
6053 * |----------|
6054 * off end_off
6055 */
6056 if (i < sub_stripes)
a4012f06 6057 stripes[i].length -= stripe_offset;
0b3d4cd3
LB
6058
6059 if (stripe_index >= last_stripe &&
6060 stripe_index <= (last_stripe +
6061 sub_stripes - 1))
a4012f06 6062 stripes[i].length -= stripe_end_offset;
0b3d4cd3
LB
6063
6064 if (i == sub_stripes - 1)
6065 stripe_offset = 0;
6066 } else {
a4012f06 6067 stripes[i].length = length;
0b3d4cd3
LB
6068 }
6069
6070 stripe_index++;
6071 if (stripe_index == map->num_stripes) {
6072 stripe_index = 0;
6073 stripe_nr++;
6074 }
6075 }
6076
0b3d4cd3 6077 free_extent_map(em);
a4012f06
CH
6078 return stripes;
6079out_free_map:
6080 free_extent_map(em);
6081 return ERR_PTR(ret);
0b3d4cd3
LB
6082}
6083
5ab56090
LB
6084/*
6085 * In dev-replace case, for repair case (that's the only case where the mirror
6086 * is selected explicitly when calling btrfs_map_block), blocks left of the
6087 * left cursor can also be read from the target drive.
6088 *
6089 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
6090 * array of stripes.
6091 * For READ, it also needs to be supported using the same mirror number.
6092 *
6093 * If the requested block is not left of the left cursor, EIO is returned. This
6094 * can happen because btrfs_num_copies() returns one more in the dev-replace
6095 * case.
6096 */
6097static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6098 u64 logical, u64 length,
6099 u64 srcdev_devid, int *mirror_num,
6100 u64 *physical)
6101{
4c664611 6102 struct btrfs_io_context *bioc = NULL;
5ab56090
LB
6103 int num_stripes;
6104 int index_srcdev = 0;
6105 int found = 0;
6106 u64 physical_of_found = 0;
6107 int i;
6108 int ret = 0;
6109
6110 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
03793cbb 6111 logical, &length, &bioc, NULL, NULL, 0);
5ab56090 6112 if (ret) {
4c664611 6113 ASSERT(bioc == NULL);
5ab56090
LB
6114 return ret;
6115 }
6116
4c664611 6117 num_stripes = bioc->num_stripes;
5ab56090
LB
6118 if (*mirror_num > num_stripes) {
6119 /*
6120 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6121 * that means that the requested area is not left of the left
6122 * cursor
6123 */
4c664611 6124 btrfs_put_bioc(bioc);
5ab56090
LB
6125 return -EIO;
6126 }
6127
6128 /*
6129 * process the rest of the function using the mirror_num of the source
6130 * drive. Therefore look it up first. At the end, patch the device
6131 * pointer to the one of the target drive.
6132 */
6133 for (i = 0; i < num_stripes; i++) {
4c664611 6134 if (bioc->stripes[i].dev->devid != srcdev_devid)
5ab56090
LB
6135 continue;
6136
6137 /*
6138 * In case of DUP, in order to keep it simple, only add the
6139 * mirror with the lowest physical address
6140 */
6141 if (found &&
4c664611 6142 physical_of_found <= bioc->stripes[i].physical)
5ab56090
LB
6143 continue;
6144
6145 index_srcdev = i;
6146 found = 1;
4c664611 6147 physical_of_found = bioc->stripes[i].physical;
5ab56090
LB
6148 }
6149
4c664611 6150 btrfs_put_bioc(bioc);
5ab56090
LB
6151
6152 ASSERT(found);
6153 if (!found)
6154 return -EIO;
6155
6156 *mirror_num = index_srcdev + 1;
6157 *physical = physical_of_found;
6158 return ret;
6159}
6160
6143c23c
NA
6161static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6162{
6163 struct btrfs_block_group *cache;
6164 bool ret;
6165
de17addc 6166 /* A non-zoned filesystem does not use the "to_copy" flag */
6143c23c
NA
6167 if (!btrfs_is_zoned(fs_info))
6168 return false;
6169
6170 cache = btrfs_lookup_block_group(fs_info, logical);
6171
3349b57f 6172 ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
6143c23c
NA
6173
6174 btrfs_put_block_group(cache);
6175 return ret;
6176}
6177
73c0f228 6178static void handle_ops_on_dev_replace(enum btrfs_map_op op,
4c664611 6179 struct btrfs_io_context **bioc_ret,
73c0f228 6180 struct btrfs_dev_replace *dev_replace,
6143c23c 6181 u64 logical,
73c0f228
LB
6182 int *num_stripes_ret, int *max_errors_ret)
6183{
4c664611 6184 struct btrfs_io_context *bioc = *bioc_ret;
73c0f228
LB
6185 u64 srcdev_devid = dev_replace->srcdev->devid;
6186 int tgtdev_indexes = 0;
6187 int num_stripes = *num_stripes_ret;
6188 int max_errors = *max_errors_ret;
6189 int i;
6190
6191 if (op == BTRFS_MAP_WRITE) {
6192 int index_where_to_add;
6193
6143c23c
NA
6194 /*
 6195 * A block group which has "to_copy" set will eventually be
 6196 * copied by the dev-replace process. We can avoid cloning IO here.
6197 */
6198 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6199 return;
6200
73c0f228
LB
6201 /*
6202 * duplicate the write operations while the dev replace
6203 * procedure is running. Since the copying of the old disk to
6204 * the new disk takes place at run time while the filesystem is
6205 * mounted writable, the regular write operations to the old
6206 * disk have to be duplicated to go to the new disk as well.
6207 *
6208 * Note that device->missing is handled by the caller, and that
6209 * the write to the old disk is already set up in the stripes
6210 * array.
6211 */
6212 index_where_to_add = num_stripes;
6213 for (i = 0; i < num_stripes; i++) {
4c664611 6214 if (bioc->stripes[i].dev->devid == srcdev_devid) {
73c0f228 6215 /* write to new disk, too */
4c664611
QW
6216 struct btrfs_io_stripe *new =
6217 bioc->stripes + index_where_to_add;
6218 struct btrfs_io_stripe *old =
6219 bioc->stripes + i;
73c0f228
LB
6220
6221 new->physical = old->physical;
73c0f228 6222 new->dev = dev_replace->tgtdev;
4c664611 6223 bioc->tgtdev_map[i] = index_where_to_add;
73c0f228
LB
6224 index_where_to_add++;
6225 max_errors++;
6226 tgtdev_indexes++;
6227 }
6228 }
6229 num_stripes = index_where_to_add;
6230 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6231 int index_srcdev = 0;
6232 int found = 0;
6233 u64 physical_of_found = 0;
6234
6235 /*
6236 * During the dev-replace procedure, the target drive can also
6237 * be used to read data in case it is needed to repair a corrupt
6238 * block elsewhere. This is possible if the requested area is
6239 * left of the left cursor. In this area, the target drive is a
6240 * full copy of the source drive.
6241 */
6242 for (i = 0; i < num_stripes; i++) {
4c664611 6243 if (bioc->stripes[i].dev->devid == srcdev_devid) {
73c0f228
LB
6244 /*
6245 * In case of DUP, in order to keep it simple,
6246 * only add the mirror with the lowest physical
6247 * address
6248 */
6249 if (found &&
4c664611 6250 physical_of_found <= bioc->stripes[i].physical)
73c0f228
LB
6251 continue;
6252 index_srcdev = i;
6253 found = 1;
4c664611 6254 physical_of_found = bioc->stripes[i].physical;
73c0f228
LB
6255 }
6256 }
6257 if (found) {
4c664611
QW
6258 struct btrfs_io_stripe *tgtdev_stripe =
6259 bioc->stripes + num_stripes;
73c0f228
LB
6260
6261 tgtdev_stripe->physical = physical_of_found;
73c0f228 6262 tgtdev_stripe->dev = dev_replace->tgtdev;
4c664611 6263 bioc->tgtdev_map[index_srcdev] = num_stripes;
73c0f228
LB
6264
6265 tgtdev_indexes++;
6266 num_stripes++;
6267 }
6268 }
6269
6270 *num_stripes_ret = num_stripes;
6271 *max_errors_ret = max_errors;
4c664611
QW
6272 bioc->num_tgtdevs = tgtdev_indexes;
6273 *bioc_ret = bioc;
73c0f228
LB
6274}
6275
2b19a1fe
LB
6276static bool need_full_stripe(enum btrfs_map_op op)
6277{
6278 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6279}
6280
5f141126 6281/*
42034313
MR
6282 * Calculate the geometry of a particular (address, len) tuple. This
6283 * information is used to calculate how big a particular bio can get before it
6284 * straddles a stripe.
5f141126 6285 *
42034313
MR
6286 * @fs_info: the filesystem
6287 * @em: mapping containing the logical extent
6288 * @op: type of operation - write or read
6289 * @logical: address that we want to figure out the geometry of
42034313 6290 * @io_geom: pointer used to return values
5f141126
NB
6291 *
 6292 * Returns < 0 in case a chunk for the given logical address cannot be found
 6293 * (which usually shouldn't happen unless @logical is corrupted), 0 otherwise.
6294 */
42034313 6295int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
43c0d1a5 6296 enum btrfs_map_op op, u64 logical,
42034313 6297 struct btrfs_io_geometry *io_geom)
5f141126 6298{
5f141126 6299 struct map_lookup *map;
43c0d1a5 6300 u64 len;
5f141126
NB
6301 u64 offset;
6302 u64 stripe_offset;
6303 u64 stripe_nr;
cc353a8b 6304 u32 stripe_len;
5f141126
NB
6305 u64 raid56_full_stripe_start = (u64)-1;
6306 int data_stripes;
6307
6308 ASSERT(op != BTRFS_MAP_DISCARD);
6309
5f141126
NB
6310 map = em->map_lookup;
6311 /* Offset of this logical address in the chunk */
6312 offset = logical - em->start;
6313 /* Len of a stripe in a chunk */
6314 stripe_len = map->stripe_len;
cc353a8b
QW
6315 /*
6316 * Stripe_nr is where this block falls in
6317 * stripe_offset is the offset of this block in its stripe.
6318 */
6319 stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
6320 ASSERT(stripe_offset < U32_MAX);
5f141126 6321
5f141126
NB
6322 data_stripes = nr_data_stripes(map);
6323
bf08387f
QW
6324 /* Only stripe based profiles needs to check against stripe length. */
6325 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
5f141126
NB
6326 u64 max_len = stripe_len - stripe_offset;
6327
6328 /*
6329 * In case of raid56, we need to know the stripe aligned start
6330 */
6331 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6332 unsigned long full_stripe_len = stripe_len * data_stripes;
6333 raid56_full_stripe_start = offset;
6334
6335 /*
6336 * Allow a write of a full stripe, but make sure we
6337 * don't allow straddling of stripes
6338 */
6339 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6340 full_stripe_len);
6341 raid56_full_stripe_start *= full_stripe_len;
6342
6343 /*
6344 * For writes to RAID[56], allow a full stripeset across
6345 * all disks. For other RAID types and for RAID[56]
6346 * reads, just allow a single stripe (on a single disk).
6347 */
6348 if (op == BTRFS_MAP_WRITE) {
6349 max_len = stripe_len * data_stripes -
6350 (offset - raid56_full_stripe_start);
6351 }
6352 }
6353 len = min_t(u64, em->len - offset, max_len);
6354 } else {
6355 len = em->len - offset;
6356 }
6357
6358 io_geom->len = len;
6359 io_geom->offset = offset;
6360 io_geom->stripe_len = stripe_len;
6361 io_geom->stripe_nr = stripe_nr;
6362 io_geom->stripe_offset = stripe_offset;
6363 io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6364
42034313 6365 return 0;
5f141126
NB
6366}
6367
03793cbb
CH
6368static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
6369 u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
6370{
6371 dst->dev = map->stripes[stripe_index].dev;
6372 dst->physical = map->stripes[stripe_index].physical +
6373 stripe_offset + stripe_nr * map->stripe_len;
6374}
6375
cf8cddd3 6376static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
03793cbb 6377 enum btrfs_map_op op, u64 logical, u64 *length,
4c664611 6378 struct btrfs_io_context **bioc_ret,
03793cbb
CH
6379 struct btrfs_io_stripe *smap,
6380 int *mirror_num_ret, int need_raid_map)
0b86a832
CM
6381{
6382 struct extent_map *em;
6383 struct map_lookup *map;
593060d7
CM
6384 u64 stripe_offset;
6385 u64 stripe_nr;
53b381b3 6386 u64 stripe_len;
9d644a62 6387 u32 stripe_index;
cff82672 6388 int data_stripes;
cea9e445 6389 int i;
de11cc12 6390 int ret = 0;
03793cbb 6391 int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
f2d8d74d 6392 int num_stripes;
a236aed1 6393 int max_errors = 0;
2c8cdd6e 6394 int tgtdev_indexes = 0;
4c664611 6395 struct btrfs_io_context *bioc = NULL;
472262f3
SB
6396 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6397 int dev_replace_is_ongoing = 0;
6398 int num_alloc_stripes;
ad6d620e
SB
6399 int patch_the_first_stripe_for_dev_replace = 0;
6400 u64 physical_to_patch_in_first_stripe = 0;
53b381b3 6401 u64 raid56_full_stripe_start = (u64)-1;
89b798ad
NB
6402 struct btrfs_io_geometry geom;
6403
4c664611 6404 ASSERT(bioc_ret);
75fb2e9e 6405 ASSERT(op != BTRFS_MAP_DISCARD);
0b3d4cd3 6406
42034313
MR
6407 em = btrfs_get_chunk_map(fs_info, logical, *length);
6408 ASSERT(!IS_ERR(em));
6409
43c0d1a5 6410 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
89b798ad
NB
6411 if (ret < 0)
6412 return ret;
0b86a832 6413
95617d69 6414 map = em->map_lookup;
593060d7 6415
89b798ad 6416 *length = geom.len;
89b798ad
NB
6417 stripe_len = geom.stripe_len;
6418 stripe_nr = geom.stripe_nr;
6419 stripe_offset = geom.stripe_offset;
6420 raid56_full_stripe_start = geom.raid56_stripe_offset;
cff82672 6421 data_stripes = nr_data_stripes(map);
593060d7 6422
cb5583dd 6423 down_read(&dev_replace->rwsem);
472262f3 6424 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
53176dde
DS
6425 /*
6426 * Hold the semaphore for read during the whole operation, write is
6427 * requested at commit time but must wait.
6428 */
472262f3 6429 if (!dev_replace_is_ongoing)
cb5583dd 6430 up_read(&dev_replace->rwsem);
472262f3 6431
ad6d620e 6432 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
2b19a1fe 6433 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
5ab56090
LB
6434 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6435 dev_replace->srcdev->devid,
6436 &mirror_num,
6437 &physical_to_patch_in_first_stripe);
6438 if (ret)
ad6d620e 6439 goto out;
5ab56090
LB
6440 else
6441 patch_the_first_stripe_for_dev_replace = 1;
ad6d620e
SB
6442 } else if (mirror_num > map->num_stripes) {
6443 mirror_num = 0;
6444 }
6445
f2d8d74d 6446 num_stripes = 1;
cea9e445 6447 stripe_index = 0;
fce3bb9a 6448 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
47c5713f
DS
6449 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6450 &stripe_index);
de483734 6451 if (!need_full_stripe(op))
28e1cc7d 6452 mirror_num = 1;
c7369b3f 6453 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
de483734 6454 if (need_full_stripe(op))
f2d8d74d 6455 num_stripes = map->num_stripes;
2fff734f 6456 else if (mirror_num)
f188591e 6457 stripe_index = mirror_num - 1;
dfe25020 6458 else {
30d9861f 6459 stripe_index = find_live_mirror(fs_info, map, 0,
30d9861f 6460 dev_replace_is_ongoing);
a1d3c478 6461 mirror_num = stripe_index + 1;
dfe25020 6462 }
2fff734f 6463
611f0e00 6464 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
de483734 6465 if (need_full_stripe(op)) {
f2d8d74d 6466 num_stripes = map->num_stripes;
a1d3c478 6467 } else if (mirror_num) {
f188591e 6468 stripe_index = mirror_num - 1;
a1d3c478
JS
6469 } else {
6470 mirror_num = 1;
6471 }
2fff734f 6472
321aecc6 6473 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
9d644a62 6474 u32 factor = map->num_stripes / map->sub_stripes;
321aecc6 6475
47c5713f 6476 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
321aecc6
CM
6477 stripe_index *= map->sub_stripes;
6478
de483734 6479 if (need_full_stripe(op))
f2d8d74d 6480 num_stripes = map->sub_stripes;
321aecc6
CM
6481 else if (mirror_num)
6482 stripe_index += mirror_num - 1;
dfe25020 6483 else {
3e74317a 6484 int old_stripe_index = stripe_index;
30d9861f
SB
6485 stripe_index = find_live_mirror(fs_info, map,
6486 stripe_index,
30d9861f 6487 dev_replace_is_ongoing);
3e74317a 6488 mirror_num = stripe_index - old_stripe_index + 1;
dfe25020 6489 }
53b381b3 6490
ffe2d203 6491 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ff18a4af 6492 ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
de483734 6493 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
53b381b3 6494 /* push stripe_nr back to the start of the full stripe */
42c61ab6 6495 stripe_nr = div64_u64(raid56_full_stripe_start,
cff82672 6496 stripe_len * data_stripes);
53b381b3
DW
6497
6498 /* RAID[56] write or recovery. Return all stripes */
6499 num_stripes = map->num_stripes;
6dead96c 6500 max_errors = btrfs_chunk_max_errors(map);
53b381b3 6501
462b0b2a
QW
6502 /* Return the length to the full stripe end */
6503 *length = min(logical + *length,
6504 raid56_full_stripe_start + em->start +
6505 data_stripes * stripe_len) - logical;
53b381b3
DW
6506 stripe_index = 0;
6507 stripe_offset = 0;
6508 } else {
6509 /*
6510 * Mirror #0 or #1 means the original data block.
6511 * Mirror #2 is RAID5 parity block.
6512 * Mirror #3 is RAID6 Q block.
6513 */
47c5713f 6514 stripe_nr = div_u64_rem(stripe_nr,
cff82672 6515 data_stripes, &stripe_index);
53b381b3 6516 if (mirror_num > 1)
cff82672 6517 stripe_index = data_stripes + mirror_num - 2;
53b381b3
DW
6518
6519 /* We distribute the parity blocks across stripes */
47c5713f
DS
6520 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6521 &stripe_index);
de483734 6522 if (!need_full_stripe(op) && mirror_num <= 1)
28e1cc7d 6523 mirror_num = 1;
53b381b3 6524 }
8790d502
CM
6525 } else {
6526 /*
47c5713f
DS
6527 * after this, stripe_nr is the number of stripes on this
6528 * device we have to walk to find the data, and stripe_index is
6529 * the number of our device in the stripe array
8790d502 6530 */
47c5713f
DS
6531 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6532 &stripe_index);
a1d3c478 6533 mirror_num = stripe_index + 1;
8790d502 6534 }
e042d1ec 6535 if (stripe_index >= map->num_stripes) {
5d163e0e
JM
6536 btrfs_crit(fs_info,
6537 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
e042d1ec
JB
6538 stripe_index, map->num_stripes);
6539 ret = -EINVAL;
6540 goto out;
6541 }
cea9e445 6542
472262f3 6543 num_alloc_stripes = num_stripes;
6fad823f 6544 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
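 		/*
 		 * Writes are duplicated to the replace target, and
 		 * GET_READ_MIRRORS may report the target as one extra mirror.
 		 */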
0b3d4cd3 6545 if (op == BTRFS_MAP_WRITE)
ad6d620e 6546 num_alloc_stripes <<= 1;
cf8cddd3 6547 if (op == BTRFS_MAP_GET_READ_MIRRORS)
ad6d620e 6548 num_alloc_stripes++;
2c8cdd6e 6549 tgtdev_indexes = num_stripes;
ad6d620e 6550 }
2c8cdd6e 6551
03793cbb
CH
6552 /*
6553 * If this I/O maps to a single device, try to return the device and
6554 * physical block information on the stack instead of allocating an
6555 * I/O context structure.
6556 */
6557 if (smap && num_alloc_stripes == 1 &&
6558 !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
6559 (!need_full_stripe(op) || !dev_replace_is_ongoing ||
6560 !dev_replace->tgtdev)) {
6561 if (patch_the_first_stripe_for_dev_replace) {
6562 smap->dev = dev_replace->tgtdev;
6563 smap->physical = physical_to_patch_in_first_stripe;
6564 *mirror_num_ret = map->num_stripes + 1;
6565 } else {
6566 set_io_stripe(smap, map, stripe_index, stripe_offset,
6567 stripe_nr);
6568 *mirror_num_ret = mirror_num;
6569 }
6570 *bioc_ret = NULL;
6571 ret = 0;
6572 goto out;
6573 }
6574
731ccf15 6575 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
4c664611 6576 if (!bioc) {
de11cc12 6577 ret = -ENOMEM;
6578 goto out;
6579 }
608769a4 6580
6581 for (i = 0; i < num_stripes; i++) {
03793cbb 6582 set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
6583 stripe_nr);
608769a4 6584 stripe_index++;
6585 }
de11cc12 6586
4c664611 6587 /* Build raid_map */
2b19a1fe 6588 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6589 (need_full_stripe(op) || mirror_num > 1)) {
8e5cfb55 6590 u64 tmp;
9d644a62 6591 unsigned rot;
8e5cfb55 6592
8e5cfb55 6593 /* Work out the disk rotation on this stripe-set */
47c5713f 6594 div_u64_rem(stripe_nr, num_stripes, &rot);
8e5cfb55 6595
6596 /* Fill in the logical address of each stripe */
cff82672 6597 tmp = stripe_nr * data_stripes;
6598 for (i = 0; i < data_stripes; i++)
4c664611 6599 bioc->raid_map[(i + rot) % num_stripes] =
8e5cfb55 6600 em->start + (tmp + i) * map->stripe_len;
6601
4c664611 6602 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
8e5cfb55 6603 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4c664611 6604 bioc->raid_map[(i + rot + 1) % num_stripes] =
8e5cfb55 6605 RAID6_Q_STRIPE;
8e5cfb55 6606
4c664611 6607 sort_parity_stripes(bioc, num_stripes);
593060d7 6608 }
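/*
 * Worked example of the rotation above (illustrative): for RAID5 on 4
 * devices, num_stripes = 4 and data_stripes = 3. With stripe_nr = 5,
 * rot = 5 % 4 = 1, so raid_map[1..3] receive the logical addresses of
 * data stripes 0..2 and raid_map[(3 + 1) % 4] = raid_map[0] becomes
 * RAID5_P_STRIPE: the parity slot rotates by one for each full stripe.
 */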
de11cc12 6609
2b19a1fe 6610 if (need_full_stripe(op))
d20983b4 6611 max_errors = btrfs_chunk_max_errors(map);
de11cc12 6612
73c0f228 6613 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
2b19a1fe 6614 need_full_stripe(op)) {
4c664611 6615 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
6143c23c 6616 &num_stripes, &max_errors);
472262f3 6617 }
6618
4c664611 6619 *bioc_ret = bioc;
6620 bioc->map_type = map->type;
6621 bioc->num_stripes = num_stripes;
6622 bioc->max_errors = max_errors;
6623 bioc->mirror_num = mirror_num;
ad6d620e 6624
6625 /*
6626 * This is the case where REQ_READ && dev_replace_is_ongoing &&
6627 * mirror_num == num_stripes + 1 && dev_replace target drive is
6628 * available as a mirror
6629 */
6630 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6631 WARN_ON(num_stripes > 1);
4c664611 6632 bioc->stripes[0].dev = dev_replace->tgtdev;
6633 bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
6634 bioc->mirror_num = map->num_stripes + 1;
ad6d620e 6635 }
cea9e445 6636out:
73beece9 6637 if (dev_replace_is_ongoing) {
53176dde 6638 lockdep_assert_held(&dev_replace->rwsem);
6639 /* Unlock and let waiting writers proceed */
cb5583dd 6640 up_read(&dev_replace->rwsem);
73beece9 6641 }
0b86a832 6642 free_extent_map(em);
de11cc12 6643 return ret;
0b86a832 6644}
6645
cf8cddd3 6646int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
f2d8d74d 6647 u64 logical, u64 *length,
4c664611 6648 struct btrfs_io_context **bioc_ret, int mirror_num)
f2d8d74d 6649{
4c664611 6650 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
03793cbb 6651 NULL, &mirror_num, 0);
f2d8d74d 6652}
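/*
 * Minimal usage sketch (illustrative, not part of the original file):
 * mapping one sector of a logical range for a read. "logical", "first_dev"
 * and "first_physical" are placeholders and error handling is trimmed.
 *
 *	struct btrfs_io_context *bioc = NULL;
 *	u64 length = fs_info->sectorsize;
 *	int ret;
 *
 *	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &length,
 *			      &bioc, 0);
 *	if (!ret) {
 *		first_dev = bioc->stripes[0].dev;
 *		first_physical = bioc->stripes[0].physical;
 *		btrfs_put_bioc(bioc);
 *	}
 */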
6653
af8e2d1d 6654/* For Scrub/replace */
cf8cddd3 6655int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
af8e2d1d 6656 u64 logical, u64 *length,
4c664611 6657 struct btrfs_io_context **bioc_ret)
af8e2d1d 6658{
03793cbb 6659 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
6660 NULL, NULL, 1);
af8e2d1d 6661}
6662
d45cfb88 6663/*
6664 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
6665 * is already initialized by the block layer.
6666 */
917f32a2 6667static inline void btrfs_bio_init(struct btrfs_bio *bbio,
6668 btrfs_bio_end_io_t end_io, void *private)
d45cfb88 6669{
6670 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
917f32a2 6671 bbio->end_io = end_io;
6672 bbio->private = private;
d45cfb88 6673}
6674
6675/*
6676 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
6677 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
6678 *
6679 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
6680 * a mempool.
6681 */
917f32a2 6682struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
6683 btrfs_bio_end_io_t end_io, void *private)
d45cfb88 6684{
6685 struct bio *bio;
6686
6b42f5e3 6687 bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
917f32a2 6688 btrfs_bio_init(btrfs_bio(bio), end_io, private);
d45cfb88 6689 return bio;
6690}
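/*
 * Usage sketch (illustrative): allocating a single-vector read bbio and
 * handing it to btrfs_submit_bio(). my_end_io, my_ctx, page and logical
 * are caller-provided placeholders.
 *
 *	struct bio *bio;
 *
 *	bio = btrfs_bio_alloc(1, REQ_OP_READ, my_end_io, my_ctx);
 *	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(bio, page, PAGE_SIZE, 0);
 *	btrfs_submit_bio(fs_info, bio, 0);
 */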
6691
917f32a2 6692struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
6693 btrfs_bio_end_io_t end_io, void *private)
d45cfb88 6694{
6695 struct bio *bio;
6696 struct btrfs_bio *bbio;
6697
6698 ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
6699
6700 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
6701 bbio = btrfs_bio(bio);
917f32a2 6702 btrfs_bio_init(bbio, end_io, private);
d45cfb88 6703
6704 bio_trim(bio, offset >> 9, size >> 9);
6705 bbio->iter = bio->bi_iter;
6706 return bio;
c3a62baf 6707}
6708
6709static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
6710{
6711 if (!dev || !dev->bdev)
6712 return;
6713 if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
6714 return;
d45cfb88 6715
c3a62baf 6716 if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6717 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
6718 if (!(bio->bi_opf & REQ_RAHEAD))
6719 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
6720 if (bio->bi_opf & REQ_PREFLUSH)
6721 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
d45cfb88 6722}
6723
928ff3be 6724static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
6725 struct bio *bio)
d7b9416f 6726{
928ff3be 6727 if (bio->bi_opf & REQ_META)
6728 return fs_info->endio_meta_workers;
6729 return fs_info->endio_workers;
d7b9416f 6730}
6731
6732static void btrfs_end_bio_work(struct work_struct *work)
6733{
6734 struct btrfs_bio *bbio =
6735 container_of(work, struct btrfs_bio, end_io_work);
6736
917f32a2 6737 bbio->end_io(bbio);
d7b9416f 6738}
6739
928ff3be 6740static void btrfs_simple_end_io(struct bio *bio)
6741{
6742 struct btrfs_fs_info *fs_info = bio->bi_private;
6743 struct btrfs_bio *bbio = btrfs_bio(bio);
6744
6745 btrfs_bio_counter_dec(fs_info);
6746
6747 if (bio->bi_status)
6748 btrfs_log_dev_io_error(bio, bbio->device);
6749
6750 if (bio_op(bio) == REQ_OP_READ) {
6751 INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
6752 queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
6753 } else {
6754 bbio->end_io(bbio);
6755 }
6756}
6757
f1c29379 6758static void btrfs_raid56_end_io(struct bio *bio)
6759{
6760 struct btrfs_io_context *bioc = bio->bi_private;
6761 struct btrfs_bio *bbio = btrfs_bio(bio);
6762
6763 btrfs_bio_counter_dec(bioc->fs_info);
6764 bbio->mirror_num = bioc->mirror_num;
917f32a2 6765 bbio->end_io(bbio);
f1c29379 6766
6767 btrfs_put_bioc(bioc);
6768}
6769
928ff3be 6770static void btrfs_orig_write_end_io(struct bio *bio)
8408c716 6771{
c3a62baf 6772 struct btrfs_io_stripe *stripe = bio->bi_private;
6773 struct btrfs_io_context *bioc = stripe->bioc;
6774 struct btrfs_bio *bbio = btrfs_bio(bio);
326e1dbb 6775
2bbc72f1 6776 btrfs_bio_counter_dec(bioc->fs_info);
6777
c3a62baf 6778 if (bio->bi_status) {
6779 atomic_inc(&bioc->error);
6780 btrfs_log_dev_io_error(bio, stripe->dev);
6781 }
6782
b4c46bde 6783 /*
6784 * Only send an error to the higher layers if it is beyond the tolerance
6785 * threshold.
6786 */
6787 if (atomic_read(&bioc->error) > bioc->max_errors)
c3a62baf 6788 bio->bi_status = BLK_STS_IOERR;
b4c46bde 6789 else
c3a62baf 6790 bio->bi_status = BLK_STS_OK;
d7b9416f 6791
928ff3be 6792 bbio->end_io(bbio);
4c664611 6793 btrfs_put_bioc(bioc);
8408c716 6794}
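/*
 * Example of the tolerance check above (illustrative): for a RAID1 write
 * bioc->max_errors is 1, so a single failed copy still completes the
 * original bio with BLK_STS_OK; only a second failure propagates
 * BLK_STS_IOERR to the upper layers.
 */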
6795
c3a62baf 6796static void btrfs_clone_write_end_io(struct bio *bio)
8790d502 6797{
9ff7ddd3 6798 struct btrfs_io_stripe *stripe = bio->bi_private;
8790d502 6799
4e4cbee9 6800 if (bio->bi_status) {
c3a62baf 6801 atomic_inc(&stripe->bioc->error);
6802 btrfs_log_dev_io_error(bio, stripe->dev);
442a4f63 6803 }
8790d502 6804
c3a62baf 6805 /* Pass on control to the original bio this one was cloned from */
6806 bio_endio(stripe->bioc->orig_bio);
6807 bio_put(bio);
8790d502 6808}
6809
32747c44 6810static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
de1ee92a 6811{
c3a62baf 6812 if (!dev || !dev->bdev ||
6813 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
6814 (btrfs_op(bio) == BTRFS_MAP_WRITE &&
6815 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6816 bio_io_error(bio);
6817 return;
6818 }
6819
6820 bio_set_dev(bio, dev->bdev);
6821
d8e3fb10 6822 /*
6823 * For zone append writing, bi_sector must point to the beginning of the
6824 * zone
6825 */
6826 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
32747c44 6827 u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
6828
d8e3fb10 6829 if (btrfs_dev_is_sequential(dev, physical)) {
32747c44 6830 u64 zone_start = round_down(physical,
6831 dev->fs_info->zone_size);
d8e3fb10 6832
6833 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6834 } else {
6835 bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6836 bio->bi_opf |= REQ_OP_WRITE;
6837 }
6838 }
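/*
 * Worked example (illustrative, assuming a 256 MiB zone_size): a zone
 * append aimed at physical 0x10300000 on a sequential zone is redirected
 * to bi_sector = round_down(0x10300000, 0x10000000) >> SECTOR_SHIFT,
 * i.e. the zone start at 0x10000000; the device reports the actual write
 * position back through the bio on completion.
 */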
32747c44 6839 btrfs_debug_in_rcu(dev->fs_info,
1a722d8f 6840 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6841 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
1db45a35 6842 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6843 dev->devid, bio->bi_iter.bi_size);
c404e0dc 6844
58ff51f1 6845 btrfsic_check_bio(bio);
6846 submit_bio(bio);
de1ee92a 6847}
6848
928ff3be 6849static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
32747c44 6850{
28793b19 6851 struct bio *orig_bio = bioc->orig_bio, *bio;
32747c44 6852
928ff3be 6853 ASSERT(bio_op(orig_bio) != REQ_OP_READ);
6854
32747c44 6855 /* Reuse the bio embedded into the btrfs_bio for the last mirror */
28793b19 6856 if (dev_nr == bioc->num_stripes - 1) {
32747c44 6857 bio = orig_bio;
928ff3be 6858 bio->bi_end_io = btrfs_orig_write_end_io;
32747c44 6859 } else {
6860 bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
6861 bio_inc_remaining(orig_bio);
6862 bio->bi_end_io = btrfs_clone_write_end_io;
6863 }
6864
6865 bio->bi_private = &bioc->stripes[dev_nr];
6866 bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
6867 bioc->stripes[dev_nr].bioc = bioc;
6868 btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
6869}
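/*
 * Illustrative note: for a 3-copy profile such as RAID1C3, dev_nr 0 and 1
 * get clones of orig_bio, each paired with a bio_inc_remaining() call,
 * while dev_nr 2 reuses orig_bio itself. bio_endio() on orig_bio thus only
 * runs the final completion once both clones have dropped their remaining
 * references.
 */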
6870
1a722d8f 6871void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
0b86a832 6872{
1201b58b 6873 u64 logical = bio->bi_iter.bi_sector << 9;
a316a259 6874 u64 length = bio->bi_iter.bi_size;
6875 u64 map_length = length;
4c664611 6876 struct btrfs_io_context *bioc = NULL;
928ff3be 6877 struct btrfs_io_stripe smap;
6878 int ret;
0b86a832 6879
0b246afa 6880 btrfs_bio_counter_inc_blocked(fs_info);
03793cbb 6881 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
928ff3be 6882 &bioc, &smap, &mirror_num, 1);
b9af128d 6883 if (ret) {
6884 btrfs_bio_counter_dec(fs_info);
917f32a2 6885 btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
b9af128d 6886 return;
6887 }
cea9e445 6888
cea9e445 6889 if (map_length < length) {
0b246afa 6890 btrfs_crit(fs_info,
5d163e0e 6891 "mapping failed logical %llu bio len %llu len %llu",
6892 logical, length, map_length);
cea9e445 6893 BUG();
6894 }
a1d3c478 6895
928ff3be 6896 if (!bioc) {
6897 /* Single mirror read/write fast path */
6898 btrfs_bio(bio)->mirror_num = mirror_num;
6899 btrfs_bio(bio)->device = smap.dev;
6900 bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
6901 bio->bi_private = fs_info;
6902 bio->bi_end_io = btrfs_simple_end_io;
6903 btrfs_submit_dev_bio(smap.dev, bio);
6904 } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6905 /* Parity RAID write or read recovery */
6906 bio->bi_private = bioc;
6907 bio->bi_end_io = btrfs_raid56_end_io;
6908 if (bio_op(bio) == REQ_OP_READ)
6909 raid56_parity_recover(bio, bioc, mirror_num);
6910 else
6911 raid56_parity_write(bio, bioc);
6912 } else {
6913 /* Write to multiple mirrors */
6914 int total_devs = bioc->num_stripes;
6915 int dev_nr;
6916
6917 bioc->orig_bio = bio;
6918 for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
6919 btrfs_submit_mirrored_bio(bioc, dev_nr);
6920 }
0b86a832 6921}
6922
562d7b15 6923static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6924 const struct btrfs_fs_devices *fs_devices)
6925{
6926 if (args->fsid == NULL)
6927 return true;
6928 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6929 return true;
6930 return false;
6931}
6932
6933static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6934 const struct btrfs_device *device)
6935{
0fca385d 6936 if (args->missing) {
6937 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6938 !device->bdev)
6939 return true;
6940 return false;
6941 }
562d7b15 6942
0fca385d 6943 if (device->devid != args->devid)
562d7b15 6944 return false;
6945 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
6946 return false;
0fca385d 6947 return true;
562d7b15 6948}
6949
09ba3bc9 6950/*
6951 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6952 * return NULL.
6953 *
6954 * If devid and uuid are both specified, the match must be exact, otherwise
6955 * only devid is used.
09ba3bc9 6956 */
562d7b15 6957struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
6958 const struct btrfs_dev_lookup_args *args)
0b86a832 6959{
2b82032c 6960 struct btrfs_device *device;
944d3f9f 6961 struct btrfs_fs_devices *seed_devs;
6962
562d7b15 6963 if (dev_args_match_fs_devices(args, fs_devices)) {
944d3f9f 6964 list_for_each_entry(device, &fs_devices->devices, dev_list) {
562d7b15 6965 if (dev_args_match_device(args, device))
944d3f9f 6966 return device;
6967 }
6968 }
2b82032c 6969
944d3f9f 6970 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
562d7b15 6971 if (!dev_args_match_fs_devices(args, seed_devs))
6972 continue;
6973 list_for_each_entry(device, &seed_devs->devices, dev_list) {
6974 if (dev_args_match_device(args, device))
6975 return device;
2b82032c 6976 }
2b82032c 6977 }
944d3f9f 6978
2b82032c 6979 return NULL;
0b86a832 6980}
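/*
 * Usage sketch (illustrative): a lookup by devid alone, then an exact
 * lookup by devid plus uuid; some_uuid is a placeholder. Callers normally
 * hold device_list_mutex around the lookup.
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *	struct btrfs_device *dev;
 *
 *	args.devid = 1;
 *	dev = btrfs_find_device(fs_info->fs_devices, &args);
 *
 *	args.uuid = some_uuid;
 *	dev = btrfs_find_device(fs_info->fs_devices, &args);
 */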
6981
2ff7e61e 6982static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
dfe25020 6983 u64 devid, u8 *dev_uuid)
6984{
6985 struct btrfs_device *device;
fccc0007 6986 unsigned int nofs_flag;
dfe25020 6987
fccc0007 6988 /*
6989 * We call this under the chunk_mutex, so we want to use NOFS for this
6990 * allocation, however we don't want to change btrfs_alloc_device() to
6991 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6992 * places.
6993 */
6994 nofs_flag = memalloc_nofs_save();
12bd2fc0 6995 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
fccc0007 6996 memalloc_nofs_restore(nofs_flag);
12bd2fc0 6997 if (IS_ERR(device))
adfb69af 6998 return device;
12bd2fc0 6999
7000 list_add(&device->dev_list, &fs_devices->devices);
e4404d6e 7001 device->fs_devices = fs_devices;
dfe25020 7002 fs_devices->num_devices++;
12bd2fc0 7003
e6e674bd 7004 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
cd02dca5 7005 fs_devices->missing_devices++;
12bd2fc0 7006
dfe25020 7007 return device;
7008}
7009
12bd2fc0 7010/**
7011 * btrfs_alloc_device - allocate struct btrfs_device
7012 * @fs_info: used only for generating a new devid, can be NULL if
7013 * devid is provided (i.e. @devid != NULL).
7014 * @devid: a pointer to devid for this device. If NULL a new devid
7015 * is generated.
7016 * @uuid: a pointer to UUID for this device. If NULL a new UUID
7017 * is generated.
7018 *
7019 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
48dae9cf 7020 * on error. Returned struct is not linked onto any lists and must be
a425f9d4 7021 * destroyed with btrfs_free_device.
12bd2fc0 7022 */
7023struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
7024 const u64 *devid,
7025 const u8 *uuid)
7026{
7027 struct btrfs_device *dev;
7028 u64 tmp;
7029
fae7f21c 7030 if (WARN_ON(!devid && !fs_info))
12bd2fc0 7031 return ERR_PTR(-EINVAL);
12bd2fc0 7032
fe4f46d4 7033 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
7034 if (!dev)
7035 return ERR_PTR(-ENOMEM);
7036
fe4f46d4 7037 INIT_LIST_HEAD(&dev->dev_list);
7038 INIT_LIST_HEAD(&dev->dev_alloc_list);
7039 INIT_LIST_HEAD(&dev->post_commit_list);
7040
fe4f46d4 7041 atomic_set(&dev->dev_stats_ccnt, 0);
7042 btrfs_device_data_ordered_init(dev);
fe4f46d4 7043 extent_io_tree_init(fs_info, &dev->alloc_state,
7044 IO_TREE_DEVICE_ALLOC_STATE, NULL);
12bd2fc0 7045
7046 if (devid)
7047 tmp = *devid;
7048 else {
7049 int ret;
7050
7051 ret = find_next_devid(fs_info, &tmp);
7052 if (ret) {
a425f9d4 7053 btrfs_free_device(dev);
12bd2fc0 7054 return ERR_PTR(ret);
7055 }
7056 }
7057 dev->devid = tmp;
7058
7059 if (uuid)
7060 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
7061 else
7062 generate_random_uuid(dev->uuid);
7063
12bd2fc0 7064 return dev;
7065}
7066
5a2b8e60 7067static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
2b902dfc 7068 u64 devid, u8 *uuid, bool error)
5a2b8e60 7069{
2b902dfc 7070 if (error)
7071 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
7072 devid, uuid);
7073 else
7074 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
7075 devid, uuid);
5a2b8e60 7076}
7077
bc88b486 7078u64 btrfs_calc_stripe_length(const struct extent_map *em)
39e264a4 7079{
bc88b486 7080 const struct map_lookup *map = em->map_lookup;
7081 const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
e4f6c6be 7082
bc88b486 7083 return div_u64(em->len, data_stripes);
39e264a4 7084}
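/*
 * Worked example (illustrative): a 2 GiB RAID0 chunk over two devices has
 * data_stripes = 2, so each device holds a 1 GiB stripe, while a 1 GiB
 * RAID1 chunk has data_stripes = 1 and each mirror holds the full 1 GiB.
 */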
7085
e9306ad4 7086#if BITS_PER_LONG == 32
7087/*
7088 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
7089 * can't be accessed on 32bit systems.
7090 *
7091 * This function does a mount time check to reject the fs if it already has
7092 * a metadata chunk beyond that limit.
7093 */
7094static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
7095 u64 logical, u64 length, u64 type)
7096{
7097 if (!(type & BTRFS_BLOCK_GROUP_METADATA))
7098 return 0;
7099
7100 if (logical + length < MAX_LFS_FILESIZE)
7101 return 0;
7102
7103 btrfs_err_32bit_limit(fs_info);
7104 return -EOVERFLOW;
7105}
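/*
 * Illustrative note (assuming 4 KiB pages): MAX_LFS_FILESIZE works out to
 * roughly 16 TiB on 32bit, so a metadata chunk whose logical end crosses
 * that mark fails the mount here, while warn_32bit_meta_chunk() below only
 * warns once the early warning threshold is reached.
 */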
7106
7107/*
7108 * This is to give an early warning for any metadata chunk reaching
7109 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
7110 * Although we can still access the metadata now, it will no longer be
7111 * accessible once the limit is reached.
7112 */
7113static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
7114 u64 logical, u64 length, u64 type)
7115{
7116 if (!(type & BTRFS_BLOCK_GROUP_METADATA))
7117 return;
7118
7119 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
7120 return;
7121
7122 btrfs_warn_32bit_limit(fs_info);
7123}
7124#endif
7125
ff37c89f 7126static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
7127 u64 devid, u8 *uuid)
7128{
7129 struct btrfs_device *dev;
7130
7131 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7132 btrfs_report_missing_device(fs_info, devid, uuid, true);
7133 return ERR_PTR(-ENOENT);
7134 }
7135
7136 dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
7137 if (IS_ERR(dev)) {
7138 btrfs_err(fs_info, "failed to init missing device %llu: %ld",
7139 devid, PTR_ERR(dev));
7140 return dev;
7141 }
7142 btrfs_report_missing_device(fs_info, devid, uuid, false);
7143
7144 return dev;
7145}
7146
9690ac09 7147static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
e06cd3dd 7148 struct btrfs_chunk *chunk)
7149{
562d7b15 7150 BTRFS_DEV_LOOKUP_ARGS(args);
9690ac09 7151 struct btrfs_fs_info *fs_info = leaf->fs_info;
c8bf1b67 7152 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
e06cd3dd 7153 struct map_lookup *map;
7154 struct extent_map *em;
7155 u64 logical;
7156 u64 length;
e06cd3dd 7157 u64 devid;
e9306ad4 7158 u64 type;
e06cd3dd 7159 u8 uuid[BTRFS_UUID_SIZE];
76a66ba1 7160 int index;
e06cd3dd 7161 int num_stripes;
7162 int ret;
7163 int i;
7164
7165 logical = key->offset;
7166 length = btrfs_chunk_length(leaf, chunk);
e9306ad4 7167 type = btrfs_chunk_type(leaf, chunk);
76a66ba1 7168 index = btrfs_bg_flags_to_raid_index(type);
e06cd3dd 7169 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
7170
e9306ad4 7171#if BITS_PER_LONG == 32
7172 ret = check_32bit_meta_chunk(fs_info, logical, length, type);
7173 if (ret < 0)
7174 return ret;
7175 warn_32bit_meta_chunk(fs_info, logical, length, type);
7176#endif
7177
075cb3c7 7178 /*
7179 * Only need to verify chunk item if we're reading from sys chunk array,
7180 * as chunk item in tree block is already verified by tree-checker.
7181 */
7182 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
ddaf1d5a 7183 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
075cb3c7 7184 if (ret)
7185 return ret;
7186 }
a061fc8d 7187
c8bf1b67 7188 read_lock(&map_tree->lock);
7189 em = lookup_extent_mapping(map_tree, logical, 1);
7190 read_unlock(&map_tree->lock);
0b86a832 7191
7192 /* already mapped? */
7193 if (em && em->start <= logical && em->start + em->len > logical) {
7194 free_extent_map(em);
0b86a832 7195 return 0;
7196 } else if (em) {
7197 free_extent_map(em);
7198 }
0b86a832 7199
172ddd60 7200 em = alloc_extent_map();
0b86a832 7201 if (!em)
7202 return -ENOMEM;
593060d7 7203 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
0b86a832 7204 if (!map) {
7205 free_extent_map(em);
7206 return -ENOMEM;
7207 }
7208
298a8f9c 7209 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
95617d69 7210 em->map_lookup = map;
0b86a832 7211 em->start = logical;
7212 em->len = length;
70c8a91c 7213 em->orig_start = 0;
0b86a832 7214 em->block_start = 0;
c8b97818 7215 em->block_len = em->len;
0b86a832 7216
593060d7 7217 map->num_stripes = num_stripes;
7218 map->io_width = btrfs_chunk_io_width(leaf, chunk);
7219 map->io_align = btrfs_chunk_io_align(leaf, chunk);
593060d7 7220 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
e9306ad4 7221 map->type = type;
76a66ba1 7222 /*
7223 * We can't use the sub_stripes value, as for profiles other than
7224 * RAID10, they may have 0 as sub_stripes for filesystems created by
7225 * older mkfs (<v5.4).
7226 * In that case, it can cause divide-by-zero errors later.
7227 * Since currently sub_stripes is fixed for each profile, let's
7228 * use the trusted value instead.
7229 */
7230 map->sub_stripes = btrfs_raid_array[index].sub_stripes;
cf90d884 7231 map->verified_stripes = 0;
bc88b486 7232 em->orig_block_len = btrfs_calc_stripe_length(em);
593060d7 7233 for (i = 0; i < num_stripes; i++) {
7234 map->stripes[i].physical =
7235 btrfs_stripe_offset_nr(leaf, chunk, i);
7236 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
562d7b15 7237 args.devid = devid;
a443755f 7238 read_extent_buffer(leaf, uuid, (unsigned long)
7239 btrfs_stripe_dev_uuid_nr(chunk, i),
7240 BTRFS_UUID_SIZE);
562d7b15 7241 args.uuid = uuid;
7242 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
dfe25020 7243 if (!map->stripes[i].dev) {
ff37c89f 7244 map->stripes[i].dev = handle_missing_device(fs_info,
7245 devid, uuid);
adfb69af 7246 if (IS_ERR(map->stripes[i].dev)) {
dfe25020 7247 free_extent_map(em);
adfb69af 7248 return PTR_ERR(map->stripes[i].dev);
dfe25020 7249 }
7250 }
ff37c89f 7251
e12c9621 7252 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
7253 &(map->stripes[i].dev->dev_state));
0b86a832 7254 }
7255
c8bf1b67 7256 write_lock(&map_tree->lock);
7257 ret = add_extent_mapping(map_tree, em, 0);
7258 write_unlock(&map_tree->lock);
64f64f43 7259 if (ret < 0) {
7260 btrfs_err(fs_info,
7261 "failed to add chunk map, start=%llu len=%llu: %d",
7262 em->start, em->len, ret);
7263 }
0b86a832 7264 free_extent_map(em);
7265
64f64f43 7266 return ret;
0b86a832 7267}
7268
143bede5 7269static void fill_device_from_item(struct extent_buffer *leaf,
0b86a832
CM
7270 struct btrfs_dev_item *dev_item,
7271 struct btrfs_device *device)
7272{
7273 unsigned long ptr;
0b86a832
CM
7274
7275 device->devid = btrfs_device_id(leaf, dev_item);
d6397bae
CB
7276 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7277 device->total_bytes = device->disk_total_bytes;
935e5cc9 7278 device->commit_total_bytes = device->disk_total_bytes;
0b86a832 7279 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
ce7213c7 7280 device->commit_bytes_used = device->bytes_used;
0b86a832
CM
7281 device->type = btrfs_device_type(leaf, dev_item);
7282 device->io_align = btrfs_device_io_align(leaf, dev_item);
7283 device->io_width = btrfs_device_io_width(leaf, dev_item);
7284 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
8dabb742 7285 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
401e29c1 7286 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
0b86a832 7287
410ba3a2 7288 ptr = btrfs_device_uuid(dev_item);
e17cade2 7289 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
0b86a832
CM
7290}
7291
2ff7e61e 7292static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
5f375835 7293 u8 *fsid)
2b82032c
YZ
7294{
7295 struct btrfs_fs_devices *fs_devices;
7296 int ret;
7297
a32bf9a3 7298 lockdep_assert_held(&uuid_mutex);
2dfeca9b 7299 ASSERT(fsid);
2b82032c 7300
427c8fdd 7301 /* This will match only for multi-device seed fs */
944d3f9f 7302 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
44880fdc 7303 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
5f375835
MX
7304 return fs_devices;
7305
2b82032c 7306
7239ff4b 7307 fs_devices = find_fsid(fsid, NULL);
2b82032c 7308 if (!fs_devices) {
0b246afa 7309 if (!btrfs_test_opt(fs_info, DEGRADED))
5f375835
MX
7310 return ERR_PTR(-ENOENT);
7311
7239ff4b 7312 fs_devices = alloc_fs_devices(fsid, NULL);
5f375835
MX
7313 if (IS_ERR(fs_devices))
7314 return fs_devices;
7315
0395d84f 7316 fs_devices->seeding = true;
5f375835
MX
7317 fs_devices->opened = 1;
7318 return fs_devices;
2b82032c 7319 }
e4404d6e 7320
427c8fdd 7321 /*
7322 * Upon first call for a seed fs fsid, just create a private copy of the
7323 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
7324 */
e4404d6e 7325 fs_devices = clone_fs_devices(fs_devices);
5f375835
MX
7326 if (IS_ERR(fs_devices))
7327 return fs_devices;
2b82032c 7328
897fb573 7329 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
48d28232
JL
7330 if (ret) {
7331 free_fs_devices(fs_devices);
c83b60c0 7332 return ERR_PTR(ret);
48d28232 7333 }
2b82032c
YZ
7334
7335 if (!fs_devices->seeding) {
0226e0eb 7336 close_fs_devices(fs_devices);
e4404d6e 7337 free_fs_devices(fs_devices);
c83b60c0 7338 return ERR_PTR(-EINVAL);
2b82032c
YZ
7339 }
7340
944d3f9f 7341 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
c83b60c0 7342
5f375835 7343 return fs_devices;
2b82032c
YZ
7344}
7345
17850759 7346static int read_one_dev(struct extent_buffer *leaf,
0b86a832
CM
7347 struct btrfs_dev_item *dev_item)
7348{
562d7b15 7349 BTRFS_DEV_LOOKUP_ARGS(args);
17850759 7350 struct btrfs_fs_info *fs_info = leaf->fs_info;
0b246afa 7351 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
0b86a832
CM
7352 struct btrfs_device *device;
7353 u64 devid;
7354 int ret;
44880fdc 7355 u8 fs_uuid[BTRFS_FSID_SIZE];
a443755f
CM
7356 u8 dev_uuid[BTRFS_UUID_SIZE];
7357
c1867eb3
DS
7358 devid = btrfs_device_id(leaf, dev_item);
7359 args.devid = devid;
410ba3a2 7360 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
a443755f 7361 BTRFS_UUID_SIZE);
1473b24e 7362 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
44880fdc 7363 BTRFS_FSID_SIZE);
562d7b15
JB
7364 args.uuid = dev_uuid;
7365 args.fsid = fs_uuid;
2b82032c 7366
de37aa51 7367 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
2ff7e61e 7368 fs_devices = open_seed_devices(fs_info, fs_uuid);
5f375835
MX
7369 if (IS_ERR(fs_devices))
7370 return PTR_ERR(fs_devices);
2b82032c
YZ
7371 }
7372
562d7b15 7373 device = btrfs_find_device(fs_info->fs_devices, &args);
5f375835 7374 if (!device) {
c5502451 7375 if (!btrfs_test_opt(fs_info, DEGRADED)) {
2b902dfc
AJ
7376 btrfs_report_missing_device(fs_info, devid,
7377 dev_uuid, true);
45dbdbc9 7378 return -ENOENT;
c5502451 7379 }
2b82032c 7380
2ff7e61e 7381 device = add_missing_dev(fs_devices, devid, dev_uuid);
adfb69af
AJ
7382 if (IS_ERR(device)) {
7383 btrfs_err(fs_info,
7384 "failed to add missing dev %llu: %ld",
7385 devid, PTR_ERR(device));
7386 return PTR_ERR(device);
7387 }
2b902dfc 7388 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
5f375835 7389 } else {
c5502451 7390 if (!device->bdev) {
2b902dfc
AJ
7391 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7392 btrfs_report_missing_device(fs_info,
7393 devid, dev_uuid, true);
45dbdbc9 7394 return -ENOENT;
2b902dfc
AJ
7395 }
7396 btrfs_report_missing_device(fs_info, devid,
7397 dev_uuid, false);
c5502451 7398 }
5f375835 7399
e6e674bd
AJ
7400 if (!device->bdev &&
7401 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
cd02dca5 7402 /*
7403 * This happens when a device that was properly set up
7404 * in the device info lists suddenly goes bad.
7405 * device->bdev is NULL, so we have to set
7406 * the BTRFS_DEV_STATE_MISSING bit here
7407 */
5f375835 7408 device->fs_devices->missing_devices++;
e6e674bd 7409 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
2b82032c 7410 }
5f375835
MX
7411
7412 /* Move the device to its own fs_devices */
7413 if (device->fs_devices != fs_devices) {
e6e674bd
AJ
7414 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7415 &device->dev_state));
5f375835
MX
7416
7417 list_move(&device->dev_list, &fs_devices->devices);
7418 device->fs_devices->num_devices--;
7419 fs_devices->num_devices++;
7420
7421 device->fs_devices->missing_devices--;
7422 fs_devices->missing_devices++;
7423
7424 device->fs_devices = fs_devices;
7425 }
2b82032c
YZ
7426 }
7427
0b246afa 7428 if (device->fs_devices != fs_info->fs_devices) {
ebbede42 7429 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
2b82032c
YZ
7430 if (device->generation !=
7431 btrfs_device_generation(leaf, dev_item))
7432 return -EINVAL;
6324fbf3 7433 }
0b86a832
CM
7434
7435 fill_device_from_item(leaf, dev_item, device);
3a160a93 7436 if (device->bdev) {
cda00eba 7437 u64 max_total_bytes = bdev_nr_bytes(device->bdev);
3a160a93
AJ
7438
7439 if (device->total_bytes > max_total_bytes) {
7440 btrfs_err(fs_info,
7441 "device total_bytes should be at most %llu but found %llu",
7442 max_total_bytes, device->total_bytes);
7443 return -EINVAL;
7444 }
7445 }
e12c9621 7446 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
ebbede42 7447 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
401e29c1 7448 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2b82032c 7449 device->fs_devices->total_rw_bytes += device->total_bytes;
a5ed45f8
NB
7450 atomic64_add(device->total_bytes - device->bytes_used,
7451 &fs_info->free_chunk_space);
2bf64758 7452 }
0b86a832 7453 ret = 0;
0b86a832
CM
7454 return ret;
7455}
7456
6bccf3ab 7457int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
0b86a832 7458{
ab8d0fc4 7459 struct btrfs_super_block *super_copy = fs_info->super_copy;
a061fc8d 7460 struct extent_buffer *sb;
0b86a832 7461 struct btrfs_disk_key *disk_key;
0b86a832 7462 struct btrfs_chunk *chunk;
1ffb22cf
DS
7463 u8 *array_ptr;
7464 unsigned long sb_array_offset;
84eed90f 7465 int ret = 0;
0b86a832
CM
7466 u32 num_stripes;
7467 u32 array_size;
7468 u32 len = 0;
1ffb22cf 7469 u32 cur_offset;
e06cd3dd 7470 u64 type;
84eed90f 7471 struct btrfs_key key;
0b86a832 7472
0b246afa 7473 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
e959d3c1 7474
a83fffb7 7475 /*
e959d3c1 7476 * We allocated a dummy extent, just to use extent buffer accessors.
7477 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
7478 * that's fine, we will not go beyond the system chunk array anyway.
a83fffb7 7479 */
e959d3c1
QW
7480 sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
7481 if (!sb)
7482 return -ENOMEM;
4db8c528 7483 set_extent_buffer_uptodate(sb);
4008c04a 7484
a061fc8d 7485 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
0b86a832
CM
7486 array_size = btrfs_super_sys_array_size(super_copy);
7487
1ffb22cf
DS
7488 array_ptr = super_copy->sys_chunk_array;
7489 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7490 cur_offset = 0;
0b86a832 7491
1ffb22cf
DS
7492 while (cur_offset < array_size) {
7493 disk_key = (struct btrfs_disk_key *)array_ptr;
e3540eab
DS
7494 len = sizeof(*disk_key);
7495 if (cur_offset + len > array_size)
7496 goto out_short_read;
7497
0b86a832
CM
7498 btrfs_disk_key_to_cpu(&key, disk_key);
7499
1ffb22cf
DS
7500 array_ptr += len;
7501 sb_array_offset += len;
7502 cur_offset += len;
0b86a832 7503
32ab3d1b
JT
7504 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7505 btrfs_err(fs_info,
7506 "unexpected item type %u in sys_array at offset %u",
7507 (u32)key.type, cur_offset);
7508 ret = -EIO;
7509 break;
7510 }
f5cdedd7 7511
32ab3d1b
JT
7512 chunk = (struct btrfs_chunk *)sb_array_offset;
7513 /*
7514 * At least one btrfs_chunk with one stripe must be present,
7515 * exact stripe count check comes afterwards
7516 */
7517 len = btrfs_chunk_item_size(1);
7518 if (cur_offset + len > array_size)
7519 goto out_short_read;
e06cd3dd 7520
32ab3d1b
JT
7521 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7522 if (!num_stripes) {
7523 btrfs_err(fs_info,
7524 "invalid number of stripes %u in sys_array at offset %u",
7525 num_stripes, cur_offset);
7526 ret = -EIO;
7527 break;
7528 }
e3540eab 7529
32ab3d1b
JT
7530 type = btrfs_chunk_type(sb, chunk);
7531 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
ab8d0fc4 7532 btrfs_err(fs_info,
32ab3d1b
JT
7533 "invalid chunk type %llu in sys_array at offset %u",
7534 type, cur_offset);
84eed90f
CM
7535 ret = -EIO;
7536 break;
0b86a832 7537 }
32ab3d1b
JT
7538
7539 len = btrfs_chunk_item_size(num_stripes);
7540 if (cur_offset + len > array_size)
7541 goto out_short_read;
7542
7543 ret = read_one_chunk(&key, sb, chunk);
7544 if (ret)
7545 break;
7546
1ffb22cf
DS
7547 array_ptr += len;
7548 sb_array_offset += len;
7549 cur_offset += len;
0b86a832 7550 }
d865177a 7551 clear_extent_buffer_uptodate(sb);
1c8b5b6e 7552 free_extent_buffer_stale(sb);
84eed90f 7553 return ret;
e3540eab
DS
7554
7555out_short_read:
ab8d0fc4 7556 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
e3540eab 7557 len, cur_offset);
d865177a 7558 clear_extent_buffer_uptodate(sb);
1c8b5b6e 7559 free_extent_buffer_stale(sb);
e3540eab 7560 return -EIO;
0b86a832
CM
7561}
7562
21634a19
QW
7563/*
7564 * Check if all chunks in the fs are OK for read-write degraded mount
7565 *
6528b99d
AJ
7566 * If the @failing_dev is specified, it's accounted as missing.
7567 *
21634a19
QW
7568 * Return true if all chunks meet the minimal RW mount requirements.
7569 * Return false if any chunk doesn't meet the minimal RW mount requirements.
7570 */
6528b99d
AJ
7571bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7572 struct btrfs_device *failing_dev)
21634a19 7573{
c8bf1b67 7574 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
21634a19
QW
7575 struct extent_map *em;
7576 u64 next_start = 0;
7577 bool ret = true;
7578
c8bf1b67
DS
7579 read_lock(&map_tree->lock);
7580 em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7581 read_unlock(&map_tree->lock);
21634a19
QW
7582 /* No chunk at all? Return false anyway */
7583 if (!em) {
7584 ret = false;
7585 goto out;
7586 }
7587 while (em) {
7588 struct map_lookup *map;
7589 int missing = 0;
7590 int max_tolerated;
7591 int i;
7592
7593 map = em->map_lookup;
7594 max_tolerated =
7595 btrfs_get_num_tolerated_disk_barrier_failures(
7596 map->type);
7597 for (i = 0; i < map->num_stripes; i++) {
7598 struct btrfs_device *dev = map->stripes[i].dev;
7599
e6e674bd
AJ
7600 if (!dev || !dev->bdev ||
7601 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
21634a19
QW
7602 dev->last_flush_error)
7603 missing++;
6528b99d
AJ
7604 else if (failing_dev && failing_dev == dev)
7605 missing++;
21634a19
QW
7606 }
7607 if (missing > max_tolerated) {
6528b99d
AJ
7608 if (!failing_dev)
7609 btrfs_warn(fs_info,
52042d8e 7610 "chunk %llu missing %d devices, max tolerance is %d for writable mount",
21634a19
QW
7611 em->start, missing, max_tolerated);
7612 free_extent_map(em);
7613 ret = false;
7614 goto out;
7615 }
7616 next_start = extent_map_end(em);
7617 free_extent_map(em);
7618
c8bf1b67
DS
7619 read_lock(&map_tree->lock);
7620 em = lookup_extent_mapping(map_tree, next_start,
21634a19 7621 (u64)(-1) - next_start);
c8bf1b67 7622 read_unlock(&map_tree->lock);
21634a19
QW
7623 }
7624out:
7625 return ret;
7626}
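/*
 * Example (illustrative): a RAID1 chunk tolerates one missing device, so a
 * two-device filesystem with one dead disk still passes this check, while
 * a RAID0 chunk (max_tolerated = 0) with any stripe on the dead disk fails
 * it and blocks the writable degraded mount.
 */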
7627
d85327b1 7628static void readahead_tree_node_children(struct extent_buffer *node)
7629{
7630 int i;
7631 const int nr_items = btrfs_header_nritems(node);
7632
bfb484d9
JB
7633 for (i = 0; i < nr_items; i++)
7634 btrfs_readahead_node_child(node, i);
d85327b1
DS
7635}
7636
5b4aacef 7637int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
0b86a832 7638{
5b4aacef 7639 struct btrfs_root *root = fs_info->chunk_root;
0b86a832
CM
7640 struct btrfs_path *path;
7641 struct extent_buffer *leaf;
7642 struct btrfs_key key;
7643 struct btrfs_key found_key;
7644 int ret;
7645 int slot;
43cb1478 7646 int iter_ret = 0;
99e3ecfc 7647 u64 total_dev = 0;
d85327b1 7648 u64 last_ra_node = 0;
0b86a832 7649
0b86a832
CM
7650 path = btrfs_alloc_path();
7651 if (!path)
7652 return -ENOMEM;
7653
3dd0f7a3 7654 /*
7655 * uuid_mutex is needed only if we are mounting a sprout FS
7656 * otherwise we don't need it.
7657 */
b367e47f 7658 mutex_lock(&uuid_mutex);
b367e47f 7659
48cfa61b 7660 /*
7661 * It is possible for mount and umount to race in such a way that
7662 * we execute this code path, but open_fs_devices failed to clear
7663 * total_rw_bytes. We certainly want it cleared before reading the
7664 * device items, so clear it here.
7665 */
7666 fs_info->fs_devices->total_rw_bytes = 0;
7667
4d9380e0 7668 /*
7669 * Lockdep complains about possible circular locking dependency between
7670 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7671 * used for freeze protection of a fs (struct super_block.s_writers),
7672 * which we take when starting a transaction, and extent buffers of the
7673 * chunk tree if we call read_one_dev() while holding a lock on an
7674 * extent buffer of the chunk tree. Since we are mounting the filesystem
7675 * and at this point there can't be any concurrent task modifying the
7676 * chunk tree, to keep it simple, just skip locking on the chunk tree.
7677 */
7678 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7679 path->skip_locking = 1;
7680
395927a9 7681 /*
7682 * Read all device items, and then all the chunk items. All
7683 * device items are found before any chunk item (their object id
7684 * is smaller than the lowest possible object id for a chunk
7685 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
0b86a832 7686 */
7687 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7688 key.offset = 0;
7689 key.type = 0;
43cb1478
GN
7690 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
7691 struct extent_buffer *node = path->nodes[1];
d85327b1 7692
0b86a832
CM
7693 leaf = path->nodes[0];
7694 slot = path->slots[0];
43cb1478 7695
d85327b1
DS
7696 if (node) {
7697 if (last_ra_node != node->start) {
7698 readahead_tree_node_children(node);
7699 last_ra_node = node->start;
7700 }
7701 }
395927a9
FDBM
7702 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7703 struct btrfs_dev_item *dev_item;
7704 dev_item = btrfs_item_ptr(leaf, slot,
0b86a832 7705 struct btrfs_dev_item);
17850759 7706 ret = read_one_dev(leaf, dev_item);
395927a9
FDBM
7707 if (ret)
7708 goto error;
99e3ecfc 7709 total_dev++;
0b86a832
CM
7710 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7711 struct btrfs_chunk *chunk;
79bd3712
FM
7712
7713 /*
7714 * We are only called at mount time, so no need to take
7715 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7716 * we always lock first fs_info->chunk_mutex before
7717 * acquiring any locks on the chunk tree. This is a
7718 * requirement for chunk allocation, see the comment on
7719 * top of btrfs_chunk_alloc() for details.
7720 */
0b86a832 7721 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
9690ac09 7722 ret = read_one_chunk(&found_key, leaf, chunk);
2b82032c
YZ
7723 if (ret)
7724 goto error;
0b86a832 7725 }
43cb1478
GN
7726 }
7727 /* Catch error found during iteration */
7728 if (iter_ret < 0) {
7729 ret = iter_ret;
7730 goto error;
0b86a832 7731 }
99e3ecfc
LB
7732
7733 /*
7734 * After loading chunk tree, we've got all device information,
7735 * do another round of validation checks.
7736 */
0b246afa 7737 if (total_dev != fs_info->fs_devices->total_devices) {
d201238c
QW
7738 btrfs_warn(fs_info,
7739"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
0b246afa 7740 btrfs_super_num_devices(fs_info->super_copy),
99e3ecfc 7741 total_dev);
d201238c
QW
7742 fs_info->fs_devices->total_devices = total_dev;
7743 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
99e3ecfc 7744 }
0b246afa
JM
7745 if (btrfs_super_total_bytes(fs_info->super_copy) <
7746 fs_info->fs_devices->total_rw_bytes) {
7747 btrfs_err(fs_info,
99e3ecfc 7748 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
0b246afa
JM
7749 btrfs_super_total_bytes(fs_info->super_copy),
7750 fs_info->fs_devices->total_rw_bytes);
99e3ecfc
LB
7751 ret = -EINVAL;
7752 goto error;
7753 }
0b86a832
CM
7754 ret = 0;
7755error:
b367e47f
LZ
7756 mutex_unlock(&uuid_mutex);
7757
2b82032c 7758 btrfs_free_path(path);
0b86a832
CM
7759 return ret;
7760}
442a4f63 7761
a8d1b164 7762int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
cb517eab 7763{
944d3f9f 7764 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
cb517eab 7765 struct btrfs_device *device;
a8d1b164 7766 int ret = 0;
cb517eab 7767
944d3f9f
NB
7768 fs_devices->fs_info = fs_info;
7769
7770 mutex_lock(&fs_devices->device_list_mutex);
7771 list_for_each_entry(device, &fs_devices->devices, dev_list)
7772 device->fs_info = fs_info;
944d3f9f
NB
7773
7774 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
a8d1b164 7775 list_for_each_entry(device, &seed_devs->devices, dev_list) {
fb456252 7776 device->fs_info = fs_info;
a8d1b164
JT
7777 ret = btrfs_get_dev_zone_info(device, false);
7778 if (ret)
7779 break;
7780 }
29cc83f6 7781
944d3f9f 7782 seed_devs->fs_info = fs_info;
29cc83f6 7783 }
e17125b5 7784 mutex_unlock(&fs_devices->device_list_mutex);
a8d1b164
JT
7785
7786 return ret;
cb517eab
MX
7787}
7788
1dc990df
DS
7789static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7790 const struct btrfs_dev_stats_item *ptr,
7791 int index)
7792{
7793 u64 val;
7794
7795 read_extent_buffer(eb, &val,
7796 offsetof(struct btrfs_dev_stats_item, values) +
7797 ((unsigned long)ptr) + (index * sizeof(u64)),
7798 sizeof(val));
7799 return val;
7800}
7801
7802static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7803 struct btrfs_dev_stats_item *ptr,
7804 int index, u64 val)
7805{
7806 write_extent_buffer(eb, &val,
7807 offsetof(struct btrfs_dev_stats_item, values) +
7808 ((unsigned long)ptr) + (index * sizeof(u64)),
7809 sizeof(val));
7810}
7811
92e26df4
JB
7812static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7813 struct btrfs_path *path)
733f4fbb 7814{
124604eb 7815 struct btrfs_dev_stats_item *ptr;
733f4fbb 7816 struct extent_buffer *eb;
124604eb
JB
7817 struct btrfs_key key;
7818 int item_size;
7819 int i, ret, slot;
7820
82d62d06
JB
7821 if (!device->fs_info->dev_root)
7822 return 0;
7823
124604eb
JB
7824 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7825 key.type = BTRFS_PERSISTENT_ITEM_KEY;
7826 key.offset = device->devid;
7827 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7828 if (ret) {
7829 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7830 btrfs_dev_stat_set(device, i, 0);
7831 device->dev_stats_valid = 1;
7832 btrfs_release_path(path);
92e26df4 7833 return ret < 0 ? ret : 0;
124604eb
JB
7834 }
7835 slot = path->slots[0];
7836 eb = path->nodes[0];
3212fa14 7837 item_size = btrfs_item_size(eb, slot);
124604eb
JB
7838
7839 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7840
7841 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7842 if (item_size >= (1 + i) * sizeof(__le64))
7843 btrfs_dev_stat_set(device, i,
7844 btrfs_dev_stats_value(eb, ptr, i));
7845 else
7846 btrfs_dev_stat_set(device, i, 0);
7847 }
7848
7849 device->dev_stats_valid = 1;
7850 btrfs_dev_stat_print_on_load(device);
7851 btrfs_release_path(path);
92e26df4
JB
7852
7853 return 0;
124604eb
JB
7854}
7855
7856int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7857{
7858 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
733f4fbb
SB
7859 struct btrfs_device *device;
7860 struct btrfs_path *path = NULL;
92e26df4 7861 int ret = 0;
733f4fbb
SB
7862
7863 path = btrfs_alloc_path();
3b80a984
AJ
7864 if (!path)
7865 return -ENOMEM;
733f4fbb
SB
7866
7867 mutex_lock(&fs_devices->device_list_mutex);
92e26df4
JB
7868 list_for_each_entry(device, &fs_devices->devices, dev_list) {
7869 ret = btrfs_device_init_dev_stats(device, path);
7870 if (ret)
7871 goto out;
7872 }
124604eb 7873 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
92e26df4
JB
7874 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7875 ret = btrfs_device_init_dev_stats(device, path);
7876 if (ret)
7877 goto out;
7878 }
733f4fbb 7879 }
92e26df4 7880out:
733f4fbb
SB
7881 mutex_unlock(&fs_devices->device_list_mutex);
7882
733f4fbb 7883 btrfs_free_path(path);
92e26df4 7884 return ret;
733f4fbb
SB
7885}
7886
7887static int update_dev_stat_item(struct btrfs_trans_handle *trans,
733f4fbb
SB
7888 struct btrfs_device *device)
7889{
5495f195 7890 struct btrfs_fs_info *fs_info = trans->fs_info;
6bccf3ab 7891 struct btrfs_root *dev_root = fs_info->dev_root;
733f4fbb
SB
7892 struct btrfs_path *path;
7893 struct btrfs_key key;
7894 struct extent_buffer *eb;
7895 struct btrfs_dev_stats_item *ptr;
7896 int ret;
7897 int i;
7898
242e2956
DS
7899 key.objectid = BTRFS_DEV_STATS_OBJECTID;
7900 key.type = BTRFS_PERSISTENT_ITEM_KEY;
733f4fbb
SB
7901 key.offset = device->devid;
7902
7903 path = btrfs_alloc_path();
fa252992
DS
7904 if (!path)
7905 return -ENOMEM;
733f4fbb
SB
7906 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7907 if (ret < 0) {
0b246afa 7908 btrfs_warn_in_rcu(fs_info,
ecaeb14b 7909 "error %d while searching for dev_stats item for device %s",
606686ee 7910 ret, rcu_str_deref(device->name));
733f4fbb
SB
7911 goto out;
7912 }
7913
7914 if (ret == 0 &&
3212fa14 7915 btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
733f4fbb
SB
7916 /* need to delete old one and insert a new one */
7917 ret = btrfs_del_item(trans, dev_root, path);
7918 if (ret != 0) {
0b246afa 7919 btrfs_warn_in_rcu(fs_info,
ecaeb14b 7920 "delete too small dev_stats item for device %s failed %d",
606686ee 7921 rcu_str_deref(device->name), ret);
733f4fbb
SB
7922 goto out;
7923 }
7924 ret = 1;
7925 }
7926
7927 if (ret == 1) {
7928 /* need to insert a new item */
7929 btrfs_release_path(path);
7930 ret = btrfs_insert_empty_item(trans, dev_root, path,
7931 &key, sizeof(*ptr));
7932 if (ret < 0) {
0b246afa 7933 btrfs_warn_in_rcu(fs_info,
ecaeb14b
DS
7934 "insert dev_stats item for device %s failed %d",
7935 rcu_str_deref(device->name), ret);
733f4fbb
SB
7936 goto out;
7937 }
7938 }
7939
7940 eb = path->nodes[0];
7941 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7942 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7943 btrfs_set_dev_stats_value(eb, ptr, i,
7944 btrfs_dev_stat_read(device, i));
7945 btrfs_mark_buffer_dirty(eb);
7946
7947out:
7948 btrfs_free_path(path);
7949 return ret;
7950}
7951
7952/*
7953 * called from commit_transaction. Writes all changed device stats to disk.
7954 */
196c9d8d 7955int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
733f4fbb 7956{
196c9d8d 7957 struct btrfs_fs_info *fs_info = trans->fs_info;
733f4fbb
SB
7958 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7959 struct btrfs_device *device;
addc3fa7 7960 int stats_cnt;
733f4fbb
SB
7961 int ret = 0;
7962
7963 mutex_lock(&fs_devices->device_list_mutex);
7964 list_for_each_entry(device, &fs_devices->devices, dev_list) {
9deae968
NB
7965 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7966 if (!device->dev_stats_valid || stats_cnt == 0)
733f4fbb
SB
7967 continue;
7968
9deae968
NB
7969
7970 /*
7971 * There is a LOAD-LOAD control dependency between the value of
7972 * dev_stats_ccnt and updating the on-disk values which requires
7973 * reading the in-memory counters. Such control dependencies
7974 * require explicit read memory barriers.
7975 *
7976 * This memory barrier pairs with smp_mb__before_atomic in
7977 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7978 * barrier implied by atomic_xchg in
7979 * btrfs_dev_stats_read_and_reset
7980 */
7981 smp_rmb();
7982
5495f195 7983 ret = update_dev_stat_item(trans, device);
733f4fbb 7984 if (!ret)
addc3fa7 7985 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
733f4fbb
SB
7986 }
7987 mutex_unlock(&fs_devices->device_list_mutex);
7988
7989 return ret;
7990}
7991
442a4f63
SB
7992void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7993{
7994 btrfs_dev_stat_inc(dev, index);
442a4f63 7995
733f4fbb
SB
7996 if (!dev->dev_stats_valid)
7997 return;
fb456252 7998 btrfs_err_rl_in_rcu(dev->fs_info,
b14af3b4 7999 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
606686ee 8000 rcu_str_deref(dev->name),
442a4f63
SB
8001 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
8002 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
8003 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
efe120a0
FH
8004 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
8005 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
442a4f63 8006}
c11d2c23 8007
733f4fbb
SB
8008static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
8009{
a98cdb85
SB
8010 int i;
8011
8012 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
8013 if (btrfs_dev_stat_read(dev, i) != 0)
8014 break;
8015 if (i == BTRFS_DEV_STAT_VALUES_MAX)
8016 return; /* all values == 0, suppress message */
8017
fb456252 8018 btrfs_info_in_rcu(dev->fs_info,
ecaeb14b 8019 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
606686ee 8020 rcu_str_deref(dev->name),
733f4fbb
SB
8021 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
8022 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
8023 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
8024 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
8025 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
8026}
8027
2ff7e61e 8028int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
b27f7c0c 8029 struct btrfs_ioctl_get_dev_stats *stats)
c11d2c23 8030{
562d7b15 8031 BTRFS_DEV_LOOKUP_ARGS(args);
c11d2c23 8032 struct btrfs_device *dev;
0b246afa 8033 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
c11d2c23
SB
8034 int i;
8035
8036 mutex_lock(&fs_devices->device_list_mutex);
562d7b15
JB
8037 args.devid = stats->devid;
8038 dev = btrfs_find_device(fs_info->fs_devices, &args);
c11d2c23
SB
8039 mutex_unlock(&fs_devices->device_list_mutex);
8040
8041 if (!dev) {
0b246afa 8042 btrfs_warn(fs_info, "get dev_stats failed, device not found");
c11d2c23 8043 return -ENODEV;
733f4fbb 8044 } else if (!dev->dev_stats_valid) {
0b246afa 8045 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
733f4fbb 8046 return -ENODEV;
b27f7c0c 8047 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
c11d2c23
SB
8048 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
8049 if (stats->nr_items > i)
8050 stats->values[i] =
8051 btrfs_dev_stat_read_and_reset(dev, i);
8052 else
4e411a7d 8053 btrfs_dev_stat_set(dev, i, 0);
c11d2c23 8054 }
a69976bc
AJ
8055 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
8056 current->comm, task_pid_nr(current));
c11d2c23
SB
8057 } else {
8058 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
8059 if (stats->nr_items > i)
8060 stats->values[i] = btrfs_dev_stat_read(dev, i);
8061 }
8062 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
8063 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
8064 return 0;
8065}
a8a6dab7 8066
935e5cc9 8067/*
bbbf7243 8068 * Update the size and bytes used for each device where it changed. This is
8069 * delayed since we would otherwise get errors while writing out the
8070 * superblocks.
8071 *
8072 * Must be invoked during transaction commit.
935e5cc9 8073 */
bbbf7243 8074void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
935e5cc9 8075{
935e5cc9
MX
8076 struct btrfs_device *curr, *next;
8077
bbbf7243 8078 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
ce7213c7 8079
bbbf7243 8080 if (list_empty(&trans->dev_update_list))
ce7213c7
MX
8081 return;
8082
bbbf7243 8083 /*
8084 * We don't need the device_list_mutex here. This list is owned by the
8085 * transaction and the transaction must complete before the device is
8086 * released.
8087 */
8088 mutex_lock(&trans->fs_info->chunk_mutex);
8089 list_for_each_entry_safe(curr, next, &trans->dev_update_list,
8090 post_commit_list) {
8091 list_del_init(&curr->post_commit_list);
8092 curr->commit_total_bytes = curr->disk_total_bytes;
8093 curr->commit_bytes_used = curr->bytes_used;
ce7213c7 8094 }
bbbf7243 8095 mutex_unlock(&trans->fs_info->chunk_mutex);
ce7213c7 8096}
5a13f430 8097
46df06b8
DS
8098/*
8099 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
8100 */
8101int btrfs_bg_type_to_factor(u64 flags)
8102{
44b28ada 8103 const int index = btrfs_bg_flags_to_raid_index(flags);
8104
8105 return btrfs_raid_array[index].ncopies;
46df06b8 8106}
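/*
 * Example (illustrative): DUP, RAID1 and RAID10 all have ncopies = 2, so
 * each logical byte occupies two physical bytes and the factor is 2,
 * while SINGLE and RAID0 return 1.
 */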
cf90d884
QW
8107
8108
cf90d884
QW
8109
8110static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
8111 u64 chunk_offset, u64 devid,
8112 u64 physical_offset, u64 physical_len)
8113{
562d7b15 8114 struct btrfs_dev_lookup_args args = { .devid = devid };
c8bf1b67 8115 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
cf90d884
QW
8116 struct extent_map *em;
8117 struct map_lookup *map;
05a37c48 8118 struct btrfs_device *dev;
cf90d884
QW
8119 u64 stripe_len;
8120 bool found = false;
8121 int ret = 0;
8122 int i;
8123
8124 read_lock(&em_tree->lock);
8125 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
8126 read_unlock(&em_tree->lock);
8127
8128 if (!em) {
8129 btrfs_err(fs_info,
8130"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
8131 physical_offset, devid);
8132 ret = -EUCLEAN;
8133 goto out;
8134 }
8135
8136 map = em->map_lookup;
bc88b486 8137 stripe_len = btrfs_calc_stripe_length(em);
cf90d884
QW
8138 if (physical_len != stripe_len) {
8139 btrfs_err(fs_info,
8140"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
8141 physical_offset, devid, em->start, physical_len,
8142 stripe_len);
8143 ret = -EUCLEAN;
8144 goto out;
8145 }
8146
3613249a
QW
8147 /*
8148 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
8149 * space. Although kernel can handle it without problem, better to warn
8150 * the users.
8151 */
8152 if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
8153 btrfs_warn(fs_info,
8154 "devid %llu physical %llu len %llu inside the reserved space",
8155 devid, physical_offset, physical_len);
8156
cf90d884
QW
8157 for (i = 0; i < map->num_stripes; i++) {
8158 if (map->stripes[i].dev->devid == devid &&
8159 map->stripes[i].physical == physical_offset) {
8160 found = true;
8161 if (map->verified_stripes >= map->num_stripes) {
8162 btrfs_err(fs_info,
8163 "too many dev extents for chunk %llu found",
8164 em->start);
8165 ret = -EUCLEAN;
8166 goto out;
8167 }
8168 map->verified_stripes++;
8169 break;
8170 }
8171 }
8172 if (!found) {
8173 btrfs_err(fs_info,
8174 "dev extent physical offset %llu devid %llu has no corresponding chunk",
8175 physical_offset, devid);
8176 ret = -EUCLEAN;
8177 }
05a37c48 8178
1a9fd417 8179 /* Make sure no dev extent is beyond device boundary */
562d7b15 8180 dev = btrfs_find_device(fs_info->fs_devices, &args);
05a37c48
QW
8181 if (!dev) {
8182 btrfs_err(fs_info, "failed to find devid %llu", devid);
8183 ret = -EUCLEAN;
8184 goto out;
8185 }
1b3922a8 8186
05a37c48
QW
8187 if (physical_offset + physical_len > dev->disk_total_bytes) {
8188 btrfs_err(fs_info,
8189"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
8190 devid, physical_offset, physical_len,
8191 dev->disk_total_bytes);
8192 ret = -EUCLEAN;
8193 goto out;
8194 }
381a696e
NA
8195
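	/*
	 * On zoned devices a dev extent must cover whole zones. For example,
	 * with a (hypothetical) 256MiB zone size, both the physical offset
	 * and the length have to be multiples of 256MiB.
	 */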
8196 if (dev->zone_info) {
8197 u64 zone_size = dev->zone_info->zone_size;
8198
8199 if (!IS_ALIGNED(physical_offset, zone_size) ||
8200 !IS_ALIGNED(physical_len, zone_size)) {
8201 btrfs_err(fs_info,
8202"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
8203 devid, physical_offset, physical_len);
8204 ret = -EUCLEAN;
8205 goto out;
8206 }
8207 }
8208
cf90d884
QW
8209out:
8210 free_extent_map(em);
8211 return ret;
8212}
8213
8214static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
8215{
c8bf1b67 8216 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
cf90d884
QW
8217 struct extent_map *em;
8218 struct rb_node *node;
8219 int ret = 0;
8220
8221 read_lock(&em_tree->lock);
07e1ce09 8222 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
cf90d884
QW
8223 em = rb_entry(node, struct extent_map, rb_node);
8224 if (em->map_lookup->num_stripes !=
8225 em->map_lookup->verified_stripes) {
8226 btrfs_err(fs_info,
8227 "chunk %llu has missing dev extent, have %d expect %d",
8228 em->start, em->map_lookup->verified_stripes,
8229 em->map_lookup->num_stripes);
8230 ret = -EUCLEAN;
8231 goto out;
8232 }
8233 }
8234out:
8235 read_unlock(&em_tree->lock);
8236 return ret;
8237}
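/*
 * The two helpers above check opposite directions of the same relation:
 * verify_one_dev_extent() rejects dev extents without a matching chunk
 * stripe, while verify_chunk_dev_extent_mapping() rejects chunks with
 * stripes that no dev extent ever matched.
 */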
8238
8239/*
 8240 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 8241 * later chunk allocation/free would cause unexpected behavior.
 8242 *
 8243 * NOTE: This iterates through the whole device tree, which should be about
 8244 * the same size as the chunk tree. This slightly increases mount time.
8245 */
8246int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
8247{
8248 struct btrfs_path *path;
8249 struct btrfs_root *root = fs_info->dev_root;
8250 struct btrfs_key key;
5eb19381
QW
8251 u64 prev_devid = 0;
8252 u64 prev_dev_ext_end = 0;
cf90d884
QW
8253 int ret = 0;
8254
42437a63
JB
8255 /*
8256 * We don't have a dev_root because we mounted with ignorebadroots and
8257 * failed to load the root, so we want to skip the verification in this
8258 * case for sure.
8259 *
 8260 * However, if the dev root is fine but the tree itself is corrupted,
 8261 * we'd still fail to mount. This verification is only here to make sure
 8262 * writes can happen safely, so just bypass this check completely in the
 8263 * case of IGNOREBADROOTS.
8264 */
8265 if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
8266 return 0;
8267
cf90d884
QW
8268 key.objectid = 1;
8269 key.type = BTRFS_DEV_EXTENT_KEY;
8270 key.offset = 0;
8271
8272 path = btrfs_alloc_path();
8273 if (!path)
8274 return -ENOMEM;
8275
8276 path->reada = READA_FORWARD;
8277 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8278 if (ret < 0)
8279 goto out;
8280
8281 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ad9a9378 8282 ret = btrfs_next_leaf(root, path);
cf90d884
QW
8283 if (ret < 0)
8284 goto out;
8285 /* No dev extents at all? Not good */
8286 if (ret > 0) {
8287 ret = -EUCLEAN;
8288 goto out;
8289 }
8290 }
8291 while (1) {
8292 struct extent_buffer *leaf = path->nodes[0];
8293 struct btrfs_dev_extent *dext;
8294 int slot = path->slots[0];
8295 u64 chunk_offset;
8296 u64 physical_offset;
8297 u64 physical_len;
8298 u64 devid;
8299
8300 btrfs_item_key_to_cpu(leaf, &key, slot);
8301 if (key.type != BTRFS_DEV_EXTENT_KEY)
8302 break;
8303 devid = key.objectid;
8304 physical_offset = key.offset;
8305
8306 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
8307 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
8308 physical_len = btrfs_dev_extent_length(leaf, dext);
8309
5eb19381
QW
8310 /* Check if this dev extent overlaps with the previous one */
8311 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
8312 btrfs_err(fs_info,
8313"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
8314 devid, physical_offset, prev_dev_ext_end);
8315 ret = -EUCLEAN;
8316 goto out;
8317 }
8318
cf90d884
QW
8319 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
8320 physical_offset, physical_len);
8321 if (ret < 0)
8322 goto out;
5eb19381
QW
8323 prev_devid = devid;
8324 prev_dev_ext_end = physical_offset + physical_len;
8325
cf90d884
QW
8326 ret = btrfs_next_item(root, path);
8327 if (ret < 0)
8328 goto out;
8329 if (ret > 0) {
8330 ret = 0;
8331 break;
8332 }
8333 }
8334
8335 /* Ensure all chunks have corresponding dev extents */
8336 ret = verify_chunk_dev_extent_mapping(fs_info);
8337out:
8338 btrfs_free_path(path);
8339 return ret;
8340}
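/*
 * Usage note (the call site is outside this file): this verification is
 * expected to run once per mount, after the chunk and device trees have been
 * read, which is why the comment above calls out the mount time impact.
 */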
eede2bf3
OS
8341
8342/*
8343 * Check whether the given block group or device is pinned by any inode being
8344 * used as a swapfile.
8345 */
8346bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8347{
8348 struct btrfs_swapfile_pin *sp;
8349 struct rb_node *node;
8350
8351 spin_lock(&fs_info->swapfile_pins_lock);
8352 node = fs_info->swapfile_pins.rb_node;
8353 while (node) {
8354 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8355 if (ptr < sp->ptr)
8356 node = node->rb_left;
8357 else if (ptr > sp->ptr)
8358 node = node->rb_right;
8359 else
8360 break;
8361 }
8362 spin_unlock(&fs_info->swapfile_pins_lock);
8363 return node != NULL;
8364}
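/*
 * Illustrative caller sketch (hypothetical, condensed): a typical use is to
 * refuse operations on a device that backs an active swapfile, e.g.
 *
 *	if (btrfs_pinned_by_swapfile(fs_info, device))
 *		return -ETXTBSY;
 */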
f7ef5287
NA
8365
8366static int relocating_repair_kthread(void *data)
8367{
0d031dc4 8368 struct btrfs_block_group *cache = data;
f7ef5287
NA
8369 struct btrfs_fs_info *fs_info = cache->fs_info;
8370 u64 target;
8371 int ret = 0;
8372
8373 target = cache->start;
8374 btrfs_put_block_group(cache);
8375
ca5e4ea0 8376 sb_start_write(fs_info->sb);
f7ef5287
NA
8377 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8378 btrfs_info(fs_info,
8379 "zoned: skip relocating block group %llu to repair: EBUSY",
8380 target);
ca5e4ea0 8381 sb_end_write(fs_info->sb);
f7ef5287
NA
8382 return -EBUSY;
8383 }
8384
f3372065 8385 mutex_lock(&fs_info->reclaim_bgs_lock);
f7ef5287
NA
8386
8387 /* Ensure block group still exists */
8388 cache = btrfs_lookup_block_group(fs_info, target);
8389 if (!cache)
8390 goto out;
8391
3349b57f 8392 if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
f7ef5287
NA
8393 goto out;
8394
8395 ret = btrfs_may_alloc_data_chunk(fs_info, target);
8396 if (ret < 0)
8397 goto out;
8398
8399 btrfs_info(fs_info,
8400 "zoned: relocating block group %llu to repair IO failure",
8401 target);
8402 ret = btrfs_relocate_chunk(fs_info, target);
8403
8404out:
8405 if (cache)
8406 btrfs_put_block_group(cache);
f3372065 8407 mutex_unlock(&fs_info->reclaim_bgs_lock);
f7ef5287 8408 btrfs_exclop_finish(fs_info);
ca5e4ea0 8409 sb_end_write(fs_info->sb);
f7ef5287
NA
8410
8411 return ret;
8412}
8413
554aed7d 8414bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
f7ef5287
NA
8415{
8416 struct btrfs_block_group *cache;
8417
554aed7d
JT
8418 if (!btrfs_is_zoned(fs_info))
8419 return false;
8420
f7ef5287
NA
8421 /* Do not attempt to repair in degraded state */
8422 if (btrfs_test_opt(fs_info, DEGRADED))
554aed7d 8423 return true;
f7ef5287
NA
8424
8425 cache = btrfs_lookup_block_group(fs_info, logical);
8426 if (!cache)
554aed7d 8427 return true;
f7ef5287 8428
3349b57f 8429 if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
f7ef5287 8430 btrfs_put_block_group(cache);
554aed7d 8431 return true;
f7ef5287 8432 }
f7ef5287
NA
8433
8434 kthread_run(relocating_repair_kthread, cache,
8435 "btrfs-relocating-repair");
8436
554aed7d 8437 return true;
f7ef5287 8438}
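/*
 * Return semantics of btrfs_repair_one_zone(), as implemented above: false
 * only when the filesystem is not zoned, so the caller may fall back to
 * conventional repair; true means the zoned repair was either handed off to
 * the kthread or deliberately skipped (degraded mount, unknown block group,
 * or a repair already in flight).
 */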
d45cfb88
CH
8439
8440int __init btrfs_bioset_init(void)
8441{
8442 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
8443 offsetof(struct btrfs_bio, bio),
8444 BIOSET_NEED_BVECS))
8445 return -ENOMEM;
8446 return 0;
8447}
8448
8449void __cold btrfs_bioset_exit(void)
8450{
8451 bioset_exit(&btrfs_bioset);
8452}
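/*
 * Minimal allocation sketch (assumption, not part of this file): because
 * bioset_init() above reserves offsetof(struct btrfs_bio, bio) bytes of
 * front padding, a bio allocated from btrfs_bioset is embedded in a
 * struct btrfs_bio and can be converted back with the btrfs_bio() helper.
 * The function name and the op/gfp choices below are illustrative only.
 */
static struct btrfs_bio *example_alloc_btrfs_bio(struct block_device *bdev,
						 unsigned short nr_vecs)
{
	struct bio *bio;

	bio = bio_alloc_bioset(bdev, nr_vecs, REQ_OP_READ, GFP_NOFS,
			       &btrfs_bioset);
	return btrfs_bio(bio);
}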