1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/bitops.h>
4 #include <linux/slab.h>
5 #include <linux/blkdev.h>
6 #include <linux/sched/mm.h>
7 #include <linux/atomic.h>
8 #include <linux/vmalloc.h>
9 #include "ctree.h"
10 #include "volumes.h"
11 #include "zoned.h"
12 #include "rcu-string.h"
13 #include "disk-io.h"
14 #include "block-group.h"
15 #include "dev-replace.h"
16 #include "space-info.h"
17 #include "fs.h"
18 #include "accessors.h"
19 #include "bio.h"
20
21 /* Maximum number of zones to report per blkdev_report_zones() call */
22 #define BTRFS_REPORT_NR_ZONES   4096
23 /* Invalid allocation pointer value for missing devices */
24 #define WP_MISSING_DEV ((u64)-1)
25 /* Pseudo write pointer value for conventional zone */
26 #define WP_CONVENTIONAL ((u64)-2)
27
28 /*
29  * Location of the first zone of superblock logging zone pairs.
30  *
31  * - primary superblock:    0B (zone 0)
32  * - first copy:          512G (zone starting at that offset)
33  * - second copy:           4T (zone starting at that offset)
34  */
35 #define BTRFS_SB_LOG_PRIMARY_OFFSET     (0ULL)
36 #define BTRFS_SB_LOG_FIRST_OFFSET       (512ULL * SZ_1G)
37 #define BTRFS_SB_LOG_SECOND_OFFSET      (4096ULL * SZ_1G)
38
39 #define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
40 #define BTRFS_SB_LOG_SECOND_SHIFT       const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
41
42 /* Number of superblock log zones */
43 #define BTRFS_NR_SB_LOG_ZONES 2
44
45 /*
46  * Minimum number of active zones we need:
47  *
48  * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
49  * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
50  * - 1 zone for tree-log dedicated block group
51  * - 1 zone for relocation
52  */
53 #define BTRFS_MIN_ACTIVE_ZONES          (BTRFS_SUPER_MIRROR_MAX + 5)
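/*
 * As a concrete illustration of the breakdown above (assuming the usual
 * BTRFS_SUPER_MIRROR_MAX of 3, defined elsewhere): 3 superblock zones +
 * 3 block group zones + 1 tree-log zone + 1 relocation zone = 8 active
 * zones minimum.
 */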
54
55 /*
56  * Minimum / maximum supported zone size. Currently, SMR disks have a zone
57  * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
58  * We do not expect the zone size to become larger than 8GiB or smaller than
59  * 4MiB in the near future.
60  */
61 #define BTRFS_MAX_ZONE_SIZE             SZ_8G
62 #define BTRFS_MIN_ZONE_SIZE             SZ_4M
63
64 #define SUPER_INFO_SECTORS      ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
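/*
 * For example, assuming the usual 4KiB superblock (BTRFS_SUPER_INFO_SIZE ==
 * SZ_4K, defined elsewhere) and 512-byte sectors, SUPER_INFO_SECTORS
 * evaluates to 8.
 */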
65
66 static void wait_eb_writebacks(struct btrfs_block_group *block_group);
67 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
68
69 static inline bool sb_zone_is_full(const struct blk_zone *zone)
70 {
71         return (zone->cond == BLK_ZONE_COND_FULL) ||
72                 (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
73 }
74
75 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
76 {
77         struct blk_zone *zones = data;
78
79         memcpy(&zones[idx], zone, sizeof(*zone));
80
81         return 0;
82 }
83
84 static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
85                             u64 *wp_ret)
86 {
87         bool empty[BTRFS_NR_SB_LOG_ZONES];
88         bool full[BTRFS_NR_SB_LOG_ZONES];
89         sector_t sector;
90
91         for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
92                 ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
93                 empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
94                 full[i] = sb_zone_is_full(&zones[i]);
95         }
96
97         /*
98          * Possible states of log buffer zones
99          *
100          *           Empty[0]  In use[0]  Full[0]
101          * Empty[1]         *          0        1
102          * In use[1]        x          x        1
103          * Full[1]          0          0        C
104          *
105          * Log position:
106          *   *: Special case, no superblock is written
107          *   0: Use write pointer of zones[0]
108          *   1: Use write pointer of zones[1]
109          *   C: Compare super blocks from zones[0] and zones[1], use the latest
110          *      one determined by generation
111          *   x: Invalid state
112          */
113
114         if (empty[0] && empty[1]) {
115                 /* Special case to distinguish no superblock to read */
116                 *wp_ret = zones[0].start << SECTOR_SHIFT;
117                 return -ENOENT;
118         } else if (full[0] && full[1]) {
119                 /* Compare two super blocks */
120                 struct address_space *mapping = bdev->bd_mapping;
121                 struct page *page[BTRFS_NR_SB_LOG_ZONES];
122                 struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
123
124                 for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
125                         u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
126                         u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
127                                                 BTRFS_SUPER_INFO_SIZE;
128
129                         page[i] = read_cache_page_gfp(mapping,
130                                         bytenr >> PAGE_SHIFT, GFP_NOFS);
131                         if (IS_ERR(page[i])) {
132                                 if (i == 1)
133                                         btrfs_release_disk_super(super[0]);
134                                 return PTR_ERR(page[i]);
135                         }
136                         super[i] = page_address(page[i]);
137                 }
138
139                 if (btrfs_super_generation(super[0]) >
140                     btrfs_super_generation(super[1]))
141                         sector = zones[1].start;
142                 else
143                         sector = zones[0].start;
144
145                 for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
146                         btrfs_release_disk_super(super[i]);
147         } else if (!full[0] && (empty[1] || full[1])) {
148                 sector = zones[0].wp;
149         } else if (full[0]) {
150                 sector = zones[1].wp;
151         } else {
152                 return -EUCLEAN;
153         }
154         *wp_ret = sector << SECTOR_SHIFT;
155         return 0;
156 }
157
158 /*
159  * Get the first zone number of the superblock mirror
160  */
161 static inline u32 sb_zone_number(int shift, int mirror)
162 {
163         u64 zone = U64_MAX;
164
165         ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
166         switch (mirror) {
167         case 0: zone = 0; break;
168         case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
169         case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
170         }
171
172         ASSERT(zone <= U32_MAX);
173
174         return (u32)zone;
175 }
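/*
 * A worked example of sb_zone_number(), using the 256MiB SMR zone size
 * mentioned earlier in this file (shift == 28):
 *
 *   mirror 0: zone 0                         (offset   0B)
 *   mirror 1: 1 << (39 - 28) = zone  2048    (offset 512G)
 *   mirror 2: 1 << (42 - 28) = zone 16384    (offset   4T)
 *
 * i.e. each mirror's log zone pair starts exactly at the offsets listed in
 * the BTRFS_SB_LOG_*_OFFSET comment at the top of this file.
 */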
176
177 static inline sector_t zone_start_sector(u32 zone_number,
178                                          struct block_device *bdev)
179 {
180         return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
181 }
182
183 static inline u64 zone_start_physical(u32 zone_number,
184                                       struct btrfs_zoned_device_info *zone_info)
185 {
186         return (u64)zone_number << zone_info->zone_size_shift;
187 }
188
189 /*
190  * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
191  * device into fixed-size chunks and fakes a conventional zone on each of
192  * them.
193  */
194 static int emulate_report_zones(struct btrfs_device *device, u64 pos,
195                                 struct blk_zone *zones, unsigned int nr_zones)
196 {
197         const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
198         sector_t bdev_size = bdev_nr_sectors(device->bdev);
199         unsigned int i;
200
201         pos >>= SECTOR_SHIFT;
202         for (i = 0; i < nr_zones; i++) {
203                 zones[i].start = i * zone_sectors + pos;
204                 zones[i].len = zone_sectors;
205                 zones[i].capacity = zone_sectors;
206                 zones[i].wp = zones[i].start + zone_sectors;
207                 zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
208                 zones[i].cond = BLK_ZONE_COND_NOT_WP;
209
210                 if (zones[i].wp >= bdev_size) {
211                         i++;
212                         break;
213                 }
214         }
215
216         return i;
217 }
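/*
 * Illustration (not taken from a real device): with an emulated zone_size of
 * 256MiB and pos == 0, the first reported zones would look like:
 *
 *   zones[0]: start = 0,            len = capacity = 256MiB >> 9, wp = start + len
 *   zones[1]: start = 256MiB >> 9,  ...
 *
 * all CONVENTIONAL / NOT_WP, so the rest of the zoned code treats a regular
 * device as if it consisted entirely of conventional zones.
 */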
218
219 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
220                                struct blk_zone *zones, unsigned int *nr_zones)
221 {
222         struct btrfs_zoned_device_info *zinfo = device->zone_info;
223         int ret;
224
225         if (!*nr_zones)
226                 return 0;
227
228         if (!bdev_is_zoned(device->bdev)) {
229                 ret = emulate_report_zones(device, pos, zones, *nr_zones);
230                 *nr_zones = ret;
231                 return 0;
232         }
233
234         /* Check cache */
235         if (zinfo->zone_cache) {
236                 unsigned int i;
237                 u32 zno;
238
239                 ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
240                 zno = pos >> zinfo->zone_size_shift;
241                 /*
242          * We cannot report zones beyond the end of the device, so it is OK
243          * to cap *nr_zones at the last zone.
244                  */
245                 *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
246
247                 for (i = 0; i < *nr_zones; i++) {
248                         struct blk_zone *zone_info;
249
250                         zone_info = &zinfo->zone_cache[zno + i];
251                         if (!zone_info->len)
252                                 break;
253                 }
254
255                 if (i == *nr_zones) {
256                         /* Cache hit on all the zones */
257                         memcpy(zones, zinfo->zone_cache + zno,
258                                sizeof(*zinfo->zone_cache) * *nr_zones);
259                         return 0;
260                 }
261         }
262
263         ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
264                                   copy_zone_info_cb, zones);
265         if (ret < 0) {
266                 btrfs_err_in_rcu(device->fs_info,
267                                  "zoned: failed to read zone %llu on %s (devid %llu)",
268                                  pos, rcu_str_deref(device->name),
269                                  device->devid);
270                 return ret;
271         }
272         *nr_zones = ret;
273         if (!ret)
274                 return -EIO;
275
276         /* Populate cache */
277         if (zinfo->zone_cache) {
278                 u32 zno = pos >> zinfo->zone_size_shift;
279
280                 memcpy(zinfo->zone_cache + zno, zones,
281                        sizeof(*zinfo->zone_cache) * *nr_zones);
282         }
283
284         return 0;
285 }
286
287 /* The emulated zone size is determined from the size of the first device extent. */
288 static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
289 {
290         BTRFS_PATH_AUTO_FREE(path);
291         struct btrfs_root *root = fs_info->dev_root;
292         struct btrfs_key key;
293         struct extent_buffer *leaf;
294         struct btrfs_dev_extent *dext;
295         int ret = 0;
296
297         key.objectid = 1;
298         key.type = BTRFS_DEV_EXTENT_KEY;
299         key.offset = 0;
300
301         path = btrfs_alloc_path();
302         if (!path)
303                 return -ENOMEM;
304
305         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
306         if (ret < 0)
307                 return ret;
308
309         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
310                 ret = btrfs_next_leaf(root, path);
311                 if (ret < 0)
312                         return ret;
313                 /* No dev extents at all? Not good */
314                 if (ret > 0)
315                         return -EUCLEAN;
316         }
317
318         leaf = path->nodes[0];
319         dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
320         fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
321         return 0;
322 }
323
324 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
325 {
326         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
327         struct btrfs_device *device;
328         int ret = 0;
329
330         /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
331         if (!btrfs_fs_incompat(fs_info, ZONED))
332                 return 0;
333
334         mutex_lock(&fs_devices->device_list_mutex);
335         list_for_each_entry(device, &fs_devices->devices, dev_list) {
336                 /* We can skip reading of zone info for missing devices */
337                 if (!device->bdev)
338                         continue;
339
340                 ret = btrfs_get_dev_zone_info(device, true);
341                 if (ret)
342                         break;
343         }
344         mutex_unlock(&fs_devices->device_list_mutex);
345
346         return ret;
347 }
348
349 int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
350 {
351         struct btrfs_fs_info *fs_info = device->fs_info;
352         struct btrfs_zoned_device_info *zone_info = NULL;
353         struct block_device *bdev = device->bdev;
354         unsigned int max_active_zones;
355         unsigned int nactive;
356         sector_t nr_sectors;
357         sector_t sector = 0;
358         struct blk_zone *zones = NULL;
359         unsigned int i, nreported = 0, nr_zones;
360         sector_t zone_sectors;
361         char *model, *emulated;
362         int ret;
363
364         /*
365          * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
366          * yet be set.
367          */
368         if (!btrfs_fs_incompat(fs_info, ZONED))
369                 return 0;
370
371         if (device->zone_info)
372                 return 0;
373
374         zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
375         if (!zone_info)
376                 return -ENOMEM;
377
378         device->zone_info = zone_info;
379
380         if (!bdev_is_zoned(bdev)) {
381                 if (!fs_info->zone_size) {
382                         ret = calculate_emulated_zone_size(fs_info);
383                         if (ret)
384                                 goto out;
385                 }
386
387                 ASSERT(fs_info->zone_size);
388                 zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
389         } else {
390                 zone_sectors = bdev_zone_sectors(bdev);
391         }
392
393         ASSERT(is_power_of_two_u64(zone_sectors));
394         zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
395
396         /* We reject devices with a zone size larger than 8GiB */
397         if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
398                 btrfs_err_in_rcu(fs_info,
399                 "zoned: %s: zone size %llu larger than supported maximum %llu",
400                                  rcu_str_deref(device->name),
401                                  zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
402                 ret = -EINVAL;
403                 goto out;
404         } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
405                 btrfs_err_in_rcu(fs_info,
406                 "zoned: %s: zone size %llu smaller than supported minimum %u",
407                                  rcu_str_deref(device->name),
408                                  zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
409                 ret = -EINVAL;
410                 goto out;
411         }
412
413         nr_sectors = bdev_nr_sectors(bdev);
414         zone_info->zone_size_shift = ilog2(zone_info->zone_size);
415         zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
416         if (!IS_ALIGNED(nr_sectors, zone_sectors))
417                 zone_info->nr_zones++;
418
419         max_active_zones = bdev_max_active_zones(bdev);
420         if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
421                 btrfs_err_in_rcu(fs_info,
422 "zoned: %s: max active zones %u is too small, need at least %u active zones",
423                                  rcu_str_deref(device->name), max_active_zones,
424                                  BTRFS_MIN_ACTIVE_ZONES);
425                 ret = -EINVAL;
426                 goto out;
427         }
428         zone_info->max_active_zones = max_active_zones;
429
430         zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
431         if (!zone_info->seq_zones) {
432                 ret = -ENOMEM;
433                 goto out;
434         }
435
436         zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
437         if (!zone_info->empty_zones) {
438                 ret = -ENOMEM;
439                 goto out;
440         }
441
442         zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
443         if (!zone_info->active_zones) {
444                 ret = -ENOMEM;
445                 goto out;
446         }
447
448         zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
449         if (!zones) {
450                 ret = -ENOMEM;
451                 goto out;
452         }
453
454         /*
455          * Enable zone cache only for a zoned device. On a non-zoned device, we
456          * fill the zone info with emulated CONVENTIONAL zones, so no need to
457          * use the cache.
458          */
459         if (populate_cache && bdev_is_zoned(device->bdev)) {
460                 zone_info->zone_cache = vcalloc(zone_info->nr_zones,
461                                                 sizeof(struct blk_zone));
462                 if (!zone_info->zone_cache) {
463                         btrfs_err_in_rcu(device->fs_info,
464                                 "zoned: failed to allocate zone cache for %s",
465                                 rcu_str_deref(device->name));
466                         ret = -ENOMEM;
467                         goto out;
468                 }
469         }
470
471         /* Get the type and condition of each zone */
472         nactive = 0;
473         while (sector < nr_sectors) {
474                 nr_zones = BTRFS_REPORT_NR_ZONES;
475                 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
476                                           &nr_zones);
477                 if (ret)
478                         goto out;
479
480                 for (i = 0; i < nr_zones; i++) {
481                         if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
482                                 __set_bit(nreported, zone_info->seq_zones);
483                         switch (zones[i].cond) {
484                         case BLK_ZONE_COND_EMPTY:
485                                 __set_bit(nreported, zone_info->empty_zones);
486                                 break;
487                         case BLK_ZONE_COND_IMP_OPEN:
488                         case BLK_ZONE_COND_EXP_OPEN:
489                         case BLK_ZONE_COND_CLOSED:
490                                 __set_bit(nreported, zone_info->active_zones);
491                                 nactive++;
492                                 break;
493                         }
494                         nreported++;
495                 }
496                 sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
497         }
498
499         if (nreported != zone_info->nr_zones) {
500                 btrfs_err_in_rcu(device->fs_info,
501                                  "inconsistent number of zones on %s (%u/%u)",
502                                  rcu_str_deref(device->name), nreported,
503                                  zone_info->nr_zones);
504                 ret = -EIO;
505                 goto out;
506         }
507
508         if (max_active_zones) {
509                 if (nactive > max_active_zones) {
510                         btrfs_err_in_rcu(device->fs_info,
511                         "zoned: %u active zones on %s exceeds max_active_zones %u",
512                                          nactive, rcu_str_deref(device->name),
513                                          max_active_zones);
514                         ret = -EIO;
515                         goto out;
516                 }
517                 atomic_set(&zone_info->active_zones_left,
518                            max_active_zones - nactive);
519                 set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
520         }
521
522         /* Validate superblock log */
523         nr_zones = BTRFS_NR_SB_LOG_ZONES;
524         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
525                 u32 sb_zone;
526                 u64 sb_wp;
527                 int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
528
529                 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
530                 if (sb_zone + 1 >= zone_info->nr_zones)
531                         continue;
532
533                 ret = btrfs_get_dev_zones(device,
534                                           zone_start_physical(sb_zone, zone_info),
535                                           &zone_info->sb_zones[sb_pos],
536                                           &nr_zones);
537                 if (ret)
538                         goto out;
539
540                 if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
541                         btrfs_err_in_rcu(device->fs_info,
542         "zoned: failed to read super block log zone info at devid %llu zone %u",
543                                          device->devid, sb_zone);
544                         ret = -EUCLEAN;
545                         goto out;
546                 }
547
548                 /*
549                  * If zones[0] is conventional, always use the beginning of the
550                  * zone to record the superblock. No need to validate in that case.
551                  */
552                 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
553                     BLK_ZONE_TYPE_CONVENTIONAL)
554                         continue;
555
556                 ret = sb_write_pointer(device->bdev,
557                                        &zone_info->sb_zones[sb_pos], &sb_wp);
558                 if (ret != -ENOENT && ret) {
559                         btrfs_err_in_rcu(device->fs_info,
560                         "zoned: super block log zone corrupted devid %llu zone %u",
561                                          device->devid, sb_zone);
562                         ret = -EUCLEAN;
563                         goto out;
564                 }
565         }
566
567
568         kvfree(zones);
569
570         if (bdev_is_zoned(bdev)) {
571                 model = "host-managed zoned";
572                 emulated = "";
573         } else {
574                 model = "regular";
575                 emulated = "emulated ";
576         }
577
578         btrfs_info_in_rcu(fs_info,
579                 "%s block device %s, %u %szones of %llu bytes",
580                 model, rcu_str_deref(device->name), zone_info->nr_zones,
581                 emulated, zone_info->zone_size);
582
583         return 0;
584
585 out:
586         kvfree(zones);
587         btrfs_destroy_dev_zone_info(device);
588         return ret;
589 }
590
591 void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
592 {
593         struct btrfs_zoned_device_info *zone_info = device->zone_info;
594
595         if (!zone_info)
596                 return;
597
598         bitmap_free(zone_info->active_zones);
599         bitmap_free(zone_info->seq_zones);
600         bitmap_free(zone_info->empty_zones);
601         vfree(zone_info->zone_cache);
602         kfree(zone_info);
603         device->zone_info = NULL;
604 }
605
606 struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
607 {
608         struct btrfs_zoned_device_info *zone_info;
609
610         zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
611         if (!zone_info)
612                 return NULL;
613
614         zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
615         if (!zone_info->seq_zones)
616                 goto out;
617
618         bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
619                     zone_info->nr_zones);
620
621         zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
622         if (!zone_info->empty_zones)
623                 goto out;
624
625         bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
626                     zone_info->nr_zones);
627
628         zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
629         if (!zone_info->active_zones)
630                 goto out;
631
632         bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
633                     zone_info->nr_zones);
634         zone_info->zone_cache = NULL;
635
636         return zone_info;
637
638 out:
639         bitmap_free(zone_info->seq_zones);
640         bitmap_free(zone_info->empty_zones);
641         bitmap_free(zone_info->active_zones);
642         kfree(zone_info);
643         return NULL;
644 }
645
646 static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
647 {
648         unsigned int nr_zones = 1;
649         int ret;
650
651         ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
652         if (ret != 0 || !nr_zones)
653                 return ret ? ret : -EIO;
654
655         return 0;
656 }
657
658 static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
659 {
660         struct btrfs_device *device;
661
662         list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
663                 if (device->bdev && bdev_is_zoned(device->bdev)) {
664                         btrfs_err(fs_info,
665                                 "zoned: mode not enabled but zoned device found: %pg",
666                                 device->bdev);
667                         return -EINVAL;
668                 }
669         }
670
671         return 0;
672 }
673
674 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
675 {
676         struct queue_limits *lim = &fs_info->limits;
677         struct btrfs_device *device;
678         u64 zone_size = 0;
679         int ret;
680
681         /*
682          * Host-managed devices can't be used without the ZONED flag. With the
683          * ZONED flag all devices can be used, using zone emulation if required.
684          */
685         if (!btrfs_fs_incompat(fs_info, ZONED))
686                 return btrfs_check_for_zoned_device(fs_info);
687
688         blk_set_stacking_limits(lim);
689
690         list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
691                 struct btrfs_zoned_device_info *zone_info = device->zone_info;
692
693                 if (!device->bdev)
694                         continue;
695
696                 if (!zone_size) {
697                         zone_size = zone_info->zone_size;
698                 } else if (zone_info->zone_size != zone_size) {
699                         btrfs_err(fs_info,
700                 "zoned: unequal block device zone sizes: have %llu found %llu",
701                                   zone_info->zone_size, zone_size);
702                         return -EINVAL;
703                 }
704
705                 /*
706          * With zoned emulation, we can have a non-zoned device in
707          * zoned mode. In this case, we don't have a valid max zone
708          * append size.
709                  */
710                 if (bdev_is_zoned(device->bdev))
711                         blk_stack_limits(lim, bdev_limits(device->bdev), 0);
712         }
713
714         ret = blk_validate_limits(lim);
715         if (ret) {
716                 btrfs_err(fs_info, "zoned: failed to validate queue limits");
717                 return ret;
718         }
719
720         /*
721          * stripe_size is always aligned to BTRFS_STRIPE_LEN in
722          * btrfs_create_chunk(). Since we want stripe_len == zone_size,
723          * check the alignment here.
724          */
725         if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
726                 btrfs_err(fs_info,
727                           "zoned: zone size %llu not aligned to stripe %u",
728                           zone_size, BTRFS_STRIPE_LEN);
729                 return -EINVAL;
730         }
731
732         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
733                 btrfs_err(fs_info, "zoned: mixed block groups not supported");
734                 return -EINVAL;
735         }
736
737         fs_info->zone_size = zone_size;
738         /*
739          * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
740          * Technically, we can have multiple pages per segment. But, since
741          * we add the pages one by one to a bio, and cannot increase the
742          * metadata reservation even if it increases the number of extents, it
743          * is safe to stick with the limit.
744          */
745         fs_info->max_zone_append_size = ALIGN_DOWN(
746                 min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
747                      (u64)lim->max_sectors << SECTOR_SHIFT,
748                      (u64)lim->max_segments << PAGE_SHIFT),
749                 fs_info->sectorsize);
750         fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
751         if (fs_info->max_zone_append_size < fs_info->max_extent_size)
752                 fs_info->max_extent_size = fs_info->max_zone_append_size;
753
754         /*
755          * Check mount options here, because the zoned status, which is derived
756          * from fs_info->zone_size, has only just been established above.
757          */
758         ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
759         if (ret)
760                 return ret;
761
762         btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
763         return 0;
764 }
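/*
 * A sketch of the max_zone_append_size calculation above, with made-up queue
 * limits (the real values come from the stacked block device limits):
 *
 *   max_zone_append_sectors = 1024  ->  512KiB
 *   max_sectors             = 2560  -> 1280KiB
 *   max_segments            =  128  ->  512KiB (assuming 4KiB pages)
 *
 * min3() picks 512KiB, which is already aligned to a 4KiB sectorsize, so
 * fs_info->max_zone_append_size (and possibly max_extent_size) becomes 512KiB.
 */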
765
766 int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
767                                 unsigned long long *mount_opt)
768 {
769         if (!btrfs_is_zoned(info))
770                 return 0;
771
772         /*
773          * Space cache writing is not COWed. Disable that to avoid write errors
774          * in sequential zones.
775          */
776         if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
777                 btrfs_err(info, "zoned: space cache v1 is not supported");
778                 return -EINVAL;
779         }
780
781         if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
782                 btrfs_err(info, "zoned: NODATACOW not supported");
783                 return -EINVAL;
784         }
785
786         if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
787                 btrfs_info(info,
788                            "zoned: async discard ignored and disabled for zoned mode");
789                 btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
790         }
791
792         return 0;
793 }
794
795 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
796                            int rw, u64 *bytenr_ret)
797 {
798         u64 wp;
799         int ret;
800
801         if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
802                 *bytenr_ret = zones[0].start << SECTOR_SHIFT;
803                 return 0;
804         }
805
806         ret = sb_write_pointer(bdev, zones, &wp);
807         if (ret != -ENOENT && ret < 0)
808                 return ret;
809
810         if (rw == WRITE) {
811                 struct blk_zone *reset = NULL;
812
813                 if (wp == zones[0].start << SECTOR_SHIFT)
814                         reset = &zones[0];
815                 else if (wp == zones[1].start << SECTOR_SHIFT)
816                         reset = &zones[1];
817
818                 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
819                         unsigned int nofs_flags;
820
821                         ASSERT(sb_zone_is_full(reset));
822
823                         nofs_flags = memalloc_nofs_save();
824                         ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
825                                                reset->start, reset->len);
826                         memalloc_nofs_restore(nofs_flags);
827                         if (ret)
828                                 return ret;
829
830                         reset->cond = BLK_ZONE_COND_EMPTY;
831                         reset->wp = reset->start;
832                 }
833         } else if (ret != -ENOENT) {
834                 /*
835                  * For READ, we want the previously written superblock. If the write
836                  * pointer is at the start of a zone, move it to the end of the other zone.
837                  */
838                 u64 zone_end = 0;
839
840                 if (wp == zones[0].start << SECTOR_SHIFT)
841                         zone_end = zones[1].start + zones[1].capacity;
842                 else if (wp == zones[1].start << SECTOR_SHIFT)
843                         zone_end = zones[0].start + zones[0].capacity;
844                 if (zone_end)
845                         wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
846                                         BTRFS_SUPER_INFO_SIZE);
847
848                 wp -= BTRFS_SUPER_INFO_SIZE;
849         }
850
851         *bytenr_ret = wp;
852         return 0;
853
854 }
855
856 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
857                                u64 *bytenr_ret)
858 {
859         struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
860         sector_t zone_sectors;
861         u32 sb_zone;
862         int ret;
863         u8 zone_sectors_shift;
864         sector_t nr_sectors;
865         u32 nr_zones;
866
867         if (!bdev_is_zoned(bdev)) {
868                 *bytenr_ret = btrfs_sb_offset(mirror);
869                 return 0;
870         }
871
872         ASSERT(rw == READ || rw == WRITE);
873
874         zone_sectors = bdev_zone_sectors(bdev);
875         if (!is_power_of_2(zone_sectors))
876                 return -EINVAL;
877         zone_sectors_shift = ilog2(zone_sectors);
878         nr_sectors = bdev_nr_sectors(bdev);
879         nr_zones = nr_sectors >> zone_sectors_shift;
880
881         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
882         if (sb_zone + 1 >= nr_zones)
883                 return -ENOENT;
884
885         ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
886                                   BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
887                                   zones);
888         if (ret < 0)
889                 return ret;
890         if (ret != BTRFS_NR_SB_LOG_ZONES)
891                 return -EIO;
892
893         return sb_log_location(bdev, zones, rw, bytenr_ret);
894 }
895
896 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
897                           u64 *bytenr_ret)
898 {
899         struct btrfs_zoned_device_info *zinfo = device->zone_info;
900         u32 zone_num;
901
902         /*
903          * For a zoned filesystem on a non-zoned block device, use the same
904          * super block locations as a regular filesystem. Doing so, the super
905          * block can always be retrieved and the zoned flag of the volume can be
906          * detected from the super block information.
907          */
908         if (!bdev_is_zoned(device->bdev)) {
909                 *bytenr_ret = btrfs_sb_offset(mirror);
910                 return 0;
911         }
912
913         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
914         if (zone_num + 1 >= zinfo->nr_zones)
915                 return -ENOENT;
916
917         return sb_log_location(device->bdev,
918                                &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
919                                rw, bytenr_ret);
920 }
921
922 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
923                                   int mirror)
924 {
925         u32 zone_num;
926
927         if (!zinfo)
928                 return false;
929
930         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
931         if (zone_num + 1 >= zinfo->nr_zones)
932                 return false;
933
934         if (!test_bit(zone_num, zinfo->seq_zones))
935                 return false;
936
937         return true;
938 }
939
940 int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
941 {
942         struct btrfs_zoned_device_info *zinfo = device->zone_info;
943         struct blk_zone *zone;
944         int i;
945
946         if (!is_sb_log_zone(zinfo, mirror))
947                 return 0;
948
949         zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
950         for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
951                 /* Advance to the next zone */
952                 if (zone->cond == BLK_ZONE_COND_FULL) {
953                         zone++;
954                         continue;
955                 }
956
957                 if (zone->cond == BLK_ZONE_COND_EMPTY)
958                         zone->cond = BLK_ZONE_COND_IMP_OPEN;
959
960                 zone->wp += SUPER_INFO_SECTORS;
961
962                 if (sb_zone_is_full(zone)) {
963                         /*
964                          * No room left to write a new superblock. Since the
965                          * superblock is written with REQ_SYNC, it is safe to
966                          * finish the zone now.
967                          *
968                          * If the write pointer is exactly at the capacity,
969                          * explicit ZONE_FINISH is not necessary.
970                          */
971                         if (zone->wp != zone->start + zone->capacity) {
972                                 unsigned int nofs_flags;
973                                 int ret;
974
975                                 nofs_flags = memalloc_nofs_save();
976                                 ret = blkdev_zone_mgmt(device->bdev,
977                                                 REQ_OP_ZONE_FINISH, zone->start,
978                                                 zone->len);
979                                 memalloc_nofs_restore(nofs_flags);
980                                 if (ret)
981                                         return ret;
982                         }
983
984                         zone->wp = zone->start + zone->len;
985                         zone->cond = BLK_ZONE_COND_FULL;
986                 }
987                 return 0;
988         }
989
990         /* All the zones are FULL. Should not reach here. */
991         ASSERT(0);
992         return -EIO;
993 }
994
995 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
996 {
997         unsigned int nofs_flags;
998         sector_t zone_sectors;
999         sector_t nr_sectors;
1000         u8 zone_sectors_shift;
1001         u32 sb_zone;
1002         u32 nr_zones;
1003         int ret;
1004
1005         zone_sectors = bdev_zone_sectors(bdev);
1006         zone_sectors_shift = ilog2(zone_sectors);
1007         nr_sectors = bdev_nr_sectors(bdev);
1008         nr_zones = nr_sectors >> zone_sectors_shift;
1009
1010         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
1011         if (sb_zone + 1 >= nr_zones)
1012                 return -ENOENT;
1013
1014         nofs_flags = memalloc_nofs_save();
1015         ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1016                                zone_start_sector(sb_zone, bdev),
1017                                zone_sectors * BTRFS_NR_SB_LOG_ZONES);
1018         memalloc_nofs_restore(nofs_flags);
1019         return ret;
1020 }
1021
1022 /*
1023  * Find allocatable zones within a given region.
1024  *
1025  * @device:     the device to allocate a region on
1026  * @hole_start: the position of the hole to allocate the region
1027  * @num_bytes:  size of wanted region
1028  * @hole_end:   the end of the hole
1029  * @return:     position of allocatable zones
1030  *
1031  * Allocatable region should not contain any superblock locations.
1032  */
1033 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
1034                                  u64 hole_end, u64 num_bytes)
1035 {
1036         struct btrfs_zoned_device_info *zinfo = device->zone_info;
1037         const u8 shift = zinfo->zone_size_shift;
1038         u64 nzones = num_bytes >> shift;
1039         u64 pos = hole_start;
1040         u64 begin, end;
1041         bool have_sb;
1042         int i;
1043
1044         ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
1045         ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
1046
1047         while (pos < hole_end) {
1048                 begin = pos >> shift;
1049                 end = begin + nzones;
1050
1051                 if (end > zinfo->nr_zones)
1052                         return hole_end;
1053
1054                 /* Check if zones in the region are all empty */
1055                 if (btrfs_dev_is_sequential(device, pos) &&
1056                     !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
1057                         pos += zinfo->zone_size;
1058                         continue;
1059                 }
1060
1061                 have_sb = false;
1062                 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1063                         u32 sb_zone;
1064                         u64 sb_pos;
1065
1066                         sb_zone = sb_zone_number(shift, i);
1067                         if (!(end <= sb_zone ||
1068                               sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
1069                                 have_sb = true;
1070                                 pos = zone_start_physical(
1071                                         sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
1072                                 break;
1073                         }
1074
1075                         /* We also need to exclude regular superblock positions */
1076                         sb_pos = btrfs_sb_offset(i);
1077                         if (!(pos + num_bytes <= sb_pos ||
1078                               sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
1079                                 have_sb = true;
1080                                 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
1081                                             zinfo->zone_size);
1082                                 break;
1083                         }
1084                 }
1085                 if (!have_sb)
1086                         break;
1087         }
1088
1089         return pos;
1090 }
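/*
 * Example of the superblock exclusion above, again assuming 256MiB zones:
 * superblock mirror 0 occupies zones 0-1, so a hole starting at 0 is pushed
 * to zone_start_physical(2, zinfo) == 512MiB; a hole overlapping mirror 1
 * (zones 2048-2049 at 512GiB) is pushed to zone 2050, and so on.
 */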
1091
1092 static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
1093 {
1094         struct btrfs_zoned_device_info *zone_info = device->zone_info;
1095         unsigned int zno = (pos >> zone_info->zone_size_shift);
1096
1097         /* We can use any number of zones */
1098         if (zone_info->max_active_zones == 0)
1099                 return true;
1100
1101         if (!test_bit(zno, zone_info->active_zones)) {
1102                 /* Active zone left? */
1103                 if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
1104                         return false;
1105                 if (test_and_set_bit(zno, zone_info->active_zones)) {
1106                         /* Someone already set the bit */
1107                         atomic_inc(&zone_info->active_zones_left);
1108                 }
1109         }
1110
1111         return true;
1112 }
1113
1114 static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
1115 {
1116         struct btrfs_zoned_device_info *zone_info = device->zone_info;
1117         unsigned int zno = (pos >> zone_info->zone_size_shift);
1118
1119         /* We can use any number of zones */
1120         if (zone_info->max_active_zones == 0)
1121                 return;
1122
1123         if (test_and_clear_bit(zno, zone_info->active_zones))
1124                 atomic_inc(&zone_info->active_zones_left);
1125 }
1126
1127 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
1128                             u64 length, u64 *bytes)
1129 {
1130         unsigned int nofs_flags;
1131         int ret;
1132
1133         *bytes = 0;
1134         nofs_flags = memalloc_nofs_save();
1135         ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
1136                                physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
1137         memalloc_nofs_restore(nofs_flags);
1138         if (ret)
1139                 return ret;
1140
1141         *bytes = length;
1142         while (length) {
1143                 btrfs_dev_set_zone_empty(device, physical);
1144                 btrfs_dev_clear_active_zone(device, physical);
1145                 physical += device->zone_info->zone_size;
1146                 length -= device->zone_info->zone_size;
1147         }
1148
1149         return 0;
1150 }
1151
1152 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
1153 {
1154         struct btrfs_zoned_device_info *zinfo = device->zone_info;
1155         const u8 shift = zinfo->zone_size_shift;
1156         unsigned long begin = start >> shift;
1157         unsigned long nbits = size >> shift;
1158         u64 pos;
1159         int ret;
1160
1161         ASSERT(IS_ALIGNED(start, zinfo->zone_size));
1162         ASSERT(IS_ALIGNED(size, zinfo->zone_size));
1163
1164         if (begin + nbits > zinfo->nr_zones)
1165                 return -ERANGE;
1166
1167         /* All the zones are conventional */
1168         if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
1169                 return 0;
1170
1171         /* All the zones are sequential and empty */
1172         if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
1173             bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
1174                 return 0;
1175
1176         for (pos = start; pos < start + size; pos += zinfo->zone_size) {
1177                 u64 reset_bytes;
1178
1179                 if (!btrfs_dev_is_sequential(device, pos) ||
1180                     btrfs_dev_is_empty_zone(device, pos))
1181                         continue;
1182
1183                 /* Free regions should be empty */
1184                 btrfs_warn_in_rcu(
1185                         device->fs_info,
1186                 "zoned: resetting device %s (devid %llu) zone %llu for allocation",
1187                         rcu_str_deref(device->name), device->devid, pos >> shift);
1188                 WARN_ON_ONCE(1);
1189
1190                 ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
1191                                               &reset_bytes);
1192                 if (ret)
1193                         return ret;
1194         }
1195
1196         return 0;
1197 }
1198
1199 /*
1200  * Calculate an allocation pointer from the extent allocation information
1201  * for a block group consisting of conventional zones. The pointer is set to
1202  * the end of the highest addressed extent in the block group as the
1203  * allocation offset.
1204  */
1205 static int calculate_alloc_pointer(struct btrfs_block_group *cache,
1206                                    u64 *offset_ret, bool new)
1207 {
1208         struct btrfs_fs_info *fs_info = cache->fs_info;
1209         struct btrfs_root *root;
1210         BTRFS_PATH_AUTO_FREE(path);
1211         struct btrfs_key key;
1212         struct btrfs_key found_key;
1213         int ret;
1214         u64 length;
1215
1216         /*
1217          * Avoid tree lookups for a new block group, there's no use for it.
1218          * It must always be 0.
1219          *
1220          * Also, we have a lock chain of extent buffer lock -> chunk mutex.
1221          * For a new block group, this function is called from
1222          * btrfs_make_block_group() which is already taking the chunk mutex.
1223          * Thus, we cannot call calculate_alloc_pointer() which takes extent
1224          * buffer locks to avoid deadlock.
1225          */
1226         if (new) {
1227                 *offset_ret = 0;
1228                 return 0;
1229         }
1230
1231         path = btrfs_alloc_path();
1232         if (!path)
1233                 return -ENOMEM;
1234
1235         key.objectid = cache->start + cache->length;
1236         key.type = 0;
1237         key.offset = 0;
1238
1239         root = btrfs_extent_root(fs_info, key.objectid);
1240         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1241         /* We should not find the exact match */
1242         if (!ret)
1243                 ret = -EUCLEAN;
1244         if (ret < 0)
1245                 return ret;
1246
1247         ret = btrfs_previous_extent_item(root, path, cache->start);
1248         if (ret) {
1249                 if (ret == 1) {
1250                         ret = 0;
1251                         *offset_ret = 0;
1252                 }
1253                 return ret;
1254         }
1255
1256         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1257
1258         if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1259                 length = found_key.offset;
1260         else
1261                 length = fs_info->nodesize;
1262
1263         if (!(found_key.objectid >= cache->start &&
1264                found_key.objectid + length <= cache->start + cache->length)) {
1265                 return -EUCLEAN;
1266         }
1267         *offset_ret = found_key.objectid + length - cache->start;
1268         return 0;
1269 }
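/*
 * Illustration with made-up numbers: for a conventional-zone block group at
 * logical 1G with length 256M, whose highest addressed extent is a 16KiB
 * metadata node at 1G + 64M, the lookup above yields
 * *offset_ret = (1G + 64M + 16K) - 1G = 64M + 16K.
 */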
1270
1271 struct zone_info {
1272         u64 physical;
1273         u64 capacity;
1274         u64 alloc_offset;
1275 };
1276
1277 static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
1278                                 struct zone_info *info, unsigned long *active,
1279                                 struct btrfs_chunk_map *map)
1280 {
1281         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1282         struct btrfs_device *device;
1283         int dev_replace_is_ongoing = 0;
1284         unsigned int nofs_flag;
1285         struct blk_zone zone;
1286         int ret;
1287
1288         info->physical = map->stripes[zone_idx].physical;
1289
1290         down_read(&dev_replace->rwsem);
1291         device = map->stripes[zone_idx].dev;
1292
1293         if (!device->bdev) {
1294                 up_read(&dev_replace->rwsem);
1295                 info->alloc_offset = WP_MISSING_DEV;
1296                 return 0;
1297         }
1298
1299         /* Consider a zone as active if we can allow any number of active zones. */
1300         if (!device->zone_info->max_active_zones)
1301                 __set_bit(zone_idx, active);
1302
1303         if (!btrfs_dev_is_sequential(device, info->physical)) {
1304                 up_read(&dev_replace->rwsem);
1305                 info->alloc_offset = WP_CONVENTIONAL;
1306                 return 0;
1307         }
1308
1309         /* This zone will be used for allocation, so mark this zone non-empty. */
1310         btrfs_dev_clear_zone_empty(device, info->physical);
1311
1312         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1313         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1314                 btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
1315
1316         /*
1317          * The group is mapped to a sequential zone. Get the zone write pointer
1318          * to determine the allocation offset within the zone.
1319          */
1320         WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
1321         nofs_flag = memalloc_nofs_save();
1322         ret = btrfs_get_dev_zone(device, info->physical, &zone);
1323         memalloc_nofs_restore(nofs_flag);
1324         if (ret) {
1325                 up_read(&dev_replace->rwsem);
1326                 if (ret != -EIO && ret != -EOPNOTSUPP)
1327                         return ret;
1328                 info->alloc_offset = WP_MISSING_DEV;
1329                 return 0;
1330         }
1331
1332         if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1333                 btrfs_err_in_rcu(fs_info,
1334                 "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1335                         zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
1336                         device->devid);
1337                 up_read(&dev_replace->rwsem);
1338                 return -EIO;
1339         }
1340
1341         info->capacity = (zone.capacity << SECTOR_SHIFT);
1342
1343         switch (zone.cond) {
1344         case BLK_ZONE_COND_OFFLINE:
1345         case BLK_ZONE_COND_READONLY:
1346                 btrfs_err_in_rcu(fs_info,
1347                 "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1348                           (info->physical >> device->zone_info->zone_size_shift),
1349                           rcu_str_deref(device->name), device->devid);
1350                 info->alloc_offset = WP_MISSING_DEV;
1351                 break;
1352         case BLK_ZONE_COND_EMPTY:
1353                 info->alloc_offset = 0;
1354                 break;
1355         case BLK_ZONE_COND_FULL:
1356                 info->alloc_offset = info->capacity;
1357                 break;
1358         default:
1359                 /* Partially used zone. */
1360                 info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
1361                 __set_bit(zone_idx, active);
1362                 break;
1363         }
1364
1365         up_read(&dev_replace->rwsem);
1366
1367         return 0;
1368 }
1369
1370 static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
1371                                          struct zone_info *info,
1372                                          unsigned long *active)
1373 {
1374         if (info->alloc_offset == WP_MISSING_DEV) {
1375                 btrfs_err(bg->fs_info,
1376                         "zoned: cannot recover write pointer for zone %llu",
1377                         info->physical);
1378                 return -EIO;
1379         }
1380
1381         bg->alloc_offset = info->alloc_offset;
1382         bg->zone_capacity = info->capacity;
1383         if (test_bit(0, active))
1384                 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1385         return 0;
1386 }
1387
1388 static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
1389                                       struct btrfs_chunk_map *map,
1390                                       struct zone_info *zone_info,
1391                                       unsigned long *active)
1392 {
1393         struct btrfs_fs_info *fs_info = bg->fs_info;
1394
1395         if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
1396                 btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
1397                 return -EINVAL;
1398         }
1399
1400         bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
1401
1402         if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
1403                 btrfs_err(bg->fs_info,
1404                           "zoned: cannot recover write pointer for zone %llu",
1405                           zone_info[0].physical);
1406                 return -EIO;
1407         }
1408         if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
1409                 btrfs_err(bg->fs_info,
1410                           "zoned: cannot recover write pointer for zone %llu",
1411                           zone_info[1].physical);
1412                 return -EIO;
1413         }
1414         if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
1415                 btrfs_err(bg->fs_info,
1416                           "zoned: write pointer offset mismatch of zones in DUP profile");
1417                 return -EIO;
1418         }
1419
1420         if (test_bit(0, active) != test_bit(1, active)) {
1421                 if (!btrfs_zone_activate(bg))
1422                         return -EIO;
1423         } else if (test_bit(0, active)) {
1424                 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1425         }
1426
1427         bg->alloc_offset = zone_info[0].alloc_offset;
1428         return 0;
1429 }
1430
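     /*
      * Load the zone info of a RAID1* profile block group. Missing or
      * conventional stripes are skipped, and a write pointer mismatch is only
      * tolerated when mounted with -o degraded.
      */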
1431 static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
1432                                         struct btrfs_chunk_map *map,
1433                                         struct zone_info *zone_info,
1434                                         unsigned long *active)
1435 {
1436         struct btrfs_fs_info *fs_info = bg->fs_info;
1437         int i;
1438
1439         if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
1440                 btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
1441                           btrfs_bg_type_to_raid_name(map->type));
1442                 return -EINVAL;
1443         }
1444
1445         /* In case a device is missing we have a cap of 0, so don't use it. */
1446         bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
1447
1448         for (i = 0; i < map->num_stripes; i++) {
1449                 if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
1450                     zone_info[i].alloc_offset == WP_CONVENTIONAL)
1451                         continue;
1452
1453                 if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
1454                     !btrfs_test_opt(fs_info, DEGRADED)) {
1455                         btrfs_err(fs_info,
1456                         "zoned: write pointer offset mismatch of zones in %s profile",
1457                                   btrfs_bg_type_to_raid_name(map->type));
1458                         return -EIO;
1459                 }
1460                 if (test_bit(0, active) != test_bit(i, active)) {
1461                         if (!btrfs_test_opt(fs_info, DEGRADED) &&
1462                             !btrfs_zone_activate(bg)) {
1463                                 return -EIO;
1464                         }
1465                 } else {
1466                         if (test_bit(0, active))
1467                                 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1468                 }
1469         }
1470
1471         if (zone_info[0].alloc_offset != WP_MISSING_DEV)
1472                 bg->alloc_offset = zone_info[0].alloc_offset;
1473         else
1474                 bg->alloc_offset = zone_info[i - 1].alloc_offset;
1475
1476         return 0;
1477 }
1478
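     /*
      * Load the zone info of a RAID0 profile block group. The capacity and the
      * allocation offset are the sums over all stripes.
      */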
1479 static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
1480                                         struct btrfs_chunk_map *map,
1481                                         struct zone_info *zone_info,
1482                                         unsigned long *active)
1483 {
1484         struct btrfs_fs_info *fs_info = bg->fs_info;
1485
1486         if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
1487                 btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
1488                           btrfs_bg_type_to_raid_name(map->type));
1489                 return -EINVAL;
1490         }
1491
1492         for (int i = 0; i < map->num_stripes; i++) {
1493                 if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
1494                     zone_info[i].alloc_offset == WP_CONVENTIONAL)
1495                         continue;
1496
1497                 if (test_bit(0, active) != test_bit(i, active)) {
1498                         if (!btrfs_zone_activate(bg))
1499                                 return -EIO;
1500                 } else {
1501                         if (test_bit(0, active))
1502                                 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1503                 }
1504                 bg->zone_capacity += zone_info[i].capacity;
1505                 bg->alloc_offset += zone_info[i].alloc_offset;
1506         }
1507
1508         return 0;
1509 }
1510
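     /*
      * Load the zone info of a RAID10 profile block group. Only the first
      * stripe of each mirror group contributes to the capacity and the
      * allocation offset.
      */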
1511 static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
1512                                          struct btrfs_chunk_map *map,
1513                                          struct zone_info *zone_info,
1514                                          unsigned long *active)
1515 {
1516         struct btrfs_fs_info *fs_info = bg->fs_info;
1517
1518         if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
1519                 btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
1520                           btrfs_bg_type_to_raid_name(map->type));
1521                 return -EINVAL;
1522         }
1523
1524         for (int i = 0; i < map->num_stripes; i++) {
1525                 if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
1526                     zone_info[i].alloc_offset == WP_CONVENTIONAL)
1527                         continue;
1528
1529                 if (test_bit(0, active) != test_bit(i, active)) {
1530                         if (!btrfs_zone_activate(bg))
1531                                 return -EIO;
1532                 } else {
1533                         if (test_bit(0, active))
1534                                 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
1535                 }
1536
1537                 if ((i % map->sub_stripes) == 0) {
1538                         bg->zone_capacity += zone_info[i].capacity;
1539                         bg->alloc_offset += zone_info[i].alloc_offset;
1540                 }
1541         }
1542
1543         return 0;
1544 }
1545
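     /*
      * Load the zoned allocation info of a block group: read the zone of every
      * stripe, calculate an emulated allocation pointer for conventional zones,
      * dispatch to the per-profile helper above, and validate the resulting
      * allocation offset against the zone capacity.
      */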
1546 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1547 {
1548         struct btrfs_fs_info *fs_info = cache->fs_info;
1549         struct btrfs_chunk_map *map;
1550         u64 logical = cache->start;
1551         u64 length = cache->length;
1552         struct zone_info *zone_info = NULL;
1553         int ret;
1554         int i;
1555         unsigned long *active = NULL;
1556         u64 last_alloc = 0;
1557         u32 num_sequential = 0, num_conventional = 0;
1558         u64 profile;
1559
1560         if (!btrfs_is_zoned(fs_info))
1561                 return 0;
1562
1563         /* Sanity check */
1564         if (!IS_ALIGNED(length, fs_info->zone_size)) {
1565                 btrfs_err(fs_info,
1566                 "zoned: block group %llu len %llu unaligned to zone size %llu",
1567                           logical, length, fs_info->zone_size);
1568                 return -EIO;
1569         }
1570
1571         map = btrfs_find_chunk_map(fs_info, logical, length);
1572         if (!map)
1573                 return -EINVAL;
1574
1575         cache->physical_map = map;
1576
1577         zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
1578         if (!zone_info) {
1579                 ret = -ENOMEM;
1580                 goto out;
1581         }
1582
1583         active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
1584         if (!active) {
1585                 ret = -ENOMEM;
1586                 goto out;
1587         }
1588
1589         for (i = 0; i < map->num_stripes; i++) {
1590                 ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
1591                 if (ret)
1592                         goto out;
1593
1594                 if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
1595                         num_conventional++;
1596                 else
1597                         num_sequential++;
1598         }
1599
1600         if (num_sequential > 0)
1601                 set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1602
1603         if (num_conventional > 0) {
1604                 /* Zone capacity is always zone size in emulation */
1605                 cache->zone_capacity = cache->length;
1606                 ret = calculate_alloc_pointer(cache, &last_alloc, new);
1607                 if (ret) {
1608                         btrfs_err(fs_info,
1609                         "zoned: failed to determine allocation offset of bg %llu",
1610                                   cache->start);
1611                         goto out;
1612                 } else if (map->num_stripes == num_conventional) {
1613                         cache->alloc_offset = last_alloc;
1614                         set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1615                         goto out;
1616                 }
1617         }
1618
1619         profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
1620         switch (profile) {
1621         case 0: /* single */
1622                 ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
1623                 break;
1624         case BTRFS_BLOCK_GROUP_DUP:
1625                 ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
1626                 break;
1627         case BTRFS_BLOCK_GROUP_RAID1:
1628         case BTRFS_BLOCK_GROUP_RAID1C3:
1629         case BTRFS_BLOCK_GROUP_RAID1C4:
1630                 ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
1631                 break;
1632         case BTRFS_BLOCK_GROUP_RAID0:
1633                 ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
1634                 break;
1635         case BTRFS_BLOCK_GROUP_RAID10:
1636                 ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
1637                 break;
1638         case BTRFS_BLOCK_GROUP_RAID5:
1639         case BTRFS_BLOCK_GROUP_RAID6:
1640         default:
1641                 btrfs_err(fs_info, "zoned: profile %s not yet supported",
1642                           btrfs_bg_type_to_raid_name(map->type));
1643                 ret = -EINVAL;
1644                 goto out;
1645         }
1646
1647         if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
1648             profile != BTRFS_BLOCK_GROUP_RAID10) {
1649                 /*
1650                  * Detected a broken write pointer.  Make this block group
1651                  * unallocatable by setting the allocation pointer to the end of
1652                  * the allocatable region. Relocating this block group will fix
1653                  * the mismatch.
1654                  *
1655                  * Currently, we cannot handle the RAID0 or RAID10 case this way
1656                  * because we don't have a proper zone_capacity value. But
1657                  * reading from such a block group won't work anyway due to the
1658                  * missing stripe.
1659                  */
1660                 cache->alloc_offset = cache->zone_capacity;
1661                 ret = 0;
1662         }
1663
1664 out:
1665         /* Reject non-SINGLE data profiles without RST; fall through to the cleanup. */
1666         if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
1667             (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
1668             !fs_info->stripe_root) {
1669                 btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
1670                           btrfs_bg_type_to_raid_name(map->type));
1671                 ret = -EINVAL;
1672         }
1673
1674         if (cache->alloc_offset > cache->zone_capacity) {
1675                 btrfs_err(fs_info,
1676 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
1677                           cache->alloc_offset, cache->zone_capacity,
1678                           cache->start);
1679                 ret = -EIO;
1680         }
1681
1682         /* An extent is allocated after the write pointer */
1683         if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1684                 btrfs_err(fs_info,
1685                           "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1686                           logical, last_alloc, cache->alloc_offset);
1687                 ret = -EIO;
1688         }
1689
1690         if (!ret) {
1691                 cache->meta_write_pointer = cache->alloc_offset + cache->start;
1692                 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
1693                         btrfs_get_block_group(cache);
1694                         spin_lock(&fs_info->zone_active_bgs_lock);
1695                         list_add_tail(&cache->active_bg_list,
1696                                       &fs_info->zone_active_bgs);
1697                         spin_unlock(&fs_info->zone_active_bgs_lock);
1698                 }
1699         } else {
1700                 btrfs_free_chunk_map(cache->physical_map);
1701                 cache->physical_map = NULL;
1702         }
1703         bitmap_free(active);
1704         kfree(zone_info);
1705
1706         return ret;
1707 }
1708
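     /*
      * Initialize the free space accounting of a zoned block group: the space
      * between the write pointer and the zone capacity is free, while space
      * below the write pointer that is not used and space beyond the capacity
      * is unusable until the zone is reset.
      */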
1709 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1710 {
1711         u64 unusable, free;
1712
1713         if (!btrfs_is_zoned(cache->fs_info))
1714                 return;
1715
1716         WARN_ON(cache->bytes_super != 0);
1717         unusable = (cache->alloc_offset - cache->used) +
1718                    (cache->length - cache->zone_capacity);
1719         free = cache->zone_capacity - cache->alloc_offset;
1720
1721         /* We only need ->free_space in ALLOC_SEQ block groups */
1722         cache->cached = BTRFS_CACHE_FINISHED;
1723         cache->free_space_ctl->free_space = free;
1724         cache->zone_unusable = unusable;
1725 }
1726
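     /*
      * Decide whether a bio should be submitted with REQ_OP_ZONE_APPEND: only
      * data writes targeting a sequential zone qualify, and relocation writes
      * are excluded (see the comment below).
      */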
1727 bool btrfs_use_zone_append(struct btrfs_bio *bbio)
1728 {
1729         u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
1730         struct btrfs_inode *inode = bbio->inode;
1731         struct btrfs_fs_info *fs_info = bbio->fs_info;
1732         struct btrfs_block_group *cache;
1733         bool ret = false;
1734
1735         if (!btrfs_is_zoned(fs_info))
1736                 return false;
1737
1738         if (!inode || !is_data_inode(inode))
1739                 return false;
1740
1741         if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
1742                 return false;
1743
1744         /*
1745          * Using REQ_OP_ZONE_APPEND for relocation can break the assumptions the
1746          * relocation code makes about the extent layout.
1747          * Furthermore, we have set aside our own block group from which only the
1748          * relocation "process" can allocate, and we make sure only one process at
1749          * a time can add pages to an extent that gets relocated, so it is safe to
1750          * use a regular REQ_OP_WRITE for this special case.
1751          */
1752         if (btrfs_is_data_reloc_root(inode->root))
1753                 return false;
1754
1755         cache = btrfs_lookup_block_group(fs_info, start);
1756         ASSERT(cache);
1757         if (!cache)
1758                 return false;
1759
1760         ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1761         btrfs_put_block_group(cache);
1762
1763         return ret;
1764 }
1765
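     /*
      * Record where a ZONE_APPEND write actually landed by adjusting the
      * checksum entry's logical address by the difference between the actual
      * and the originally assigned physical address.
      */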
1766 void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
1767 {
1768         const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1769         struct btrfs_ordered_sum *sum = bbio->sums;
1770
1771         if (physical < bbio->orig_physical)
1772                 sum->logical -= bbio->orig_physical - physical;
1773         else
1774                 sum->logical += physical - bbio->orig_physical;
1775 }
1776
1777 static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
1778                                         u64 logical)
1779 {
1780         struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
1781         struct extent_map *em;
1782
1783         ordered->disk_bytenr = logical;
1784
1785         write_lock(&em_tree->lock);
1786         em = search_extent_mapping(em_tree, ordered->file_offset,
1787                                    ordered->num_bytes);
1788         /* The em should be a new COW extent, thus it should not have an offset. */
1789         ASSERT(em->offset == 0);
1790         em->disk_bytenr = logical;
1791         free_extent_map(em);
1792         write_unlock(&em_tree->lock);
1793 }
1794
1795 static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
1796                                       u64 logical, u64 len)
1797 {
1798         struct btrfs_ordered_extent *new;
1799
1800         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
1801             split_extent_map(ordered->inode, ordered->file_offset,
1802                              ordered->num_bytes, len, logical))
1803                 return false;
1804
1805         new = btrfs_split_ordered_extent(ordered, len);
1806         if (IS_ERR(new))
1807                 return false;
1808         new->disk_bytenr = logical;
1809         btrfs_finish_one_ordered(new);
1810         return true;
1811 }
1812
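     /*
      * Fix up an ordered extent after ZONE_APPEND completion: walk the checksum
      * entries to find physically contiguous ranges, split the ordered extent
      * wherever the completions landed non-contiguously, rewrite the logical
      * address to where the data actually ended up, and free the dummy sums of
      * nodatasum I/O.
      */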
1813 void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
1814 {
1815         struct btrfs_inode *inode = ordered->inode;
1816         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1817         struct btrfs_ordered_sum *sum;
1818         u64 logical, len;
1819
1820         /*
1821          * A write to a pre-allocated region is for data relocation, and so it
1822          * uses a regular WRITE operation. No split/rewrite is necessary.
1823          */
1824         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
1825                 return;
1826
1827         ASSERT(!list_empty(&ordered->list));
1828         /* ordered->list could only be empty in the pre-alloc case handled above. */
1829         sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
1830         logical = sum->logical;
1831         len = sum->len;
1832
1833         while (len < ordered->disk_num_bytes) {
1834                 sum = list_next_entry(sum, list);
1835                 if (sum->logical == logical + len) {
1836                         len += sum->len;
1837                         continue;
1838                 }
1839                 if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
1840                         set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
1841                         btrfs_err(fs_info, "failed to split ordered extent");
1842                         goto out;
1843                 }
1844                 logical = sum->logical;
1845                 len = sum->len;
1846         }
1847
1848         if (ordered->disk_bytenr != logical)
1849                 btrfs_rewrite_logical_zoned(ordered, logical);
1850
1851 out:
1852         /*
1853          * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1854          * were allocated by btrfs_alloc_dummy_sum only to record the logical
1855          * addresses and don't contain actual checksums.  We thus must free them
1856          * here so that we don't attempt to log the csums later.
1857          */
1858         if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1859             test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
1860                 while ((sum = list_first_entry_or_null(&ordered->list,
1861                                                        typeof(*sum), list))) {
1862                         list_del(&sum->list);
1863                         kfree(sum);
1864                 }
1865         }
1866 }
1867
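     /*
      * Make sure the block group a metadata extent buffer is written to is
      * active. For the tree-log block group we may finish another zone to free
      * up an active zone; otherwise the currently active metadata/system block
      * group is finished and replaced ("pivoted") by this one.
      */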
1868 static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
1869                                struct btrfs_block_group **active_bg)
1870 {
1871         const struct writeback_control *wbc = ctx->wbc;
1872         struct btrfs_block_group *block_group = ctx->zoned_bg;
1873         struct btrfs_fs_info *fs_info = block_group->fs_info;
1874
1875         if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
1876                 return true;
1877
1878         if (fs_info->treelog_bg == block_group->start) {
1879                 if (!btrfs_zone_activate(block_group)) {
1880                         int ret_fin = btrfs_zone_finish_one_bg(fs_info);
1881
1882                         if (ret_fin != 1 || !btrfs_zone_activate(block_group))
1883                                 return false;
1884                 }
1885         } else if (*active_bg != block_group) {
1886                 struct btrfs_block_group *tgt = *active_bg;
1887
1888                 /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
1889                 lockdep_assert_held(&fs_info->zoned_meta_io_lock);
1890
1891                 if (tgt) {
1892                         /*
1893                          * If there are unsent IOs left in the allocated area,
1894                          * we cannot wait for them, as that may cause a deadlock.
1895                          */
1896                         if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
1897                                 if (wbc->sync_mode == WB_SYNC_NONE ||
1898                                     (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
1899                                         return false;
1900                         }
1901
1902                         /* Pivot active metadata/system block group. */
1903                         btrfs_zoned_meta_io_unlock(fs_info);
1904                         wait_eb_writebacks(tgt);
1905                         do_zone_finish(tgt, true);
1906                         btrfs_zoned_meta_io_lock(fs_info);
1907                         if (*active_bg == tgt) {
1908                                 btrfs_put_block_group(tgt);
1909                                 *active_bg = NULL;
1910                         }
1911                 }
1912                 if (!btrfs_zone_activate(block_group))
1913                         return false;
1914                 if (*active_bg != block_group) {
1915                         ASSERT(*active_bg == NULL);
1916                         *active_bg = block_group;
1917                         btrfs_get_block_group(block_group);
1918                 }
1919         }
1920
1921         return true;
1922 }
1923
1924 /*
1925  * Check if @ctx->eb is aligned to the write pointer.
1926  *
1927  * Return:
1928  *   0:        @ctx->eb is at the write pointer. You can write it.
1929  *   -EAGAIN:  There is a hole. The caller should handle the case.
1930  *   -EBUSY:   There is a hole, but the caller can just bail out.
1931  */
1932 int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1933                                    struct btrfs_eb_write_context *ctx)
1934 {
1935         const struct writeback_control *wbc = ctx->wbc;
1936         const struct extent_buffer *eb = ctx->eb;
1937         struct btrfs_block_group *block_group = ctx->zoned_bg;
1938
1939         if (!btrfs_is_zoned(fs_info))
1940                 return 0;
1941
1942         if (block_group) {
1943                 if (block_group->start > eb->start ||
1944                     block_group->start + block_group->length <= eb->start) {
1945                         btrfs_put_block_group(block_group);
1946                         block_group = NULL;
1947                         ctx->zoned_bg = NULL;
1948                 }
1949         }
1950
1951         if (!block_group) {
1952                 block_group = btrfs_lookup_block_group(fs_info, eb->start);
1953                 if (!block_group)
1954                         return 0;
1955                 ctx->zoned_bg = block_group;
1956         }
1957
1958         if (block_group->meta_write_pointer == eb->start) {
1959                 struct btrfs_block_group **tgt;
1960
1961                 if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
1962                         return 0;
1963
1964                 if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
1965                         tgt = &fs_info->active_system_bg;
1966                 else
1967                         tgt = &fs_info->active_meta_bg;
1968                 if (check_bg_is_active(ctx, tgt))
1969                         return 0;
1970         }
1971
1972         /*
1973          * Since we may release fs_info->zoned_meta_io_lock, someone can already
1974          * start writing this eb. In that case, we can just bail out.
1975          */
1976         if (block_group->meta_write_pointer > eb->start)
1977                 return -EBUSY;
1978
1979         /* If for_sync, this hole will be filled with transaction commit. */
1980         if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
1981                 return -EAGAIN;
1982         return -EBUSY;
1983 }
1984
1985 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1986 {
1987         if (!btrfs_dev_is_sequential(device, physical))
1988                 return -EOPNOTSUPP;
1989
1990         return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1991                                     length >> SECTOR_SHIFT, GFP_NOFS, 0);
1992 }
1993
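     /*
      * Report the zone containing @logical by mapping it to its mirrors and
      * querying the first present device that can serve the request.
      */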
1994 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1995                           struct blk_zone *zone)
1996 {
1997         struct btrfs_io_context *bioc = NULL;
1998         u64 mapped_length = PAGE_SIZE;
1999         unsigned int nofs_flag;
2000         int nmirrors;
2001         int i, ret;
2002
2003         ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2004                               &mapped_length, &bioc, NULL, NULL);
2005         if (ret || !bioc || mapped_length < PAGE_SIZE) {
2006                 ret = -EIO;
2007                 goto out_put_bioc;
2008         }
2009
2010         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
2011                 ret = -EINVAL;
2012                 goto out_put_bioc;
2013         }
2014
2015         nofs_flag = memalloc_nofs_save();
2016         nmirrors = (int)bioc->num_stripes;
2017         for (i = 0; i < nmirrors; i++) {
2018                 u64 physical = bioc->stripes[i].physical;
2019                 struct btrfs_device *dev = bioc->stripes[i].dev;
2020
2021                 /* Missing device */
2022                 if (!dev->bdev)
2023                         continue;
2024
2025                 ret = btrfs_get_dev_zone(dev, physical, zone);
2026                 /* Failing device */
2027                 if (ret == -EIO || ret == -EOPNOTSUPP)
2028                         continue;
2029                 break;
2030         }
2031         memalloc_nofs_restore(nofs_flag);
2032 out_put_bioc:
2033         btrfs_put_bioc(bioc);
2034         return ret;
2035 }
2036
2037 /*
2038  * Synchronize the write pointer of the zone at @physical_start on @tgt_dev by
2039  * filling zeros from @physical_pos up to the write pointer of the dev-replace
2040  * source device.
2041  */
2042 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
2043                                     u64 physical_start, u64 physical_pos)
2044 {
2045         struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
2046         struct blk_zone zone;
2047         u64 length;
2048         u64 wp;
2049         int ret;
2050
2051         if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
2052                 return 0;
2053
2054         ret = read_zone_info(fs_info, logical, &zone);
2055         if (ret)
2056                 return ret;
2057
2058         wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
2059
2060         if (physical_pos == wp)
2061                 return 0;
2062
2063         if (physical_pos > wp)
2064                 return -EUCLEAN;
2065
2066         length = wp - physical_pos;
2067         return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
2068 }
2069
2070 /*
2071  * Activate block group and underlying device zones
2072  *
2073  * @block_group: the block group to activate
2074  *
2075  * Return: true on success, false otherwise
2076  */
2077 bool btrfs_zone_activate(struct btrfs_block_group *block_group)
2078 {
2079         struct btrfs_fs_info *fs_info = block_group->fs_info;
2080         struct btrfs_chunk_map *map;
2081         struct btrfs_device *device;
2082         u64 physical;
2083         const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
2084         bool ret;
2085         int i;
2086
2087         if (!btrfs_is_zoned(block_group->fs_info))
2088                 return true;
2089
2090         map = block_group->physical_map;
2091
2092         spin_lock(&fs_info->zone_active_bgs_lock);
2093         spin_lock(&block_group->lock);
2094         if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
2095                 ret = true;
2096                 goto out_unlock;
2097         }
2098
2099         /* No space left */
2100         if (btrfs_zoned_bg_is_full(block_group)) {
2101                 ret = false;
2102                 goto out_unlock;
2103         }
2104
2105         for (i = 0; i < map->num_stripes; i++) {
2106                 struct btrfs_zoned_device_info *zinfo;
2107                 int reserved = 0;
2108
2109                 device = map->stripes[i].dev;
2110                 physical = map->stripes[i].physical;
2111                 zinfo = device->zone_info;
2112
2113                 if (zinfo->max_active_zones == 0)
2114                         continue;
2115
2116                 /*
2117                  * For a data block group, leave active zones reserved for one
2118                  * metadata block group and one system block group.
2119                  */
2120                 if (is_data)
2121                         reserved = zinfo->reserved_active_zones;
2122                 if (atomic_read(&zinfo->active_zones_left) <= reserved) {
2123                         ret = false;
2124                         goto out_unlock;
2125                 }
2126
2127                 if (!btrfs_dev_set_active_zone(device, physical)) {
2128                         /* Cannot activate the zone */
2129                         ret = false;
2130                         goto out_unlock;
2131                 }
2132                 if (!is_data)
2133                         zinfo->reserved_active_zones--;
2134         }
2135
2136         /* Successfully activated all the zones */
2137         set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2138         spin_unlock(&block_group->lock);
2139
2140         /* For the active block group list */
2141         btrfs_get_block_group(block_group);
2142         list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
2143         spin_unlock(&fs_info->zone_active_bgs_lock);
2144
2145         return true;
2146
2147 out_unlock:
2148         spin_unlock(&block_group->lock);
2149         spin_unlock(&fs_info->zone_active_bgs_lock);
2150         return ret;
2151 }
2152
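     /*
      * Wait for the writeback of every extent buffer inside the block group,
      * walking the buffer radix tree under RCU.
      */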
2153 static void wait_eb_writebacks(struct btrfs_block_group *block_group)
2154 {
2155         struct btrfs_fs_info *fs_info = block_group->fs_info;
2156         const u64 end = block_group->start + block_group->length;
2157         struct radix_tree_iter iter;
2158         struct extent_buffer *eb;
2159         void __rcu **slot;
2160
2161         rcu_read_lock();
2162         radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
2163                                  block_group->start >> fs_info->sectorsize_bits) {
2164                 eb = radix_tree_deref_slot(slot);
2165                 if (!eb)
2166                         continue;
2167                 if (radix_tree_deref_retry(eb)) {
2168                         slot = radix_tree_iter_retry(&iter);
2169                         continue;
2170                 }
2171
2172                 if (eb->start < block_group->start)
2173                         continue;
2174                 if (eb->start >= end)
2175                         break;
2176
2177                 slot = radix_tree_iter_resume(slot, &iter);
2178                 rcu_read_unlock();
2179                 wait_on_extent_buffer_writeback(eb);
2180                 rcu_read_lock();
2181         }
2182         rcu_read_unlock();
2183 }
2184
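     /*
      * Finish the zones of a block group. Returns -EAGAIN if there is still
      * unwritten allocated space. Unless the block group is already fully
      * written, it is temporarily marked read-only and outstanding
      * reservations, ordered extents and extent buffer writebacks are drained
      * first. Then REQ_OP_ZONE_FINISH is issued on every stripe and the active
      * zone accounting is released.
      */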
2185 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
2186 {
2187         struct btrfs_fs_info *fs_info = block_group->fs_info;
2188         struct btrfs_chunk_map *map;
2189         const bool is_metadata = (block_group->flags &
2190                         (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
2191         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2192         int ret = 0;
2193         int i;
2194
2195         spin_lock(&block_group->lock);
2196         if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
2197                 spin_unlock(&block_group->lock);
2198                 return 0;
2199         }
2200
2201         /* Check if we have unwritten allocated space */
2202         if (is_metadata &&
2203             block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
2204                 spin_unlock(&block_group->lock);
2205                 return -EAGAIN;
2206         }
2207
2208         /*
2209          * If we are sure that the block group is full (= no more room left for
2210          * new allocations) and the IO for the last usable block has completed,
2211          * we don't need to wait for the other IOs. This holds because we ensure
2212          * sequential IO submission using the ZONE_APPEND command for data and
2213          * block_group->meta_write_pointer for metadata.
2214          */
2215         if (!fully_written) {
2216                 if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
2217                         spin_unlock(&block_group->lock);
2218                         return -EAGAIN;
2219                 }
2220                 spin_unlock(&block_group->lock);
2221
2222                 ret = btrfs_inc_block_group_ro(block_group, false);
2223                 if (ret)
2224                         return ret;
2225
2226                 /* Ensure all writes in this block group finish */
2227                 btrfs_wait_block_group_reservations(block_group);
2228                 /* No need to wait for NOCOW writers. Zoned mode does not allow that */
2229                 btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
2230                 /* Wait for extent buffers to be written. */
2231                 if (is_metadata)
2232                         wait_eb_writebacks(block_group);
2233
2234                 spin_lock(&block_group->lock);
2235
2236                 /*
2237                  * Bail out if someone already deactivated the block group, or
2238                  * allocated space is left in the block group.
2239                  */
2240                 if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2241                               &block_group->runtime_flags)) {
2242                         spin_unlock(&block_group->lock);
2243                         btrfs_dec_block_group_ro(block_group);
2244                         return 0;
2245                 }
2246
2247                 if (block_group->reserved ||
2248                     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2249                              &block_group->runtime_flags)) {
2250                         spin_unlock(&block_group->lock);
2251                         btrfs_dec_block_group_ro(block_group);
2252                         return -EAGAIN;
2253                 }
2254         }
2255
2256         clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2257         block_group->alloc_offset = block_group->zone_capacity;
2258         if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
2259                 block_group->meta_write_pointer = block_group->start +
2260                                                   block_group->zone_capacity;
2261         block_group->free_space_ctl->free_space = 0;
2262         btrfs_clear_treelog_bg(block_group);
2263         btrfs_clear_data_reloc_bg(block_group);
2264         spin_unlock(&block_group->lock);
2265
2266         down_read(&dev_replace->rwsem);
2267         map = block_group->physical_map;
2268         for (i = 0; i < map->num_stripes; i++) {
2269                 struct btrfs_device *device = map->stripes[i].dev;
2270                 const u64 physical = map->stripes[i].physical;
2271                 struct btrfs_zoned_device_info *zinfo = device->zone_info;
2272                 unsigned int nofs_flags;
2273
2274                 if (zinfo->max_active_zones == 0)
2275                         continue;
2276
2277                 nofs_flags = memalloc_nofs_save();
2278                 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
2279                                        physical >> SECTOR_SHIFT,
2280                                        zinfo->zone_size >> SECTOR_SHIFT);
2281                 memalloc_nofs_restore(nofs_flags);
2282
2283                 if (ret) {
2284                         up_read(&dev_replace->rwsem);
2285                         return ret;
2286                 }
2287
2288                 if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
2289                         zinfo->reserved_active_zones++;
2290                 btrfs_dev_clear_active_zone(device, physical);
2291         }
2292         up_read(&dev_replace->rwsem);
2293
2294         if (!fully_written)
2295                 btrfs_dec_block_group_ro(block_group);
2296
2297         spin_lock(&fs_info->zone_active_bgs_lock);
2298         ASSERT(!list_empty(&block_group->active_bg_list));
2299         list_del_init(&block_group->active_bg_list);
2300         spin_unlock(&fs_info->zone_active_bgs_lock);
2301
2302         /* For active_bg_list */
2303         btrfs_put_block_group(block_group);
2304
2305         clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2306
2307         return 0;
2308 }
2309
2310 int btrfs_zone_finish(struct btrfs_block_group *block_group)
2311 {
2312         if (!btrfs_is_zoned(block_group->fs_info))
2313                 return 0;
2314
2315         return do_zone_finish(block_group, false);
2316 }
2317
2318 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
2319 {
2320         struct btrfs_fs_info *fs_info = fs_devices->fs_info;
2321         struct btrfs_device *device;
2322         bool ret = false;
2323
2324         if (!btrfs_is_zoned(fs_info))
2325                 return true;
2326
2327         /* Check if there is a device with active zones left */
2328         mutex_lock(&fs_info->chunk_mutex);
2329         spin_lock(&fs_info->zone_active_bgs_lock);
2330         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
2331                 struct btrfs_zoned_device_info *zinfo = device->zone_info;
2332                 int reserved = 0;
2333
2334                 if (!device->bdev)
2335                         continue;
2336
2337                 if (!zinfo->max_active_zones) {
2338                         ret = true;
2339                         break;
2340                 }
2341
2342                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2343                         reserved = zinfo->reserved_active_zones;
2344
2345                 switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2346                 case 0: /* single */
2347                         ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
2348                         break;
2349                 case BTRFS_BLOCK_GROUP_DUP:
2350                         ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
2351                         break;
2352                 }
2353                 if (ret)
2354                         break;
2355         }
2356         spin_unlock(&fs_info->zone_active_bgs_lock);
2357         mutex_unlock(&fs_info->chunk_mutex);
2358
2359         if (!ret)
2360                 set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2361
2362         return ret;
2363 }
2364
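     /*
      * Called on write completion: if the space left after @logical + @length
      * cannot fit even one more minimal allocation, finish the block group's
      * zones right away.
      */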
2365 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
2366 {
2367         struct btrfs_block_group *block_group;
2368         u64 min_alloc_bytes;
2369
2370         if (!btrfs_is_zoned(fs_info))
2371                 return;
2372
2373         block_group = btrfs_lookup_block_group(fs_info, logical);
2374         ASSERT(block_group);
2375
2376         /* No MIXED_BG on zoned btrfs. */
2377         if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
2378                 min_alloc_bytes = fs_info->sectorsize;
2379         else
2380                 min_alloc_bytes = fs_info->nodesize;
2381
2382         /* Bail out if we can allocate more data from this block group. */
2383         if (logical + length + min_alloc_bytes <=
2384             block_group->start + block_group->zone_capacity)
2385                 goto out;
2386
2387         do_zone_finish(block_group, true);
2388
2389 out:
2390         btrfs_put_block_group(block_group);
2391 }
2392
2393 static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
2394 {
2395         struct btrfs_block_group *bg =
2396                 container_of(work, struct btrfs_block_group, zone_finish_work);
2397
2398         wait_on_extent_buffer_writeback(bg->last_eb);
2399         free_extent_buffer(bg->last_eb);
2400         btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
2401         btrfs_put_block_group(bg);
2402 }
2403
2404 void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
2405                                    struct extent_buffer *eb)
2406 {
2407         if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
2408             eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
2409                 return;
2410
2411         if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
2412                 btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
2413                           bg->start);
2414                 return;
2415         }
2416
2417         /* For the work */
2418         btrfs_get_block_group(bg);
2419         atomic_inc(&eb->refs);
2420         bg->last_eb = eb;
2421         INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
2422         queue_work(system_unbound_wq, &bg->zone_finish_work);
2423 }
2424
2425 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
2426 {
2427         struct btrfs_fs_info *fs_info = bg->fs_info;
2428
2429         spin_lock(&fs_info->relocation_bg_lock);
2430         if (fs_info->data_reloc_bg == bg->start)
2431                 fs_info->data_reloc_bg = 0;
2432         spin_unlock(&fs_info->relocation_bg_lock);
2433 }
2434
2435 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
2436 {
2437         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2438         struct btrfs_device *device;
2439
2440         if (!btrfs_is_zoned(fs_info))
2441                 return;
2442
2443         mutex_lock(&fs_devices->device_list_mutex);
2444         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2445                 if (device->zone_info) {
2446                         vfree(device->zone_info->zone_cache);
2447                         device->zone_info->zone_cache = NULL;
2448                 }
2449         }
2450         mutex_unlock(&fs_devices->device_list_mutex);
2451 }
2452
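     /*
      * Reclaim is suggested once the used bytes across all devices reach
      * bg_reclaim_threshold percent of the total capacity.
      */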
2453 bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
2454 {
2455         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2456         struct btrfs_device *device;
2457         u64 used = 0;
2458         u64 total = 0;
2459         u64 factor;
2460
2461         ASSERT(btrfs_is_zoned(fs_info));
2462
2463         if (fs_info->bg_reclaim_threshold == 0)
2464                 return false;
2465
2466         mutex_lock(&fs_devices->device_list_mutex);
2467         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2468                 if (!device->bdev)
2469                         continue;
2470
2471                 total += device->disk_total_bytes;
2472                 used += device->bytes_used;
2473         }
2474         mutex_unlock(&fs_devices->device_list_mutex);
2475
2476         factor = div64_u64(used * 100, total);
2477         return factor >= fs_info->bg_reclaim_threshold;
2478 }
2479
2480 void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2481                                        u64 length)
2482 {
2483         struct btrfs_block_group *block_group;
2484
2485         if (!btrfs_is_zoned(fs_info))
2486                 return;
2487
2488         block_group = btrfs_lookup_block_group(fs_info, logical);
2489         /* It should be called on a previous data relocation block group. */
2490         ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2491
2492         spin_lock(&block_group->lock);
2493         if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
2494                 goto out;
2495
2496         /* All relocation extents are written. */
2497         if (block_group->start + block_group->alloc_offset == logical + length) {
2498                 /*
2499                  * Now, release this block group for further allocations and
2500                  * zone finish.
2501                  */
2502                 clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2503                           &block_group->runtime_flags);
2504         }
2505
2506 out:
2507         spin_unlock(&block_group->lock);
2508         btrfs_put_block_group(block_group);
2509 }
2510
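     /*
      * Finish the active block group with the least room left, skipping block
      * groups that are reserved, empty, system, or used for data relocation.
      * Returns 1 if a block group was finished, 0 if none was eligible, or a
      * negative errno on failure.
      */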
2511 int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
2512 {
2513         struct btrfs_block_group *block_group;
2514         struct btrfs_block_group *min_bg = NULL;
2515         u64 min_avail = U64_MAX;
2516         int ret;
2517
2518         spin_lock(&fs_info->zone_active_bgs_lock);
2519         list_for_each_entry(block_group, &fs_info->zone_active_bgs,
2520                             active_bg_list) {
2521                 u64 avail;
2522
2523                 spin_lock(&block_group->lock);
2524                 if (block_group->reserved || block_group->alloc_offset == 0 ||
2525                     (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
2526                     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
2527                         spin_unlock(&block_group->lock);
2528                         continue;
2529                 }
2530
2531                 avail = block_group->zone_capacity - block_group->alloc_offset;
2532                 if (min_avail > avail) {
2533                         if (min_bg)
2534                                 btrfs_put_block_group(min_bg);
2535                         min_bg = block_group;
2536                         min_avail = avail;
2537                         btrfs_get_block_group(min_bg);
2538                 }
2539                 spin_unlock(&block_group->lock);
2540         }
2541         spin_unlock(&fs_info->zone_active_bgs_lock);
2542
2543         if (!min_bg)
2544                 return 0;
2545
2546         ret = btrfs_zone_finish(min_bg);
2547         btrfs_put_block_group(min_bg);
2548
2549         return ret < 0 ? ret : 1;
2550 }
2551
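     /*
      * Try to activate one metadata/system block group in @space_info. If
      * @do_finish is set, other zones are finished to make room for the
      * activation. Returns 1 when a block group was activated, 0 otherwise,
      * or a negative errno on failure.
      */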
2552 int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
2553                                 struct btrfs_space_info *space_info,
2554                                 bool do_finish)
2555 {
2556         struct btrfs_block_group *bg;
2557         int index;
2558
2559         if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
2560                 return 0;
2561
2562         for (;;) {
2563                 int ret;
2564                 bool need_finish = false;
2565
2566                 down_read(&space_info->groups_sem);
2567                 for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
2568                         list_for_each_entry(bg, &space_info->block_groups[index],
2569                                             list) {
2570                                 if (!spin_trylock(&bg->lock))
2571                                         continue;
2572                                 if (btrfs_zoned_bg_is_full(bg) ||
2573                                     test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2574                                              &bg->runtime_flags)) {
2575                                         spin_unlock(&bg->lock);
2576                                         continue;
2577                                 }
2578                                 spin_unlock(&bg->lock);
2579
2580                                 if (btrfs_zone_activate(bg)) {
2581                                         up_read(&space_info->groups_sem);
2582                                         return 1;
2583                                 }
2584
2585                                 need_finish = true;
2586                         }
2587                 }
2588                 up_read(&space_info->groups_sem);
2589
2590                 if (!do_finish || !need_finish)
2591                         break;
2592
2593                 ret = btrfs_zone_finish_one_bg(fs_info);
2594                 if (ret == 0)
2595                         break;
2596                 if (ret < 0)
2597                         return ret;
2598         }
2599
2600         return 0;
2601 }
2602
2603 /*
2604  * Reserve zones for one metadata block group, one tree-log block group, and one
2605  * system block group.
2606  */
2607 void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
2608 {
2609         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2610         struct btrfs_block_group *block_group;
2611         struct btrfs_device *device;
2612         /* Reserve zones for normal SINGLE metadata and tree-log block group. */
2613         unsigned int metadata_reserve = 2;
2614         /* Reserve a zone for SINGLE system block group. */
2615         unsigned int system_reserve = 1;
2616
2617         if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
2618                 return;
2619
2620         /*
2621          * This function is called from the mount context. So, there is no
2622          * parallel process touching the bits. No need for read_seqretry().
2623          */
2624         if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
2625                 metadata_reserve = 4;
2626         if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
2627                 system_reserve = 2;
2628
2629         /* Apply the reservation on all the devices. */
2630         mutex_lock(&fs_devices->device_list_mutex);
2631         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2632                 if (!device->bdev)
2633                         continue;
2634
2635                 device->zone_info->reserved_active_zones =
2636                         metadata_reserve + system_reserve;
2637         }
2638         mutex_unlock(&fs_devices->device_list_mutex);
2639
2640         /* Release reservation for currently active block groups. */
2641         spin_lock(&fs_info->zone_active_bgs_lock);
2642         list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
2643                 struct btrfs_chunk_map *map = block_group->physical_map;
2644
2645                 if (!(block_group->flags &
2646                       (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
2647                         continue;
2648
2649                 for (int i = 0; i < map->num_stripes; i++)
2650                         map->stripes[i].dev->zone_info->reserved_active_zones--;
2651         }
2652         spin_unlock(&fs_info->zone_active_bgs_lock);
2653 }