fs/btrfs/zoned.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/bitops.h>
   4 #include <linux/slab.h>
   5 #include <linux/blkdev.h>
   6 #include <linux/sched/mm.h>
   7 #include "ctree.h"
   8 #include "volumes.h"
   9 #include "zoned.h"
  10 #include "rcu-string.h"
  11 #include "disk-io.h"
  12 #include "block-group.h"
  13 #include "transaction.h"
  14 #include "dev-replace.h"
  15 #include "space-info.h"
  16
  17 /* Maximum number of zones to report per blkdev_report_zones() call */
  18 #define BTRFS_REPORT_NR_ZONES   4096
  19 /* Invalid allocation pointer value for missing devices */
  20 #define WP_MISSING_DEV ((u64)-1)
  21 /* Pseudo write pointer value for conventional zone */
  22 #define WP_CONVENTIONAL ((u64)-2)
  23
  24 /*
  25  * Location of the first zone of superblock logging zone pairs.
  26  *
  27  * - primary superblock:    0B (zone 0)
  28  * - first copy:          512G (zone starting at that offset)
  29  * - second copy:           4T (zone starting at that offset)
  30  */
  31 #define BTRFS_SB_LOG_PRIMARY_OFFSET     (0ULL)
  32 #define BTRFS_SB_LOG_FIRST_OFFSET       (512ULL * SZ_1G)
  33 #define BTRFS_SB_LOG_SECOND_OFFSET      (4096ULL * SZ_1G)
  34
  35 #define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
  36 #define BTRFS_SB_LOG_SECOND_SHIFT       const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
  37
  38 /* Number of superblock log zones */
  39 #define BTRFS_NR_SB_LOG_ZONES 2
  40
  41 /*
  42  * Maximum supported zone size. Currently, SMR disks have a zone size of
  43  * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
  44  * expect the zone size to become larger than 8GiB in the near future.
  45  */
  46 #define BTRFS_MAX_ZONE_SIZE             SZ_8G
  47
  48 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
  49 {
  50         struct blk_zone *zones = data;
  51
  52         memcpy(&zones[idx], zone, sizeof(*zone));
  53
  54         return 0;
  55 }
  56
  57 static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
  58                             u64 *wp_ret)
  59 {
  60         bool empty[BTRFS_NR_SB_LOG_ZONES];
  61         bool full[BTRFS_NR_SB_LOG_ZONES];
  62         sector_t sector;
  63
  64         ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
  65                zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
  66
  67         empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
  68         empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
  69         full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
  70         full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
  71
  72         /*
  73          * Possible states of log buffer zones
  74          *
  75          *           Empty[0]  In use[0]  Full[0]
  76          * Empty[1]         *          x        0
  77          * In use[1]        0          x        0
  78          * Full[1]          1          1        C
  79          *
  80          * Log position:
  81          *   *: Special case, no superblock is written
  82          *   0: Use write pointer of zones[0]
  83          *   1: Use write pointer of zones[1]
  84          *   C: Compare super blcoks from zones[0] and zones[1], use the latest
  85          *      one determined by generation
  86          *   x: Invalid state
  87          */
  88
  89         if (empty[0] && empty[1]) {
  90                 /* Special case to distinguish no superblock to read */
  91                 *wp_ret = zones[0].start << SECTOR_SHIFT;
  92                 return -ENOENT;
  93         } else if (full[0] && full[1]) {
  94                 /* Compare two super blocks */
  95                 struct address_space *mapping = bdev->bd_inode->i_mapping;
  96                 struct page *page[BTRFS_NR_SB_LOG_ZONES];
  97                 struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
  98                 int i;
  99
 100                 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
 101                         u64 bytenr;
 102
 103                         bytenr = ((zones[i].start + zones[i].len)
 104                                    << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
 105
 106                         page[i] = read_cache_page_gfp(mapping,
 107                                         bytenr >> PAGE_SHIFT, GFP_NOFS);
 108                         if (IS_ERR(page[i])) {
 109                                 if (i == 1)
 110                                         btrfs_release_disk_super(super[0]);
 111                                 return PTR_ERR(page[i]);
 112                         }
 113                         super[i] = page_address(page[i]);
 114                 }
 115
 116                 if (super[0]->generation > super[1]->generation)
 117                         sector = zones[1].start;
 118                 else
 119                         sector = zones[0].start;
 120
 121                 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
 122                         btrfs_release_disk_super(super[i]);
 123         } else if (!full[0] && (empty[1] || full[1])) {
 124                 sector = zones[0].wp;
 125         } else if (full[0]) {
 126                 sector = zones[1].wp;
 127         } else {
 128                 return -EUCLEAN;
 129         }
 130         *wp_ret = sector << SECTOR_SHIFT;
 131         return 0;
 132 }
 133
 134 /*
 135  * Get the first zone number of the superblock mirror
 136  */
 137 static inline u32 sb_zone_number(int shift, int mirror)
 138 {
 139         u64 zone;
 140
 141         ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
 142         switch (mirror) {
 143         case 0: zone = 0; break;
 144         case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
 145         case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
 146         }
 147
 148         ASSERT(zone <= U32_MAX);
 149
 150         return (u32)zone;
 151 }
 152
 153 static inline sector_t zone_start_sector(u32 zone_number,
 154                                          struct block_device *bdev)
 155 {
 156         return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
 157 }
 158
 159 static inline u64 zone_start_physical(u32 zone_number,
 160                                       struct btrfs_zoned_device_info *zone_info)
 161 {
 162         return (u64)zone_number << zone_info->zone_size_shift;
 163 }
 164
 165 /*
 166  * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 167  * device into static sized chunks and fake a conventional zone on each of
 168  * them.
 169  */
 170 static int emulate_report_zones(struct btrfs_device *device, u64 pos,
 171                                 struct blk_zone *zones, unsigned int nr_zones)
 172 {
 173         const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
 174         sector_t bdev_size = bdev_nr_sectors(device->bdev);
 175         unsigned int i;
 176
 177         pos >>= SECTOR_SHIFT;
 178         for (i = 0; i < nr_zones; i++) {
 179                 zones[i].start = i * zone_sectors + pos;
 180                 zones[i].len = zone_sectors;
 181                 zones[i].capacity = zone_sectors;
 182                 zones[i].wp = zones[i].start + zone_sectors;
 183                 zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
 184                 zones[i].cond = BLK_ZONE_COND_NOT_WP;
 185
 186                 if (zones[i].wp >= bdev_size) {
 187                         i++;
 188                         break;
 189                 }
 190         }
 191
 192         return i;
 193 }
 194
 195 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 196                                struct blk_zone *zones, unsigned int *nr_zones)
 197 {
 198         int ret;
 199
 200         if (!*nr_zones)
 201                 return 0;
 202
 203         if (!bdev_is_zoned(device->bdev)) {
 204                 ret = emulate_report_zones(device, pos, zones, *nr_zones);
 205                 *nr_zones = ret;
 206                 return 0;
 207         }
 208
 209         ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 210                                   copy_zone_info_cb, zones);
 211         if (ret < 0) {
 212                 btrfs_err_in_rcu(device->fs_info,
 213                                  "zoned: failed to read zone %llu on %s (devid %llu)",
 214                                  pos, rcu_str_deref(device->name),
 215                                  device->devid);
 216                 return ret;
 217         }
 218         *nr_zones = ret;
 219         if (!ret)
 220                 return -EIO;
 221
 222         return 0;
 223 }
 224
 225 /* The emulated zone size is determined from the size of device extent */
 226 static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
 227 {
 228         struct btrfs_path *path;
 229         struct btrfs_root *root = fs_info->dev_root;
 230         struct btrfs_key key;
 231         struct extent_buffer *leaf;
 232         struct btrfs_dev_extent *dext;
 233         int ret = 0;
 234
 235         key.objectid = 1;
 236         key.type = BTRFS_DEV_EXTENT_KEY;
 237         key.offset = 0;
 238
 239         path = btrfs_alloc_path();
 240         if (!path)
 241                 return -ENOMEM;
 242
 243         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 244         if (ret < 0)
 245                 goto out;
 246
 247         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 248                 ret = btrfs_next_item(root, path);
 249                 if (ret < 0)
 250                         goto out;
 251                 /* No dev extents at all? Not good */
 252                 if (ret > 0) {
 253                         ret = -EUCLEAN;
 254                         goto out;
 255                 }
 256         }
 257
 258         leaf = path->nodes[0];
 259         dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
 260         fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
 261         ret = 0;
 262
 263 out:
 264         btrfs_free_path(path);
 265
 266         return ret;
 267 }
 268
 269 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 270 {
 271         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 272         struct btrfs_device *device;
 273         int ret = 0;
 274
 275         /* fs_info->zone_size might not set yet. Use the incomapt flag here. */
 276         if (!btrfs_fs_incompat(fs_info, ZONED))
 277                 return 0;
 278
 279         mutex_lock(&fs_devices->device_list_mutex);
 280         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 281                 /* We can skip reading of zone info for missing devices */
 282                 if (!device->bdev)
 283                         continue;
 284
 285                 ret = btrfs_get_dev_zone_info(device);
 286                 if (ret)
 287                         break;
 288         }
 289         mutex_unlock(&fs_devices->device_list_mutex);
 290
 291         return ret;
 292 }
 293
 294 int btrfs_get_dev_zone_info(struct btrfs_device *device)
 295 {
 296         struct btrfs_fs_info *fs_info = device->fs_info;
 297         struct btrfs_zoned_device_info *zone_info = NULL;
 298         struct block_device *bdev = device->bdev;
 299         struct request_queue *queue = bdev_get_queue(bdev);
 300         sector_t nr_sectors;
 301         sector_t sector = 0;
 302         struct blk_zone *zones = NULL;
 303         unsigned int i, nreported = 0, nr_zones;
 304         sector_t zone_sectors;
 305         char *model, *emulated;
 306         int ret;
 307
 308         /*
 309          * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
 310          * yet be set.
 311          */
 312         if (!btrfs_fs_incompat(fs_info, ZONED))
 313                 return 0;
 314
 315         if (device->zone_info)
 316                 return 0;
 317
 318         zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
 319         if (!zone_info)
 320                 return -ENOMEM;
 321
 322         if (!bdev_is_zoned(bdev)) {
 323                 if (!fs_info->zone_size) {
 324                         ret = calculate_emulated_zone_size(fs_info);
 325                         if (ret)
 326                                 goto out;
 327                 }
 328
 329                 ASSERT(fs_info->zone_size);
 330                 zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
 331         } else {
 332                 zone_sectors = bdev_zone_sectors(bdev);
 333         }
 334
 335         /* Check if it's power of 2 (see is_power_of_2) */
 336         ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
 337         zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
 338
 339         /* We reject devices with a zone size larger than 8GB */
 340         if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
 341                 btrfs_err_in_rcu(fs_info,
 342                 "zoned: %s: zone size %llu larger than supported maximum %llu",
 343                                  rcu_str_deref(device->name),
 344                                  zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
 345                 ret = -EINVAL;
 346                 goto out;
 347         }
 348
 349         nr_sectors = bdev_nr_sectors(bdev);
 350         zone_info->zone_size_shift = ilog2(zone_info->zone_size);
 351         zone_info->max_zone_append_size =
 352                 (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
 353         zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
 354         if (!IS_ALIGNED(nr_sectors, zone_sectors))
 355                 zone_info->nr_zones++;
 356
 357         if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
 358                 btrfs_err(fs_info, "zoned: device %pg does not support zone append",
 359                           bdev);
 360                 ret = -EINVAL;
 361                 goto out;
 362         }
 363
 364         zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 365         if (!zone_info->seq_zones) {
 366                 ret = -ENOMEM;
 367                 goto out;
 368         }
 369
 370         zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 371         if (!zone_info->empty_zones) {
 372                 ret = -ENOMEM;
 373                 goto out;
 374         }
 375
 376         zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
 377         if (!zones) {
 378                 ret = -ENOMEM;
 379                 goto out;
 380         }
 381
 382         /* Get zones type */
 383         while (sector < nr_sectors) {
 384                 nr_zones = BTRFS_REPORT_NR_ZONES;
 385                 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
 386                                           &nr_zones);
 387                 if (ret)
 388                         goto out;
 389
 390                 for (i = 0; i < nr_zones; i++) {
 391                         if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 392                                 __set_bit(nreported, zone_info->seq_zones);
 393                         if (zones[i].cond == BLK_ZONE_COND_EMPTY)
 394                                 __set_bit(nreported, zone_info->empty_zones);
 395                         nreported++;
 396                 }
 397                 sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
 398         }
 399
 400         if (nreported != zone_info->nr_zones) {
 401                 btrfs_err_in_rcu(device->fs_info,
 402                                  "inconsistent number of zones on %s (%u/%u)",
 403                                  rcu_str_deref(device->name), nreported,
 404                                  zone_info->nr_zones);
 405                 ret = -EIO;
 406                 goto out;
 407         }
 408
 409         /* Validate superblock log */
 410         nr_zones = BTRFS_NR_SB_LOG_ZONES;
 411         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 412                 u32 sb_zone;
 413                 u64 sb_wp;
 414                 int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
 415
 416                 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
 417                 if (sb_zone + 1 >= zone_info->nr_zones)
 418                         continue;
 419
 420                 ret = btrfs_get_dev_zones(device,
 421                                           zone_start_physical(sb_zone, zone_info),
 422                                           &zone_info->sb_zones[sb_pos],
 423                                           &nr_zones);
 424                 if (ret)
 425                         goto out;
 426
 427                 if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
 428                         btrfs_err_in_rcu(device->fs_info,
 429         "zoned: failed to read super block log zone info at devid %llu zone %u",
 430                                          device->devid, sb_zone);
 431                         ret = -EUCLEAN;
 432                         goto out;
 433                 }
 434
 435                 /*
 436                  * If zones[0] is conventional, always use the beggining of the
 437                  * zone to record superblock. No need to validate in that case.
 438                  */
 439                 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
 440                     BLK_ZONE_TYPE_CONVENTIONAL)
 441                         continue;
 442
 443                 ret = sb_write_pointer(device->bdev,
 444                                        &zone_info->sb_zones[sb_pos], &sb_wp);
 445                 if (ret != -ENOENT && ret) {
 446                         btrfs_err_in_rcu(device->fs_info,
 447                         "zoned: super block log zone corrupted devid %llu zone %u",
 448                                          device->devid, sb_zone);
 449                         ret = -EUCLEAN;
 450                         goto out;
 451                 }
 452         }
 453
 454
 455         kfree(zones);
 456
 457         device->zone_info = zone_info;
 458
 459         switch (bdev_zoned_model(bdev)) {
 460         case BLK_ZONED_HM:
 461                 model = "host-managed zoned";
 462                 emulated = "";
 463                 break;
 464         case BLK_ZONED_HA:
 465                 model = "host-aware zoned";
 466                 emulated = "";
 467                 break;
 468         case BLK_ZONED_NONE:
 469                 model = "regular";
 470                 emulated = "emulated ";
 471                 break;
 472         default:
 473                 /* Just in case */
 474                 btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
 475                                  bdev_zoned_model(bdev),
 476                                  rcu_str_deref(device->name));
 477                 ret = -EOPNOTSUPP;
 478                 goto out_free_zone_info;
 479         }
 480
 481         btrfs_info_in_rcu(fs_info,
 482                 "%s block device %s, %u %szones of %llu bytes",
 483                 model, rcu_str_deref(device->name), zone_info->nr_zones,
 484                 emulated, zone_info->zone_size);
 485
 486         return 0;
 487
 488 out:
 489         kfree(zones);
 490 out_free_zone_info:
 491         bitmap_free(zone_info->empty_zones);
 492         bitmap_free(zone_info->seq_zones);
 493         kfree(zone_info);
 494         device->zone_info = NULL;
 495
 496         return ret;
 497 }
 498
 499 void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
 500 {
 501         struct btrfs_zoned_device_info *zone_info = device->zone_info;
 502
 503         if (!zone_info)
 504                 return;
 505
 506         bitmap_free(zone_info->seq_zones);
 507         bitmap_free(zone_info->empty_zones);
 508         kfree(zone_info);
 509         device->zone_info = NULL;
 510 }
 511
 512 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 513                        struct blk_zone *zone)
 514 {
 515         unsigned int nr_zones = 1;
 516         int ret;
 517
 518         ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
 519         if (ret != 0 || !nr_zones)
 520                 return ret ? ret : -EIO;
 521
 522         return 0;
 523 }
 524
 525 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 526 {
 527         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 528         struct btrfs_device *device;
 529         u64 zoned_devices = 0;
 530         u64 nr_devices = 0;
 531         u64 zone_size = 0;
 532         u64 max_zone_append_size = 0;
 533         const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
 534         int ret = 0;
 535
 536         /* Count zoned devices */
 537         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 538                 enum blk_zoned_model model;
 539
 540                 if (!device->bdev)
 541                         continue;
 542
 543                 model = bdev_zoned_model(device->bdev);
 544                 /*
 545                  * A Host-Managed zoned device must be used as a zoned device.
 546                  * A Host-Aware zoned device and a non-zoned devices can be
 547                  * treated as a zoned device, if ZONED flag is enabled in the
 548                  * superblock.
 549                  */
 550                 if (model == BLK_ZONED_HM ||
 551                     (model == BLK_ZONED_HA && incompat_zoned) ||
 552                     (model == BLK_ZONED_NONE && incompat_zoned)) {
 553                         struct btrfs_zoned_device_info *zone_info =
 554                                 device->zone_info;
 555
 556                         zone_info = device->zone_info;
 557                         zoned_devices++;
 558                         if (!zone_size) {
 559                                 zone_size = zone_info->zone_size;
 560                         } else if (zone_info->zone_size != zone_size) {
 561                                 btrfs_err(fs_info,
 562                 "zoned: unequal block device zone sizes: have %llu found %llu",
 563                                           device->zone_info->zone_size,
 564                                           zone_size);
 565                                 ret = -EINVAL;
 566                                 goto out;
 567                         }
 568                         if (!max_zone_append_size ||
 569                             (zone_info->max_zone_append_size &&
 570                              zone_info->max_zone_append_size < max_zone_append_size))
 571                                 max_zone_append_size =
 572                                         zone_info->max_zone_append_size;
 573                 }
 574                 nr_devices++;
 575         }
 576
 577         if (!zoned_devices && !incompat_zoned)
 578                 goto out;
 579
 580         if (!zoned_devices && incompat_zoned) {
 581                 /* No zoned block device found on ZONED filesystem */
 582                 btrfs_err(fs_info,
 583                           "zoned: no zoned devices found on a zoned filesystem");
 584                 ret = -EINVAL;
 585                 goto out;
 586         }
 587
 588         if (zoned_devices && !incompat_zoned) {
 589                 btrfs_err(fs_info,
 590                           "zoned: mode not enabled but zoned device found");
 591                 ret = -EINVAL;
 592                 goto out;
 593         }
 594
 595         if (zoned_devices != nr_devices) {
 596                 btrfs_err(fs_info,
 597                           "zoned: cannot mix zoned and regular devices");
 598                 ret = -EINVAL;
 599                 goto out;
 600         }
 601
 602         /*
 603          * stripe_size is always aligned to BTRFS_STRIPE_LEN in
 604          * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
 605          * check the alignment here.
 606          */
 607         if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
 608                 btrfs_err(fs_info,
 609                           "zoned: zone size %llu not aligned to stripe %u",
 610                           zone_size, BTRFS_STRIPE_LEN);
 611                 ret = -EINVAL;
 612                 goto out;
 613         }
 614
 615         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 616                 btrfs_err(fs_info, "zoned: mixed block groups not supported");
 617                 ret = -EINVAL;
 618                 goto out;
 619         }
 620
 621         fs_info->zone_size = zone_size;
 622         fs_info->max_zone_append_size = max_zone_append_size;
 623         fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 624
 625         /*
 626          * Check mount options here, because we might change fs_info->zoned
 627          * from fs_info->zone_size.
 628          */
 629         ret = btrfs_check_mountopts_zoned(fs_info);
 630         if (ret)
 631                 goto out;
 632
 633         btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
 634 out:
 635         return ret;
 636 }
 637
 638 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 639 {
 640         if (!btrfs_is_zoned(info))
 641                 return 0;
 642
 643         /*
 644          * Space cache writing is not COWed. Disable that to avoid write errors
 645          * in sequential zones.
 646          */
 647         if (btrfs_test_opt(info, SPACE_CACHE)) {
 648                 btrfs_err(info, "zoned: space cache v1 is not supported");
 649                 return -EINVAL;
 650         }
 651
 652         if (btrfs_test_opt(info, NODATACOW)) {
 653                 btrfs_err(info, "zoned: NODATACOW not supported");
 654                 return -EINVAL;
 655         }
 656
 657         return 0;
 658 }
 659
 660 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
 661                            int rw, u64 *bytenr_ret)
 662 {
 663         u64 wp;
 664         int ret;
 665
 666         if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
 667                 *bytenr_ret = zones[0].start << SECTOR_SHIFT;
 668                 return 0;
 669         }
 670
 671         ret = sb_write_pointer(bdev, zones, &wp);
 672         if (ret != -ENOENT && ret < 0)
 673                 return ret;
 674
 675         if (rw == WRITE) {
 676                 struct blk_zone *reset = NULL;
 677
 678                 if (wp == zones[0].start << SECTOR_SHIFT)
 679                         reset = &zones[0];
 680                 else if (wp == zones[1].start << SECTOR_SHIFT)
 681                         reset = &zones[1];
 682
 683                 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
 684                         ASSERT(reset->cond == BLK_ZONE_COND_FULL);
 685
 686                         ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 687                                                reset->start, reset->len,
 688                                                GFP_NOFS);
 689                         if (ret)
 690                                 return ret;
 691
 692                         reset->cond = BLK_ZONE_COND_EMPTY;
 693                         reset->wp = reset->start;
 694                 }
 695         } else if (ret != -ENOENT) {
 696                 /* For READ, we want the precious one */
 697                 if (wp == zones[0].start << SECTOR_SHIFT)
 698                         wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
 699                 wp -= BTRFS_SUPER_INFO_SIZE;
 700         }
 701
 702         *bytenr_ret = wp;
 703         return 0;
 704
 705 }
 706
 707 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 708                                u64 *bytenr_ret)
 709 {
 710         struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
 711         sector_t zone_sectors;
 712         u32 sb_zone;
 713         int ret;
 714         u8 zone_sectors_shift;
 715         sector_t nr_sectors;
 716         u32 nr_zones;
 717
 718         if (!bdev_is_zoned(bdev)) {
 719                 *bytenr_ret = btrfs_sb_offset(mirror);
 720                 return 0;
 721         }
 722
 723         ASSERT(rw == READ || rw == WRITE);
 724
 725         zone_sectors = bdev_zone_sectors(bdev);
 726         if (!is_power_of_2(zone_sectors))
 727                 return -EINVAL;
 728         zone_sectors_shift = ilog2(zone_sectors);
 729         nr_sectors = bdev_nr_sectors(bdev);
 730         nr_zones = nr_sectors >> zone_sectors_shift;
 731
 732         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 733         if (sb_zone + 1 >= nr_zones)
 734                 return -ENOENT;
 735
 736         ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
 737                                   BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
 738                                   zones);
 739         if (ret < 0)
 740                 return ret;
 741         if (ret != BTRFS_NR_SB_LOG_ZONES)
 742                 return -EIO;
 743
 744         return sb_log_location(bdev, zones, rw, bytenr_ret);
 745 }
 746
 747 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
 748                           u64 *bytenr_ret)
 749 {
 750         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 751         u32 zone_num;
 752
 753         /*
 754          * For a zoned filesystem on a non-zoned block device, use the same
 755          * super block locations as regular filesystem. Doing so, the super
 756          * block can always be retrieved and the zoned flag of the volume
 757          * detected from the super block information.
 758          */
 759         if (!bdev_is_zoned(device->bdev)) {
 760                 *bytenr_ret = btrfs_sb_offset(mirror);
 761                 return 0;
 762         }
 763
 764         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 765         if (zone_num + 1 >= zinfo->nr_zones)
 766                 return -ENOENT;
 767
 768         return sb_log_location(device->bdev,
 769                                &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
 770                                rw, bytenr_ret);
 771 }
 772
 773 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
 774                                   int mirror)
 775 {
 776         u32 zone_num;
 777
 778         if (!zinfo)
 779                 return false;
 780
 781         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 782         if (zone_num + 1 >= zinfo->nr_zones)
 783                 return false;
 784
 785         if (!test_bit(zone_num, zinfo->seq_zones))
 786                 return false;
 787
 788         return true;
 789 }
 790
 791 void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 792 {
 793         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 794         struct blk_zone *zone;
 795
 796         if (!is_sb_log_zone(zinfo, mirror))
 797                 return;
 798
 799         zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
 800         if (zone->cond != BLK_ZONE_COND_FULL) {
 801                 if (zone->cond == BLK_ZONE_COND_EMPTY)
 802                         zone->cond = BLK_ZONE_COND_IMP_OPEN;
 803
 804                 zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 805
 806                 if (zone->wp == zone->start + zone->len)
 807                         zone->cond = BLK_ZONE_COND_FULL;
 808
 809                 return;
 810         }
 811
 812         zone++;
 813         ASSERT(zone->cond != BLK_ZONE_COND_FULL);
 814         if (zone->cond == BLK_ZONE_COND_EMPTY)
 815                 zone->cond = BLK_ZONE_COND_IMP_OPEN;
 816
 817         zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 818
 819         if (zone->wp == zone->start + zone->len)
 820                 zone->cond = BLK_ZONE_COND_FULL;
 821 }
 822
 823 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 824 {
 825         sector_t zone_sectors;
 826         sector_t nr_sectors;
 827         u8 zone_sectors_shift;
 828         u32 sb_zone;
 829         u32 nr_zones;
 830
 831         zone_sectors = bdev_zone_sectors(bdev);
 832         zone_sectors_shift = ilog2(zone_sectors);
 833         nr_sectors = bdev_nr_sectors(bdev);
 834         nr_zones = nr_sectors >> zone_sectors_shift;
 835
 836         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 837         if (sb_zone + 1 >= nr_zones)
 838                 return -ENOENT;
 839
 840         return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 841                                 zone_start_sector(sb_zone, bdev),
 842                                 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
 843 }
 844
 845 /**
 846  * btrfs_find_allocatable_zones - find allocatable zones within a given region
 847  *
 848  * @device:     the device to allocate a region on
 849  * @hole_start: the position of the hole to allocate the region
 850  * @num_bytes:  size of wanted region
 851  * @hole_end:   the end of the hole
 852  * @return:     position of allocatable zones
 853  *
 854  * Allocatable region should not contain any superblock locations.
 855  */
 856 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
 857                                  u64 hole_end, u64 num_bytes)
 858 {
 859         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 860         const u8 shift = zinfo->zone_size_shift;
 861         u64 nzones = num_bytes >> shift;
 862         u64 pos = hole_start;
 863         u64 begin, end;
 864         bool have_sb;
 865         int i;
 866
 867         ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
 868         ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
 869
 870         while (pos < hole_end) {
 871                 begin = pos >> shift;
 872                 end = begin + nzones;
 873
 874                 if (end > zinfo->nr_zones)
 875                         return hole_end;
 876
 877                 /* Check if zones in the region are all empty */
 878                 if (btrfs_dev_is_sequential(device, pos) &&
 879                     find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
 880                         pos += zinfo->zone_size;
 881                         continue;
 882                 }
 883
 884                 have_sb = false;
 885                 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 886                         u32 sb_zone;
 887                         u64 sb_pos;
 888
 889                         sb_zone = sb_zone_number(shift, i);
 890                         if (!(end <= sb_zone ||
 891                               sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
 892                                 have_sb = true;
 893                                 pos = zone_start_physical(
 894                                         sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
 895                                 break;
 896                         }
 897
 898                         /* We also need to exclude regular superblock positions */
 899                         sb_pos = btrfs_sb_offset(i);
 900                         if (!(pos + num_bytes <= sb_pos ||
 901                               sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
 902                                 have_sb = true;
 903                                 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
 904                                             zinfo->zone_size);
 905                                 break;
 906                         }
 907                 }
 908                 if (!have_sb)
 909                         break;
 910         }
 911
 912         return pos;
 913 }
 914
 915 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
 916                             u64 length, u64 *bytes)
 917 {
 918         int ret;
 919
 920         *bytes = 0;
 921         ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
 922                                physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
 923                                GFP_NOFS);
 924         if (ret)
 925                 return ret;
 926
 927         *bytes = length;
 928         while (length) {
 929                 btrfs_dev_set_zone_empty(device, physical);
 930                 physical += device->zone_info->zone_size;
 931                 length -= device->zone_info->zone_size;
 932         }
 933
 934         return 0;
 935 }
 936
 937 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
 938 {
 939         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 940         const u8 shift = zinfo->zone_size_shift;
 941         unsigned long begin = start >> shift;
 942         unsigned long end = (start + size) >> shift;
 943         u64 pos;
 944         int ret;
 945
 946         ASSERT(IS_ALIGNED(start, zinfo->zone_size));
 947         ASSERT(IS_ALIGNED(size, zinfo->zone_size));
 948
 949         if (end > zinfo->nr_zones)
 950                 return -ERANGE;
 951
 952         /* All the zones are conventional */
 953         if (find_next_bit(zinfo->seq_zones, begin, end) == end)
 954                 return 0;
 955
 956         /* All the zones are sequential and empty */
 957         if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
 958             find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
 959                 return 0;
 960
 961         for (pos = start; pos < start + size; pos += zinfo->zone_size) {
 962                 u64 reset_bytes;
 963
 964                 if (!btrfs_dev_is_sequential(device, pos) ||
 965                     btrfs_dev_is_empty_zone(device, pos))
 966                         continue;
 967
 968                 /* Free regions should be empty */
 969                 btrfs_warn_in_rcu(
 970                         device->fs_info,
 971                 "zoned: resetting device %s (devid %llu) zone %llu for allocation",
 972                         rcu_str_deref(device->name), device->devid, pos >> shift);
 973                 WARN_ON_ONCE(1);
 974
 975                 ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
 976                                               &reset_bytes);
 977                 if (ret)
 978                         return ret;
 979         }
 980
 981         return 0;
 982 }
 983
 984 /*
 985  * Calculate an allocation pointer from the extent allocation information
 986  * for a block group consist of conventional zones. It is pointed to the
 987  * end of the highest addressed extent in the block group as an allocation
 988  * offset.
 989  */
 990 static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 991                                    u64 *offset_ret)
 992 {
 993         struct btrfs_fs_info *fs_info = cache->fs_info;
 994         struct btrfs_root *root = fs_info->extent_root;
 995         struct btrfs_path *path;
 996         struct btrfs_key key;
 997         struct btrfs_key found_key;
 998         int ret;
 999         u64 length;
1000
1001         path = btrfs_alloc_path();
1002         if (!path)
1003                 return -ENOMEM;
1004
1005         key.objectid = cache->start + cache->length;
1006         key.type = 0;
1007         key.offset = 0;
1008
1009         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1010         /* We should not find the exact match */
1011         if (!ret)
1012                 ret = -EUCLEAN;
1013         if (ret < 0)
1014                 goto out;
1015
1016         ret = btrfs_previous_extent_item(root, path, cache->start);
1017         if (ret) {
1018                 if (ret == 1) {
1019                         ret = 0;
1020                         *offset_ret = 0;
1021                 }
1022                 goto out;
1023         }
1024
1025         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1026
1027         if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1028                 length = found_key.offset;
1029         else
1030                 length = fs_info->nodesize;
1031
1032         if (!(found_key.objectid >= cache->start &&
1033                found_key.objectid + length <= cache->start + cache->length)) {
1034                 ret = -EUCLEAN;
1035                 goto out;
1036         }
1037         *offset_ret = found_key.objectid + length - cache->start;
1038         ret = 0;
1039
1040 out:
1041         btrfs_free_path(path);
1042         return ret;
1043 }
1044
1045 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1046 {
1047         struct btrfs_fs_info *fs_info = cache->fs_info;
1048         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1049         struct extent_map *em;
1050         struct map_lookup *map;
1051         struct btrfs_device *device;
1052         u64 logical = cache->start;
1053         u64 length = cache->length;
1054         u64 physical = 0;
1055         int ret;
1056         int i;
1057         unsigned int nofs_flag;
1058         u64 *alloc_offsets = NULL;
1059         u64 last_alloc = 0;
1060         u32 num_sequential = 0, num_conventional = 0;
1061
1062         if (!btrfs_is_zoned(fs_info))
1063                 return 0;
1064
1065         /* Sanity check */
1066         if (!IS_ALIGNED(length, fs_info->zone_size)) {
1067                 btrfs_err(fs_info,
1068                 "zoned: block group %llu len %llu unaligned to zone size %llu",
1069                           logical, length, fs_info->zone_size);
1070                 return -EIO;
1071         }
1072
1073         /* Get the chunk mapping */
1074         read_lock(&em_tree->lock);
1075         em = lookup_extent_mapping(em_tree, logical, length);
1076         read_unlock(&em_tree->lock);
1077
1078         if (!em)
1079                 return -EINVAL;
1080
1081         map = em->map_lookup;
1082
1083         alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1084         if (!alloc_offsets) {
1085                 free_extent_map(em);
1086                 return -ENOMEM;
1087         }
1088
1089         for (i = 0; i < map->num_stripes; i++) {
1090                 bool is_sequential;
1091                 struct blk_zone zone;
1092                 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1093                 int dev_replace_is_ongoing = 0;
1094
1095                 device = map->stripes[i].dev;
1096                 physical = map->stripes[i].physical;
1097
1098                 if (device->bdev == NULL) {
1099                         alloc_offsets[i] = WP_MISSING_DEV;
1100                         continue;
1101                 }
1102
1103                 is_sequential = btrfs_dev_is_sequential(device, physical);
1104                 if (is_sequential)
1105                         num_sequential++;
1106                 else
1107                         num_conventional++;
1108
1109                 if (!is_sequential) {
1110                         alloc_offsets[i] = WP_CONVENTIONAL;
1111                         continue;
1112                 }
1113
1114                 /*
1115                  * This zone will be used for allocation, so mark this zone
1116                  * non-empty.
1117                  */
1118                 btrfs_dev_clear_zone_empty(device, physical);
1119
1120                 down_read(&dev_replace->rwsem);
1121                 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1122                 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1123                         btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
1124                 up_read(&dev_replace->rwsem);
1125
1126                 /*
1127                  * The group is mapped to a sequential zone. Get the zone write
1128                  * pointer to determine the allocation offset within the zone.
1129                  */
1130                 WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
1131                 nofs_flag = memalloc_nofs_save();
1132                 ret = btrfs_get_dev_zone(device, physical, &zone);
1133                 memalloc_nofs_restore(nofs_flag);
1134                 if (ret == -EIO || ret == -EOPNOTSUPP) {
1135                         ret = 0;
1136                         alloc_offsets[i] = WP_MISSING_DEV;
1137                         continue;
1138                 } else if (ret) {
1139                         goto out;
1140                 }
1141
1142                 if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1143                         btrfs_err_in_rcu(fs_info,
1144         "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1145                                 zone.start << SECTOR_SHIFT,
1146                                 rcu_str_deref(device->name), device->devid);
1147                         ret = -EIO;
1148                         goto out;
1149                 }
1150
1151                 switch (zone.cond) {
1152                 case BLK_ZONE_COND_OFFLINE:
1153                 case BLK_ZONE_COND_READONLY:
1154                         btrfs_err(fs_info,
1155                 "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1156                                   physical >> device->zone_info->zone_size_shift,
1157                                   rcu_str_deref(device->name), device->devid);
1158                         alloc_offsets[i] = WP_MISSING_DEV;
1159                         break;
1160                 case BLK_ZONE_COND_EMPTY:
1161                         alloc_offsets[i] = 0;
1162                         break;
1163                 case BLK_ZONE_COND_FULL:
1164                         alloc_offsets[i] = fs_info->zone_size;
1165                         break;
1166                 default:
1167                         /* Partially used zone */
1168                         alloc_offsets[i] =
1169                                         ((zone.wp - zone.start) << SECTOR_SHIFT);
1170                         break;
1171                 }
1172         }
1173
1174         if (num_sequential > 0)
1175                 cache->seq_zone = true;
1176
1177         if (num_conventional > 0) {
1178                 /*
1179                  * Avoid calling calculate_alloc_pointer() for new BG. It
1180                  * is no use for new BG. It must be always 0.
1181                  *
1182                  * Also, we have a lock chain of extent buffer lock ->
1183                  * chunk mutex.  For new BG, this function is called from
1184                  * btrfs_make_block_group() which is already taking the
1185                  * chunk mutex. Thus, we cannot call
1186                  * calculate_alloc_pointer() which takes extent buffer
1187                  * locks to avoid deadlock.
1188                  */
1189                 if (new) {
1190                         cache->alloc_offset = 0;
1191                         goto out;
1192                 }
1193                 ret = calculate_alloc_pointer(cache, &last_alloc);
1194                 if (ret || map->num_stripes == num_conventional) {
1195                         if (!ret)
1196                                 cache->alloc_offset = last_alloc;
1197                         else
1198                                 btrfs_err(fs_info,
1199                         "zoned: failed to determine allocation offset of bg %llu",
1200                                           cache->start);
1201                         goto out;
1202                 }
1203         }
1204
1205         switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1206         case 0: /* single */
1207                 if (alloc_offsets[0] == WP_MISSING_DEV) {
1208                         btrfs_err(fs_info,
1209                         "zoned: cannot recover write pointer for zone %llu",
1210                                 physical);
1211                         ret = -EIO;
1212                         goto out;
1213                 }
1214                 cache->alloc_offset = alloc_offsets[0];
1215                 break;
1216         case BTRFS_BLOCK_GROUP_DUP:
1217         case BTRFS_BLOCK_GROUP_RAID1:
1218         case BTRFS_BLOCK_GROUP_RAID0:
1219         case BTRFS_BLOCK_GROUP_RAID10:
1220         case BTRFS_BLOCK_GROUP_RAID5:
1221         case BTRFS_BLOCK_GROUP_RAID6:
1222                 /* non-single profiles are not supported yet */
1223         default:
1224                 btrfs_err(fs_info, "zoned: profile %s not yet supported",
1225                           btrfs_bg_type_to_raid_name(map->type));
1226                 ret = -EINVAL;
1227                 goto out;
1228         }
1229
1230 out:
1231         if (cache->alloc_offset > fs_info->zone_size) {
1232                 btrfs_err(fs_info,
1233                         "zoned: invalid write pointer %llu in block group %llu",
1234                         cache->alloc_offset, cache->start);
1235                 ret = -EIO;
1236         }
1237
1238         /* An extent is allocated after the write pointer */
1239         if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1240                 btrfs_err(fs_info,
1241                           "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1242                           logical, last_alloc, cache->alloc_offset);
1243                 ret = -EIO;
1244         }
1245
1246         if (!ret)
1247                 cache->meta_write_pointer = cache->alloc_offset + cache->start;
1248
1249         kfree(alloc_offsets);
1250         free_extent_map(em);
1251
1252         return ret;
1253 }
1254
1255 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1256 {
1257         u64 unusable, free;
1258
1259         if (!btrfs_is_zoned(cache->fs_info))
1260                 return;
1261
1262         WARN_ON(cache->bytes_super != 0);
1263         unusable = cache->alloc_offset - cache->used;
1264         free = cache->length - cache->alloc_offset;
1265
1266         /* We only need ->free_space in ALLOC_SEQ block groups */
1267         cache->last_byte_to_unpin = (u64)-1;
1268         cache->cached = BTRFS_CACHE_FINISHED;
1269         cache->free_space_ctl->free_space = free;
1270         cache->zone_unusable = unusable;
1271
1272         /* Should not have any excluded extents. Just in case, though */
1273         btrfs_free_excluded_extents(cache);
1274 }
1275
1276 void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1277                             struct extent_buffer *eb)
1278 {
1279         struct btrfs_fs_info *fs_info = eb->fs_info;
1280
1281         if (!btrfs_is_zoned(fs_info) ||
1282             btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
1283             !list_empty(&eb->release_list))
1284                 return;
1285
1286         set_extent_buffer_dirty(eb);
1287         set_extent_bits_nowait(&trans->dirty_pages, eb->start,
1288                                eb->start + eb->len - 1, EXTENT_DIRTY);
1289         memzero_extent_buffer(eb, 0, eb->len);
1290         set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1291
1292         spin_lock(&trans->releasing_ebs_lock);
1293         list_add_tail(&eb->release_list, &trans->releasing_ebs);
1294         spin_unlock(&trans->releasing_ebs_lock);
1295         atomic_inc(&eb->refs);
1296 }
1297
1298 void btrfs_free_redirty_list(struct btrfs_transaction *trans)
1299 {
1300         spin_lock(&trans->releasing_ebs_lock);
1301         while (!list_empty(&trans->releasing_ebs)) {
1302                 struct extent_buffer *eb;
1303
1304                 eb = list_first_entry(&trans->releasing_ebs,
1305                                       struct extent_buffer, release_list);
1306                 list_del_init(&eb->release_list);
1307                 free_extent_buffer(eb);
1308         }
1309         spin_unlock(&trans->releasing_ebs_lock);
1310 }
1311
1312 bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
1313 {
1314         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1315         struct btrfs_block_group *cache;
1316         bool ret = false;
1317
1318         if (!btrfs_is_zoned(fs_info))
1319                 return false;
1320
1321         if (!fs_info->max_zone_append_size)
1322                 return false;
1323
1324         if (!is_data_inode(&inode->vfs_inode))
1325                 return false;
1326
1327         cache = btrfs_lookup_block_group(fs_info, start);
1328         ASSERT(cache);
1329         if (!cache)
1330                 return false;
1331
1332         ret = cache->seq_zone;
1333         btrfs_put_block_group(cache);
1334
1335         return ret;
1336 }
1337
1338 void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
1339                                  struct bio *bio)
1340 {
1341         struct btrfs_ordered_extent *ordered;
1342         const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
1343
1344         if (bio_op(bio) != REQ_OP_ZONE_APPEND)
1345                 return;
1346
1347         ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
1348         if (WARN_ON(!ordered))
1349                 return;
1350
1351         ordered->physical = physical;
1352         ordered->disk = bio->bi_bdev->bd_disk;
1353         ordered->partno = bio->bi_bdev->bd_partno;
1354
1355         btrfs_put_ordered_extent(ordered);
1356 }
1357
1358 void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
1359 {
1360         struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1361         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1362         struct extent_map_tree *em_tree;
1363         struct extent_map *em;
1364         struct btrfs_ordered_sum *sum;
1365         struct block_device *bdev;
1366         u64 orig_logical = ordered->disk_bytenr;
1367         u64 *logical = NULL;
1368         int nr, stripe_len;
1369
1370         /* Zoned devices should not have partitions. So, we can assume it is 0 */
1371         ASSERT(ordered->partno == 0);
1372         bdev = bdgrab(ordered->disk->part0);
1373         if (WARN_ON(!bdev))
1374                 return;
1375
1376         if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
1377                                      ordered->physical, &logical, &nr,
1378                                      &stripe_len)))
1379                 goto out;
1380
1381         WARN_ON(nr != 1);
1382
1383         if (orig_logical == *logical)
1384                 goto out;
1385
1386         ordered->disk_bytenr = *logical;
1387
1388         em_tree = &inode->extent_tree;
1389         write_lock(&em_tree->lock);
1390         em = search_extent_mapping(em_tree, ordered->file_offset,
1391                                    ordered->num_bytes);
1392         em->block_start = *logical;
1393         free_extent_map(em);
1394         write_unlock(&em_tree->lock);
1395
1396         list_for_each_entry(sum, &ordered->list, list) {
1397                 if (*logical < orig_logical)
1398                         sum->bytenr -= orig_logical - *logical;
1399                 else
1400                         sum->bytenr += *logical - orig_logical;
1401         }
1402
1403 out:
1404         kfree(logical);
1405         bdput(bdev);
1406 }
1407
1408 bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1409                                     struct extent_buffer *eb,
1410                                     struct btrfs_block_group **cache_ret)
1411 {
1412         struct btrfs_block_group *cache;
1413         bool ret = true;
1414
1415         if (!btrfs_is_zoned(fs_info))
1416                 return true;
1417
1418         cache = *cache_ret;
1419
1420         if (cache && (eb->start < cache->start ||
1421                       cache->start + cache->length <= eb->start)) {
1422                 btrfs_put_block_group(cache);
1423                 cache = NULL;
1424                 *cache_ret = NULL;
1425         }
1426
1427         if (!cache)
1428                 cache = btrfs_lookup_block_group(fs_info, eb->start);
1429
1430         if (cache) {
1431                 if (cache->meta_write_pointer != eb->start) {
1432                         btrfs_put_block_group(cache);
1433                         cache = NULL;
1434                         ret = false;
1435                 } else {
1436                         cache->meta_write_pointer = eb->start + eb->len;
1437                 }
1438
1439                 *cache_ret = cache;
1440         }
1441
1442         return ret;
1443 }
1444
1445 void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1446                                      struct extent_buffer *eb)
1447 {
1448         if (!btrfs_is_zoned(eb->fs_info) || !cache)
1449                 return;
1450
1451         ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1452         cache->meta_write_pointer = eb->start;
1453 }
1454
1455 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1456 {
1457         if (!btrfs_dev_is_sequential(device, physical))
1458                 return -EOPNOTSUPP;
1459
1460         return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1461                                     length >> SECTOR_SHIFT, GFP_NOFS, 0);
1462 }
1463
1464 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1465                           struct blk_zone *zone)
1466 {
1467         struct btrfs_bio *bbio = NULL;
1468         u64 mapped_length = PAGE_SIZE;
1469         unsigned int nofs_flag;
1470         int nmirrors;
1471         int i, ret;
1472
1473         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1474                                &mapped_length, &bbio);
1475         if (ret || !bbio || mapped_length < PAGE_SIZE) {
1476                 btrfs_put_bbio(bbio);
1477                 return -EIO;
1478         }
1479
1480         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1481                 return -EINVAL;
1482
1483         nofs_flag = memalloc_nofs_save();
1484         nmirrors = (int)bbio->num_stripes;
1485         for (i = 0; i < nmirrors; i++) {
1486                 u64 physical = bbio->stripes[i].physical;
1487                 struct btrfs_device *dev = bbio->stripes[i].dev;
1488
1489                 /* Missing device */
1490                 if (!dev->bdev)
1491                         continue;
1492
1493                 ret = btrfs_get_dev_zone(dev, physical, zone);
1494                 /* Failing device */
1495                 if (ret == -EIO || ret == -EOPNOTSUPP)
1496                         continue;
1497                 break;
1498         }
1499         memalloc_nofs_restore(nofs_flag);
1500
1501         return ret;
1502 }
1503
1504 /*
1505  * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
1506  * filling zeros between @physical_pos to a write pointer of dev-replace
1507  * source device.
1508  */
1509 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1510                                     u64 physical_start, u64 physical_pos)
1511 {
1512         struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1513         struct blk_zone zone;
1514         u64 length;
1515         u64 wp;
1516         int ret;
1517
1518         if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1519                 return 0;
1520
1521         ret = read_zone_info(fs_info, logical, &zone);
1522         if (ret)
1523                 return ret;
1524
1525         wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1526
1527         if (physical_pos == wp)
1528                 return 0;
1529
1530         if (physical_pos > wp)
1531                 return -EUCLEAN;
1532
1533         length = wp - physical_pos;
1534         return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1535 }
1536
1537 struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
1538                                             u64 logical, u64 length)
1539 {
1540         struct btrfs_device *device;
1541         struct extent_map *em;
1542         struct map_lookup *map;
1543
1544         em = btrfs_get_chunk_map(fs_info, logical, length);
1545         if (IS_ERR(em))
1546                 return ERR_CAST(em);
1547
1548         map = em->map_lookup;
1549         /* We only support single profile for now */
1550         ASSERT(map->num_stripes == 1);
1551         device = map->stripes[0].dev;
1552
1553         free_extent_map(em);
1554
1555         return device;
1556 }