Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block
author    Linus Torvalds <[email protected]>
          Fri, 8 Mar 2019 22:12:17 +0000 (14:12 -0800)
committer Linus Torvalds <[email protected]>
          Fri, 8 Mar 2019 22:12:17 +0000 (14:12 -0800)
Pull block layer updates from Jens Axboe:
 "Not a huge amount of changes in this round, the biggest one is that we
  finally have Mings multi-page bvec support merged. Apart from that,
  this pull request contains:

   - Small series that avoids quiescing the queue for sysfs changes that
     match what we currently have (Aleksei)

   - Series of bcache fixes (via Coly)

   - Series of lightnvm fixes (via Mathias)

   - NVMe pull request from Christoph. Nothing major, just SPDX/license
     cleanups, a round-robin multipath policy (Hannes), and little fixes
     (Bart, Chaitanya).

   - BFQ series (Paolo)

   - Save blk-mq cpu -> hw queue mapping, removing a pointer indirection
     for the fast path (Jianchao)

   - fops->iopoll() added for async IO polling; this is a feature that
     the upcoming io_uring interface will use (Christoph, me)

   - Partition scan loop fixes (Dongli)

   - mtip32xx conversion away from the managed resource API (Christoph)

   - cdrom registration race fix (Guenter)

   - MD pull from Song, two minor fixes.

   - Various documentation fixes (Marcos)

   - Multi-page bvec feature (sketched below). This brings a lot of nice
     improvements with it, like more efficient splitting and support for
     larger IOs without growing the bvec table size. (Ming)

   - Various little fixes to core and drivers"
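
The multi-page bvec item above is what drives the recurring bio_for_each_segment_all()
change visible in the per-file diffs below (dm-crypt, raid1, erofs, btrfs): a single
bio_vec can now cover several pages, so the per-page iterator takes an extra
struct bvec_iter_all to keep handing callers one page at a time. A minimal sketch of
the updated driver-side pattern follows; my_end_io() and its page handling are
hypothetical, only the iterator signature comes from this merge.

#include <linux/bio.h>
#include <linux/mm.h>

/* Hypothetical bio completion handler updated for multi-page bvecs. */
static void my_end_io(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;	/* new: per-page iteration state */
	int i;

	/* was: bio_for_each_segment_all(bvec, bio, i) */
	bio_for_each_segment_all(bvec, bio, i, iter_all) {
		struct page *page = bvec->bv_page;

		/* per-page completion work, e.g. dropping the reference */
		put_page(page);
	}
	bio_put(bio);
}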

* tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits)
  block: fix updating bio's front segment size
  block: Replace function name in string with __func__
  nbd: propagate genlmsg_reply return code
  floppy: remove set but not used variable 'q'
  null_blk: fix checking for REQ_FUA
  block: fix NULL pointer dereference in register_disk
  fs: fix guard_bio_eod to check for real EOD errors
  blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map
  block: optimize bvec iteration in bvec_iter_advance
  block: introduce mp_bvec_for_each_page() for iterating over page
  block: optimize blk_bio_segment_split for single-page bvec
  block: optimize __blk_segment_map_sg() for single-page bvec
  block: introduce bvec_nth_page()
  iomap: wire up the iopoll method
  block: add bio_set_polled() helper
  block: wire up block device iopoll method
  fs: add an iopoll method to struct file_operations
  loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
  loop: do not print warn message if partition scan is successful
  block: bounce: make sure that bvec table is updated
  ...
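
Several shortlog entries above ("fs: add an iopoll method to struct file_operations",
"block: wire up block device iopoll method", "block: add bio_set_polled() helper") are
the plumbing that io_uring's polled I/O builds on. A rough sketch of how the pieces fit
together, assuming the 5.1 signatures; treat it as an illustration rather than the
merged code.

/*
 * struct file_operations gains a polling hook:
 *
 *	int (*iopoll)(struct kiocb *kiocb, bool spin);
 *
 * The block device implementation simply polls the hardware queue named by
 * the cookie that was stashed in the kiocb at submission time:
 */
static int blkdev_iopoll(struct kiocb *kiocb, bool spin)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}

/*
 * Submitters mark bios for polled completion with the new helper; async
 * callers also get REQ_NOWAIT so polled submission cannot block:
 */
static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
{
	bio->bi_opf |= REQ_HIPRI;
	if (!is_sync_kiocb(kiocb))
		bio->bi_opf |= REQ_NOWAIT;
}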

20 files changed:
block/blk-mq.c
drivers/ata/libata-scsi.c
drivers/block/floppy.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/rbd.c
drivers/md/dm-crypt.c
drivers/md/raid1.c
drivers/mmc/core/queue.c
drivers/nvme/host/pci.c
drivers/scsi/scsi_lib.c
drivers/staging/erofs/data.c
drivers/staging/erofs/unzip_vle.c
fs/btrfs/compression.c
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/gfs2/lops.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
include/linux/fs.h

diff --combined block/blk-mq.c
index 9437a5eb07cff63062ed459afc0b6b90e685e6f7,fa024bce2b38ad211d55e0155a74ba590651e3f5..4e502db8b10c8667ca59604150c951ecac40f671
@@@ -364,7 -364,7 +364,7 @@@ static struct request *blk_mq_get_reque
        }
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-                                               data->ctx->cpu);
+                                               data->ctx);
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;
  
@@@ -737,20 -737,12 +737,20 @@@ static void blk_mq_requeue_work(struct 
        spin_unlock_irq(&q->requeue_lock);
  
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 -              if (!(rq->rq_flags & RQF_SOFTBARRIER))
 +              if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
                        continue;
  
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
 -              blk_mq_sched_insert_request(rq, true, false, false);
 +              /*
 +               * If RQF_DONTPREP, rq has contained some driver specific
 +               * data, so insert it to hctx dispatch list to avoid any
 +               * merge.
 +               */
 +              if (rq->rq_flags & RQF_DONTPREP)
 +                      blk_mq_request_bypass_insert(rq, false);
 +              else
 +                      blk_mq_sched_insert_request(rq, true, false, false);
        }
  
        while (!list_empty(&rq_list)) {
@@@ -2069,7 -2061,7 +2069,7 @@@ struct blk_mq_tags *blk_mq_alloc_rq_map
        struct blk_mq_tags *tags;
        int node;
  
-       node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+       node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
  
@@@ -2125,7 -2117,7 +2125,7 @@@ int blk_mq_alloc_rqs(struct blk_mq_tag_
        size_t rq_size, left;
        int node;
  
-       node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+       node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
  
@@@ -2424,7 -2416,7 +2424,7 @@@ static void blk_mq_map_swqueue(struct r
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {
-               hctx_idx = set->map[0].mq_map[i];
+               hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
                if (!set->tags[hctx_idx] &&
                    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
                         * case, remap the current ctx to hctx[0] which
                         * is guaranteed to always have tags allocated
                         */
-                       set->map[0].mq_map[i] = 0;
+                       set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
                }
  
                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
-                       if (!set->map[j].nr_queues)
+                       if (!set->map[j].nr_queues) {
+                               ctx->hctxs[j] = blk_mq_map_queue_type(q,
+                                               HCTX_TYPE_DEFAULT, i);
                                continue;
+                       }
  
                        hctx = blk_mq_map_queue_type(q, j, i);
+                       ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
                         * mapped this one already. This can happen if
                         */
                        BUG_ON(!hctx->nr_ctx);
                }
+               for (; j < HCTX_MAX_TYPES; j++)
+                       ctx->hctxs[j] = blk_mq_map_queue_type(q,
+                                       HCTX_TYPE_DEFAULT, i);
        }
  
        mutex_unlock(&q->sysfs_lock);
@@@ -2734,7 -2733,7 +2741,7 @@@ static void blk_mq_realloc_hw_ctxs(stru
                int node;
                struct blk_mq_hw_ctx *hctx;
  
-               node = blk_mq_hw_queue_to_node(&set->map[0], i);
+               node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
                /*
                 * If the hw queue has been mapped to another numa node,
                 * we need to realloc the hctx. If allocation fails, fallback
@@@ -2838,9 -2837,6 +2845,6 @@@ struct request_queue *blk_mq_init_alloc
            set->map[HCTX_TYPE_POLL].nr_queues)
                blk_queue_flag_set(QUEUE_FLAG_POLL, q);
  
-       if (!(set->flags & BLK_MQ_F_SG_MERGE))
-               blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
        q->sg_reserved_size = INT_MAX;
  
        INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
@@@ -2968,7 -2964,7 +2972,7 @@@ static int blk_mq_update_queue_map(stru
                return set->ops->map_queues(set);
        } else {
                BUG_ON(set->nr_maps > 1);
-               return blk_mq_map_queues(&set->map[0]);
+               return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
        }
  }
  
@@@ -3090,6 -3086,9 +3094,9 @@@ int blk_mq_update_nr_requests(struct re
        if (!set)
                return -EINVAL;
  
+       if (q->nr_requests == nr)
+               return 0;
        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);
  
@@@ -3235,7 -3234,7 +3242,7 @@@ fallback
                        pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
                                        nr_hw_queues, prev_nr_hw_queues);
                        set->nr_hw_queues = prev_nr_hw_queues;
-                       blk_mq_map_queues(&set->map[0]);
+                       blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
                        goto fallback;
                }
                blk_mq_map_swqueue(q);
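
The blk-mq.c hunks above implement the "save blk-mq cpu -> hw queue mapping" item from
the pull message: blk_mq_map_swqueue() now caches the resolved hctx pointer in
ctx->hctxs[type], and blk_mq_map_queue() takes the software context instead of a CPU
number. Roughly, the fast-path lookup reduces to the sketch below (illustrative; the
merged helper lives in block/blk-mq.h and may differ in detail).

static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
						     unsigned int cmd_flags,
						     struct blk_mq_ctx *ctx)
{
	enum hctx_type type = HCTX_TYPE_DEFAULT;

	if (cmd_flags & REQ_HIPRI)
		type = HCTX_TYPE_POLL;
	else if ((cmd_flags & REQ_OP_MASK) == REQ_OP_READ)
		type = HCTX_TYPE_READ;

	/* direct per-cpu array load, no set->map[type].mq_map[cpu] hop */
	return ctx->hctxs[type];
}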
diff --combined drivers/ata/libata-scsi.c
index c2adfd8486c41f687e0b449c5b21b91a81f5c845,dfe66d00dd5b87e699fe2f1d5a159f779e50bb25..21d1ce20e1a9055b48efd8fbf482b2105a0ac45f
@@@ -1318,8 -1318,6 +1318,6 @@@ static int ata_scsi_dev_config(struct s
                scsi_change_queue_depth(sdev, depth);
        }
  
-       blk_queue_flush_queueable(q, false);
        if (dev->flags & ATA_DFLAG_TRUSTED)
                sdev->security_supported = 1;
  
@@@ -2990,7 -2988,7 +2988,7 @@@ static unsigned int atapi_xlat(struct a
         * This inconsistency confuses several controllers which
         * perform PIO using DMA such as Intel AHCIs and sil3124/32.
         * These controllers use actual number of transferred bytes to
 -       * update DMA poitner and transfer of 4n+2 bytes make those
 +       * update DMA pointer and transfer of 4n+2 bytes make those
         * controller push DMA pointer by 4n+4 bytes because SATA data
         * FISes are aligned to 4 bytes.  This causes data corruption
         * and buffer overrun.
diff --combined drivers/block/floppy.c
index 55481b40df9a5ee57c11a05462bc03d1d46d3682,04d47683eddd63cbecdff1a19ea2902f1ab2c751..95f608d1a098a2c0d0d312e1bd6be82517bfabaf
@@@ -2230,7 -2230,6 +2230,6 @@@ static void floppy_end_request(struct r
  static void request_done(int uptodate)
  {
        struct request *req = current_req;
-       struct request_queue *q;
        int block;
        char msg[sizeof("request done ") + sizeof(int) * 3];
  
                return;
        }
  
-       q = req->q;
        if (uptodate) {
                /* maintain values for invalidation on geometry
                 * change */
@@@ -4075,7 -4072,7 +4072,7 @@@ static unsigned int floppy_check_events
  
        if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
                if (lock_fdc(drive))
 -                      return -EINTR;
 +                      return 0;
                poll_drive(false, 0);
                process_fd_request();
        }
diff --combined drivers/block/mtip32xx/mtip32xx.c
index 2f3ee4d6af827645248f2d49f6e2de033782b044,9a6f40cd8df6be434f4954f5107234e0eedf3125..83302ecdc8db5ea3627ba7e0c2fb9fde83ab5336
@@@ -40,7 -40,6 +40,7 @@@
  #include <linux/export.h>
  #include <linux/debugfs.h>
  #include <linux/prefetch.h>
 +#include <linux/numa.h>
  #include "mtip32xx.h"
  
  #define HW_CMD_SLOT_SZ                (MTIP_MAX_COMMAND_SLOTS * 32)
@@@ -1416,7 -1415,7 +1416,7 @@@ static blk_status_t mtip_send_trim(stru
        WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
  
        /* Allocate a DMA buffer for the trim structure */
-       buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
+       buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
                                                                GFP_KERNEL);
        if (!buf)
                return BLK_STS_RESOURCE;
                                        MTIP_TRIM_TIMEOUT_MS) < 0)
                ret = BLK_STS_IOERR;
  
-       dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
+       dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
        return ret;
  }
  
@@@ -1656,7 -1655,7 +1656,7 @@@ static int exec_drive_command(struct mt
                if (!user_buffer)
                        return -EFAULT;
  
-               buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+               buf = dma_alloc_coherent(&port->dd->pdev->dev,
                                ATA_SECT_SIZE * xfer_sz,
                                &dma_addr,
                                GFP_KERNEL);
        }
  exit_drive_command:
        if (buf)
-               dmam_free_coherent(&port->dd->pdev->dev,
+               dma_free_coherent(&port->dd->pdev->dev,
                                ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
        return rv;
  }
@@@ -2838,11 -2837,11 +2838,11 @@@ static void mtip_dma_free(struct driver
        struct mtip_port *port = dd->port;
  
        if (port->block1)
-               dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+               dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
                                        port->block1, port->block1_dma);
  
        if (port->command_list) {
-               dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+               dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
                                port->command_list, port->command_list_dma);
        }
  }
@@@ -2861,7 -2860,7 +2861,7 @@@ static int mtip_dma_alloc(struct driver
  
        /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
        port->block1 =
-               dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+               dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
                                        &port->block1_dma, GFP_KERNEL);
        if (!port->block1)
                return -ENOMEM;
  
        /* Allocate dma memory for command list */
        port->command_list =
-               dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+               dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
                                        &port->command_list_dma, GFP_KERNEL);
        if (!port->command_list) {
-               dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+               dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
                                        port->block1, port->block1_dma);
                port->block1 = NULL;
                port->block1_dma = 0;
@@@ -3057,13 -3056,8 +3057,8 @@@ static int mtip_hw_init(struct driver_d
        mtip_start_port(dd->port);
  
        /* Setup the ISR and enable interrupts. */
-       rv = devm_request_irq(&dd->pdev->dev,
-                               dd->pdev->irq,
-                               mtip_irq_handler,
-                               IRQF_SHARED,
-                               dev_driver_string(&dd->pdev->dev),
-                               dd);
+       rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
+                        dev_driver_string(&dd->pdev->dev), dd);
        if (rv) {
                dev_err(&dd->pdev->dev,
                        "Unable to allocate IRQ %d\n", dd->pdev->irq);
@@@ -3091,7 -3085,7 +3086,7 @@@ out3
  
        /* Release the IRQ. */
        irq_set_affinity_hint(dd->pdev->irq, NULL);
-       devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+       free_irq(dd->pdev->irq, dd);
  
  out2:
        mtip_deinit_port(dd->port);
@@@ -3146,7 -3140,7 +3141,7 @@@ static int mtip_hw_exit(struct driver_d
  
        /* Release the IRQ. */
        irq_set_affinity_hint(dd->pdev->irq, NULL);
-       devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+       free_irq(dd->pdev->irq, dd);
        msleep(1000);
  
        /* Free dma regions */
@@@ -3610,8 -3604,8 +3605,8 @@@ static void mtip_free_cmd(struct blk_mq
        if (!cmd->command)
                return;
  
-       dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
-                               cmd->command, cmd->command_dma);
+       dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
+                         cmd->command_dma);
  }
  
  static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
        struct driver_data *dd = set->driver_data;
        struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
  
-       cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+       cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
                        &cmd->command_dma, GFP_KERNEL);
        if (!cmd->command)
                return -ENOMEM;
@@@ -4019,9 -4013,9 +4014,9 @@@ static int get_least_used_cpu_on_node(i
  /* Helper for selecting a node in round robin mode */
  static inline int mtip_get_next_rr_node(void)
  {
 -      static int next_node = -1;
 +      static int next_node = NUMA_NO_NODE;
  
 -      if (next_node == -1) {
 +      if (next_node == NUMA_NO_NODE) {
                next_node = first_online_node;
                return next_node;
        }
diff --combined drivers/block/rbd.c
index 282e2e82d84974726b93e9752e2003a3ca7c5491,abe9e1c8922742b826afce126cf890c2d04e2814..74088d8dbaf357b9ecf46b02051dabf96e2d0a10
@@@ -428,13 -428,14 +428,13 @@@ static bool single_major = true
  module_param(single_major, bool, 0444);
  MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
  
 -static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 -                     size_t count);
 -static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 -                        size_t count);
 -static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
 -                                  size_t count);
 -static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
 -                                     size_t count);
 +static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
 +static ssize_t remove_store(struct bus_type *bus, const char *buf,
 +                          size_t count);
 +static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
 +                                    size_t count);
 +static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
 +                                       size_t count);
  static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
  
  static int rbd_dev_id_to_minor(int dev_id)
@@@ -463,16 -464,16 +463,16 @@@ static bool rbd_is_lock_owner(struct rb
        return is_lock_owner;
  }
  
 -static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
 +static ssize_t supported_features_show(struct bus_type *bus, char *buf)
  {
        return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
  }
  
 -static BUS_ATTR(add, 0200, NULL, rbd_add);
 -static BUS_ATTR(remove, 0200, NULL, rbd_remove);
 -static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
 -static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
 -static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
 +static BUS_ATTR_WO(add);
 +static BUS_ATTR_WO(remove);
 +static BUS_ATTR_WO(add_single_major);
 +static BUS_ATTR_WO(remove_single_major);
 +static BUS_ATTR_RO(supported_features);
  
  static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
@@@ -3987,7 -3988,7 +3987,7 @@@ static int rbd_init_disk(struct rbd_dev
        rbd_dev->tag_set.ops = &rbd_mq_ops;
        rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
        rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-       rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        rbd_dev->tag_set.nr_hw_queues = 1;
        rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
  
@@@ -5933,7 -5934,9 +5933,7 @@@ err_out_args
        goto out;
  }
  
 -static ssize_t rbd_add(struct bus_type *bus,
 -                     const char *buf,
 -                     size_t count)
 +static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
  {
        if (single_major)
                return -EINVAL;
        return do_rbd_add(bus, buf, count);
  }
  
 -static ssize_t rbd_add_single_major(struct bus_type *bus,
 -                                  const char *buf,
 -                                  size_t count)
 +static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
 +                                    size_t count)
  {
        return do_rbd_add(bus, buf, count);
  }
@@@ -6045,7 -6049,9 +6045,7 @@@ static ssize_t do_rbd_remove(struct bus
        return count;
  }
  
 -static ssize_t rbd_remove(struct bus_type *bus,
 -                        const char *buf,
 -                        size_t count)
 +static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
  {
        if (single_major)
                return -EINVAL;
        return do_rbd_remove(bus, buf, count);
  }
  
 -static ssize_t rbd_remove_single_major(struct bus_type *bus,
 -                                     const char *buf,
 -                                     size_t count)
 +static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
 +                                       size_t count)
  {
        return do_rbd_remove(bus, buf, count);
  }
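
The renames in the rbd hunks above (rbd_add() -> add_store(), rbd_supported_features_show()
-> supported_features_show(), and so on) exist to satisfy BUS_ATTR_WO()/BUS_ATTR_RO(),
which derive both the attribute and its callback names from a single token. Approximately
(per <linux/device.h>; reproduced here only to explain the naming requirement):

#define BUS_ATTR_RO(_name) \
	struct bus_attribute bus_attr_##_name = __ATTR_RO(_name)
#define BUS_ATTR_WO(_name) \
	struct bus_attribute bus_attr_##_name = __ATTR_WO(_name)

/*
 * __ATTR_RO()/__ATTR_WO() point .show at _name##_show and .store at
 * _name##_store, so BUS_ATTR_WO(add) only resolves if a function literally
 * named add_store() exists -- hence rbd_add() becoming add_store() above.
 */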
diff --combined drivers/md/dm-crypt.c
index dd538e6b27480c5583731cab60738adb4064f1c0,9a29037f56158f4664c28fa90ef3a733edaf533a..dd6565798778055f3a519c12a716f417631f8f9d
@@@ -932,7 -932,7 +932,7 @@@ static int dm_crypt_integrity_io_alloc(
        if (IS_ERR(bip))
                return PTR_ERR(bip);
  
 -      tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
 +      tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
  
        bip->bip_iter.bi_size = tag_len;
        bip->bip_iter.bi_sector = io->cc->start + io->sector;
@@@ -1447,8 -1447,9 +1447,9 @@@ static void crypt_free_buffer_pages(str
  {
        unsigned int i;
        struct bio_vec *bv;
+       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bv, clone, i) {
+       bio_for_each_segment_all(bv, clone, i, iter_all) {
                BUG_ON(!bv->bv_page);
                mempool_free(bv->bv_page, &cc->page_pool);
        }
diff --combined drivers/md/raid1.c
index fa47249fa3e42819a76f2931963cebec4accda40,88c61d3090b0521f1d3cb6f47192beffef90c8a6..fdf451aac369041c6fccccfca4da74d121862025
@@@ -1603,11 -1603,9 +1603,9 @@@ static void raid1_error(struct mddev *m
                return;
        }
        set_bit(Blocked, &rdev->flags);
-       if (test_and_clear_bit(In_sync, &rdev->flags)) {
+       if (test_and_clear_bit(In_sync, &rdev->flags))
                mddev->degraded++;
-               set_bit(Faulty, &rdev->flags);
-       } else
-               set_bit(Faulty, &rdev->flags);
+       set_bit(Faulty, &rdev->flags);
        spin_unlock_irqrestore(&conf->device_lock, flags);
        /*
         * if recovery is running, make sure it aborts.
@@@ -1863,20 -1861,6 +1861,20 @@@ static void end_sync_read(struct bio *b
                reschedule_retry(r1_bio);
  }
  
 +static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
 +{
 +      sector_t sync_blocks = 0;
 +      sector_t s = r1_bio->sector;
 +      long sectors_to_go = r1_bio->sectors;
 +
 +      /* make sure these bits don't get cleared. */
 +      do {
 +              md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
 +              s += sync_blocks;
 +              sectors_to_go -= sync_blocks;
 +      } while (sectors_to_go > 0);
 +}
 +
  static void end_sync_write(struct bio *bio)
  {
        int uptodate = !bio->bi_status;
        struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
  
        if (!uptodate) {
 -              sector_t sync_blocks = 0;
 -              sector_t s = r1_bio->sector;
 -              long sectors_to_go = r1_bio->sectors;
 -              /* make sure these bits doesn't get cleared. */
 -              do {
 -                      md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
 -                      s += sync_blocks;
 -                      sectors_to_go -= sync_blocks;
 -              } while (sectors_to_go > 0);
 +              abort_sync_write(mddev, r1_bio);
                set_bit(WriteErrorSeen, &rdev->flags);
                if (!test_and_set_bit(WantReplacement, &rdev->flags))
                        set_bit(MD_RECOVERY_NEEDED, &
@@@ -2120,13 -2112,14 +2118,14 @@@ static void process_checks(struct r1bi
                struct page **spages = get_resync_pages(sbio)->pages;
                struct bio_vec *bi;
                int page_len[RESYNC_PAGES] = { 0 };
+               struct bvec_iter_all iter_all;
  
                if (sbio->bi_end_io != end_sync_read)
                        continue;
                /* Now we can 'fixup' the error value */
                sbio->bi_status = 0;
  
-               bio_for_each_segment_all(bi, sbio, j)
+               bio_for_each_segment_all(bi, sbio, j, iter_all)
                        page_len[j] = bi->bv_len;
  
                if (!status) {
@@@ -2178,10 -2171,8 +2177,10 @@@ static void sync_request_write(struct m
                     (i == r1_bio->read_disk ||
                      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
                        continue;
 -              if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
 +              if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
 +                      abort_sync_write(mddev, r1_bio);
                        continue;
 +              }
  
                bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
                if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
diff --combined drivers/mmc/core/queue.c
index 15a45ec6518d75c3fd8e602c313ebe77d3aa7376,cc19e71c71d469815c49d7b550586b206d9e9899..7c364a9c4eeb4bb6f3fa7f45913279078262a30e
@@@ -355,7 -355,6 +355,7 @@@ static void mmc_setup_queue(struct mmc_
  {
        struct mmc_host *host = card->host;
        u64 limit = BLK_BOUNCE_HIGH;
 +      unsigned block_size = 512;
  
        if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
                limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
        blk_queue_max_hw_sectors(mq->queue,
                min(host->max_blk_count, host->max_req_size / 512));
        blk_queue_max_segments(mq->queue, host->max_segs);
 -      blk_queue_max_segment_size(mq->queue, host->max_seg_size);
 +
 +      if (mmc_card_mmc(card))
 +              block_size = card->ext_csd.data_sector_size;
 +
 +      blk_queue_logical_block_size(mq->queue, block_size);
 +      blk_queue_max_segment_size(mq->queue,
 +                      round_down(host->max_seg_size, block_size));
  
        INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler);
        INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work);
@@@ -417,8 -410,7 +417,7 @@@ int mmc_init_queue(struct mmc_queue *mq
        else
                mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
        mq->tag_set.numa_node = NUMA_NO_NODE;
-       mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
-                           BLK_MQ_F_BLOCKING;
+       mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
        mq->tag_set.nr_hw_queues = 1;
        mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
        mq->tag_set.driver_data = mq;
diff --combined drivers/nvme/host/pci.c
index e905861186e35230256a9b66e906be54e72c8f84,f54718b63637dbfcdb3984cc7fcf7a69048be484..92bad1c810acda473bdf5d92e96960a3107e19e6
@@@ -1,15 -1,7 +1,7 @@@
+ // SPDX-License-Identifier: GPL-2.0
  /*
   * NVM Express device driver
   * Copyright (c) 2011-2014, Intel Corporation.
-  *
-  * This program is free software; you can redistribute it and/or modify it
-  * under the terms and conditions of the GNU General Public License,
-  * version 2, as published by the Free Software Foundation.
-  *
-  * This program is distributed in the hope it will be useful, but WITHOUT
-  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  * more details.
   */
  
  #include <linux/aer.h>
@@@ -157,6 -149,8 +149,8 @@@ static int queue_count_set(const char *
        int n = 0, ret;
  
        ret = kstrtoint(val, 10, &n);
+       if (ret)
+               return ret;
        if (n > num_possible_cpus())
                n = num_possible_cpus();
  
@@@ -2041,52 -2035,53 +2035,52 @@@ static int nvme_setup_host_mem(struct n
        return ret;
  }
  
 -/* irq_queues covers admin queue */
 -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
 +/*
 + * nirqs is the number of interrupts available for write and read
 + * queues. The core already reserved an interrupt for the admin queue.
 + */
 +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
  {
 -      unsigned int this_w_queues = write_queues;
 -
 -      WARN_ON(!irq_queues);
 -
 -      /*
 -       * Setup read/write queue split, assign admin queue one independent
 -       * irq vector if irq_queues is > 1.
 -       */
 -      if (irq_queues <= 2) {
 -              dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 -              dev->io_queues[HCTX_TYPE_READ] = 0;
 -              return;
 -      }
 +      struct nvme_dev *dev = affd->priv;
 +      unsigned int nr_read_queues;
  
        /*
 -       * If 'write_queues' is set, ensure it leaves room for at least
 -       * one read queue and one admin queue
 +       * If there is no interupt available for queues, ensure that
 +       * the default queue is set to 1. The affinity set size is
 +       * also set to one, but the irq core ignores it for this case.
 +       *
 +       * If only one interrupt is available or 'write_queue' == 0, combine
 +       * write and read queues.
 +       *
 +       * If 'write_queues' > 0, ensure it leaves room for at least one read
 +       * queue.
         */
 -      if (this_w_queues >= irq_queues)
 -              this_w_queues = irq_queues - 2;
 -
 -      /*
 -       * If 'write_queues' is set to zero, reads and writes will share
 -       * a queue set.
 -       */
 -      if (!this_w_queues) {
 -              dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1;
 -              dev->io_queues[HCTX_TYPE_READ] = 0;
 +      if (!nrirqs) {
 +              nrirqs = 1;
 +              nr_read_queues = 0;
 +      } else if (nrirqs == 1 || !write_queues) {
 +              nr_read_queues = 0;
 +      } else if (write_queues >= nrirqs) {
 +              nr_read_queues = 1;
        } else {
 -              dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
 -              dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1;
 +              nr_read_queues = nrirqs - write_queues;
        }
 +
 +      dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
 +      affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
 +      dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
 +      affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
 +      affd->nr_sets = nr_read_queues ? 2 : 1;
  }
  
  static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
  {
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 -      int irq_sets[2];
        struct irq_affinity affd = {
 -              .pre_vectors = 1,
 -              .nr_sets = ARRAY_SIZE(irq_sets),
 -              .sets = irq_sets,
 +              .pre_vectors    = 1,
 +              .calc_sets      = nvme_calc_irq_sets,
 +              .priv           = dev,
        };
 -      int result = 0;
        unsigned int irq_queues, this_p_queues;
  
        /*
        }
        dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
  
 -      /*
 -       * For irq sets, we have to ask for minvec == maxvec. This passes
 -       * any reduction back to us, so we can adjust our queue counts and
 -       * IRQ vector needs.
 -       */
 -      do {
 -              nvme_calc_io_queues(dev, irq_queues);
 -              irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
 -              irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
 -              if (!irq_sets[1])
 -                      affd.nr_sets = 1;
 +      /* Initialize for the single interrupt case */
 +      dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 +      dev->io_queues[HCTX_TYPE_READ] = 0;
  
 -              /*
 -               * If we got a failure and we're down to asking for just
 -               * 1 + 1 queues, just ask for a single vector. We'll share
 -               * that between the single IO queue and the admin queue.
 -               * Otherwise, we assign one independent vector to admin queue.
 -               */
 -              if (irq_queues > 1)
 -                      irq_queues = irq_sets[0] + irq_sets[1] + 1;
 -
 -              result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
 -                              irq_queues,
 -                              PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 -
 -              /*
 -               * Need to reduce our vec counts. If we get ENOSPC, the
 -               * platform should support mulitple vecs, we just need
 -               * to decrease our ask. If we get EINVAL, the platform
 -               * likely does not. Back down to ask for just one vector.
 -               */
 -              if (result == -ENOSPC) {
 -                      irq_queues--;
 -                      if (!irq_queues)
 -                              return result;
 -                      continue;
 -              } else if (result == -EINVAL) {
 -                      irq_queues = 1;
 -                      continue;
 -              } else if (result <= 0)
 -                      return -EIO;
 -              break;
 -      } while (1);
 -
 -      return result;
 +      return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 +                            PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
  }
  
  static void nvme_disable_io_queues(struct nvme_dev *dev)
@@@ -2520,15 -2554,15 +2514,15 @@@ static void nvme_reset_work(struct work
        mutex_lock(&dev->shutdown_lock);
        result = nvme_pci_enable(dev);
        if (result)
 -              goto out;
 +              goto out_unlock;
  
        result = nvme_pci_configure_admin_queue(dev);
        if (result)
 -              goto out;
 +              goto out_unlock;
  
        result = nvme_alloc_admin_tags(dev);
        if (result)
 -              goto out;
 +              goto out_unlock;
  
        /*
         * Limit the max command size to prevent iod->sg allocations going
        nvme_start_ctrl(&dev->ctrl);
        return;
  
 + out_unlock:
 +      mutex_unlock(&dev->shutdown_lock);
   out:
        nvme_remove_dead_ctrl(dev, result);
  }
@@@ -2984,7 -3016,6 +2978,7 @@@ static struct pci_driver nvme_driver = 
  
  static int __init nvme_init(void)
  {
 +      BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
        return pci_register_driver(&nvme_driver);
  }
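
The nvme/pci.c hunks above drop the driver's retry loop around
pci_alloc_irq_vectors_affinity() in favour of the new ->calc_sets callback: the driver
asks for the full vector range once, and the IRQ core calls back into
nvme_calc_irq_sets() with however many vectors it can actually grant. A rough sketch of
the interface involved (field names per the 5.1 irq_affinity rework; details may differ):

struct irq_affinity {
	unsigned int	pre_vectors;	/* vectors kept out of spreading (admin queue) */
	unsigned int	post_vectors;
	unsigned int	nr_sets;	/* number of affinity sets (write/read) */
	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
	void		(*calc_sets)(struct irq_affinity *affd, unsigned int nvecs);
	void		*priv;		/* driver cookie, here the nvme_dev */
};

/*
 * With ->calc_sets wired up, a single call covers the whole range:
 *
 *	pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 *			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 */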
  
diff --combined drivers/scsi/scsi_lib.c
index a6828391d6b3777873782221c98885c4b8f41885,6cadbe945bdb559e81b549e80f11d1e095c2e617..ca5fd3ae81f848222cf43c9aa629efd56f404ba5
@@@ -655,7 -655,6 +655,7 @@@ static blk_status_t scsi_result_to_blk_
                set_host_byte(cmd, DID_OK);
                return BLK_STS_TARGET;
        case DID_NEXUS_FAILURE:
 +              set_host_byte(cmd, DID_OK);
                return BLK_STS_NEXUS;
        case DID_ALLOC_FAILURE:
                set_host_byte(cmd, DID_OK);
@@@ -1900,7 -1899,7 +1900,7 @@@ int scsi_mq_setup_tags(struct Scsi_Hos
        shost->tag_set.queue_depth = shost->can_queue;
        shost->tag_set.cmd_size = cmd_size;
        shost->tag_set.numa_node = NUMA_NO_NODE;
-       shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        shost->tag_set.flags |=
                BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
        shost->tag_set.driver_data = shost;
@@@ -2598,6 -2597,7 +2598,6 @@@ void scsi_device_resume(struct scsi_dev
         * device deleted during suspend)
         */
        mutex_lock(&sdev->state_mutex);
 -      WARN_ON_ONCE(!sdev->quiesced_by);
        sdev->quiesced_by = NULL;
        blk_clear_pm_only(sdev->request_queue);
        if (sdev->sdev_state == SDEV_QUIESCE)
diff --combined drivers/staging/erofs/data.c
index 9c471f08ffd4112a97b7ef06897e965ed043a3d8,4871ba7b7d9ac7a826aec3258f52d25f76b402e6..526e0dbea5b5714618b463cb3eab98b0895e99f6
@@@ -20,8 -20,9 +20,9 @@@ static inline void read_endio(struct bi
        int i;
        struct bio_vec *bvec;
        const blk_status_t err = bio->bi_status;
+       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
  
                /* page is already locked */
@@@ -165,16 -166,43 +166,16 @@@ err_out
        return err;
  }
  
 -#ifdef CONFIG_EROFS_FS_ZIP
 -extern int z_erofs_map_blocks_iter(struct inode *,
 -                                 struct erofs_map_blocks *,
 -                                 struct page **, int);
 -#endif
 -
 -int erofs_map_blocks_iter(struct inode *inode,
 -                        struct erofs_map_blocks *map,
 -                        struct page **mpage_ret, int flags)
 -{
 -      /* by default, reading raw data never use erofs_map_blocks_iter */
 -      if (unlikely(!is_inode_layout_compression(inode))) {
 -              if (*mpage_ret)
 -                      put_page(*mpage_ret);
 -              *mpage_ret = NULL;
 -
 -              return erofs_map_blocks(inode, map, flags);
 -      }
 -
 -#ifdef CONFIG_EROFS_FS_ZIP
 -      return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags);
 -#else
 -      /* data compression is not available */
 -      return -ENOTSUPP;
 -#endif
 -}
 -
  int erofs_map_blocks(struct inode *inode,
                     struct erofs_map_blocks *map, int flags)
  {
        if (unlikely(is_inode_layout_compression(inode))) {
 -              struct page *mpage = NULL;
 -              int err;
 +              int err = z_erofs_map_blocks_iter(inode, map, flags);
  
 -              err = erofs_map_blocks_iter(inode, map, &mpage, flags);
 -              if (mpage)
 -                      put_page(mpage);
 +              if (map->mpage) {
 +                      put_page(map->mpage);
 +                      map->mpage = NULL;
 +              }
                return err;
        }
        return erofs_map_blocks_flatmode(inode, map, flags);
diff --combined drivers/staging/erofs/unzip_vle.c
index 02f34a83147d21874121a8eee5a39068cfb812d2,c057c5616b1ddd0feccc069449c29ef34878e3d9..8715bc50e09c16d44ece32baa474eb7d9bc5ab8f
@@@ -107,30 -107,15 +107,30 @@@ enum z_erofs_vle_work_role 
        Z_EROFS_VLE_WORK_SECONDARY,
        Z_EROFS_VLE_WORK_PRIMARY,
        /*
 -       * The current work has at least been linked with the following
 -       * processed chained works, which means if the processing page
 -       * is the tail partial page of the work, the current work can
 -       * safely use the whole page, as illustrated below:
 -       * +--------------+-------------------------------------------+
 -       * |  tail page   |      head page (of the previous work)     |
 -       * +--------------+-------------------------------------------+
 -       *   /\  which belongs to the current work
 -       * [  (*) this page can be used for the current work itself.  ]
 +       * The current work was the tail of an exist chain, and the previous
 +       * processed chained works are all decided to be hooked up to it.
 +       * A new chain should be created for the remaining unprocessed works,
 +       * therefore different from Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
 +       * the next work cannot reuse the whole page in the following scenario:
 +       *  ________________________________________________________________
 +       * |      tail (partial) page     |       head (partial) page       |
 +       * |  (belongs to the next work)  |  (belongs to the current work)  |
 +       * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
 +       */
 +      Z_EROFS_VLE_WORK_PRIMARY_HOOKED,
 +      /*
 +       * The current work has been linked with the processed chained works,
 +       * and could be also linked with the potential remaining works, which
 +       * means if the processing page is the tail partial page of the work,
 +       * the current work can safely use the whole page (since the next work
 +       * is under control) for in-place decompression, as illustrated below:
 +       *  ________________________________________________________________
 +       * |  tail (partial) page  |          head (partial) page           |
 +       * | (of the current work) |         (of the previous work)         |
 +       * |  PRIMARY_FOLLOWED or  |                                        |
 +       * |_____PRIMARY_HOOKED____|____________PRIMARY_FOLLOWED____________|
 +       *
 +       * [  (*) the above page can be used for the current work itself.  ]
         */
        Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
        Z_EROFS_VLE_WORK_MAX
@@@ -253,9 -238,14 +253,9 @@@ int erofs_try_to_free_cached_page(struc
  {
        struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
        const unsigned int clusterpages = erofs_clusterpages(sbi);
 -
 -      struct z_erofs_vle_workgroup *grp;
 +      struct z_erofs_vle_workgroup *const grp = (void *)page_private(page);
        int ret = 0;    /* 0 - busy */
  
 -      /* prevent the workgroup from being freed */
 -      rcu_read_lock();
 -      grp = (void *)page_private(page);
 -
        if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
                unsigned int i;
  
                        }
                }
                erofs_workgroup_unfreeze(&grp->obj, 1);
 -      }
 -      rcu_read_unlock();
  
 -      if (ret) {
 -              ClearPagePrivate(page);
 -              put_page(page);
 +              if (ret) {
 +                      ClearPagePrivate(page);
 +                      put_page(page);
 +              }
        }
        return ret;
  }
@@@ -324,10 -315,10 +324,10 @@@ static int z_erofs_vle_work_add_page
        return ret ? 0 : -EAGAIN;
  }
  
 -static inline bool try_to_claim_workgroup(
 -      struct z_erofs_vle_workgroup *grp,
 -      z_erofs_vle_owned_workgrp_t *owned_head,
 -      bool *hosted)
 +static enum z_erofs_vle_work_role
 +try_to_claim_workgroup(struct z_erofs_vle_workgroup *grp,
 +                     z_erofs_vle_owned_workgrp_t *owned_head,
 +                     bool *hosted)
  {
        DBG_BUGON(*hosted == true);
  
@@@ -341,9 -332,6 +341,9 @@@ retry
  
                *owned_head = &grp->next;
                *hosted = true;
 +              /* lucky, I am the followee :) */
 +              return Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
 +
        } else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
                /*
                 * type 2, link to the end of a existing open chain,
                if (cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
                            *owned_head) != Z_EROFS_VLE_WORKGRP_TAIL)
                        goto retry;
 -
                *owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
 -      } else
 -              return false;   /* :( better luck next time */
 +              return Z_EROFS_VLE_WORK_PRIMARY_HOOKED;
 +      }
  
 -      return true;    /* lucky, I am the followee :) */
 +      return Z_EROFS_VLE_WORK_PRIMARY; /* :( better luck next time */
  }
  
  struct z_erofs_vle_work_finder {
@@@ -435,9 -424,12 +435,9 @@@ z_erofs_vle_work_lookup(const struct z_
        *f->hosted = false;
        if (!primary)
                *f->role = Z_EROFS_VLE_WORK_SECONDARY;
 -      /* claim the workgroup if possible */
 -      else if (try_to_claim_workgroup(grp, f->owned_head, f->hosted))
 -              *f->role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
 -      else
 -              *f->role = Z_EROFS_VLE_WORK_PRIMARY;
 -
 +      else    /* claim the workgroup if possible */
 +              *f->role = try_to_claim_workgroup(grp, f->owned_head,
 +                                                f->hosted);
        return work;
  }
  
@@@ -501,9 -493,6 +501,9 @@@ z_erofs_vle_work_register(const struct 
        return work;
  }
  
 +#define builder_is_hooked(builder) \
 +      ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_HOOKED)
 +
  #define builder_is_followed(builder) \
        ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
  
@@@ -550,7 -539,7 +550,7 @@@ repeat
        if (unlikely(work == ERR_PTR(-EAGAIN)))
                goto repeat;
  
 -      if (unlikely(IS_ERR(work)))
 +      if (IS_ERR(work))
                return PTR_ERR(work);
  got_it:
        z_erofs_pagevec_ctor_init(&builder->vector,
@@@ -600,7 -589,7 +600,7 @@@ static void __z_erofs_vle_work_release(
        erofs_workgroup_put(&grp->obj);
  }
  
 -void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 +static void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
  {
        struct z_erofs_vle_workgroup *grp =
                z_erofs_vle_work_workgroup(work, true);
@@@ -647,7 -636,7 +647,7 @@@ struct z_erofs_vle_frontend 
        struct inode *const inode;
  
        struct z_erofs_vle_work_builder builder;
 -      struct erofs_map_blocks_iter m_iter;
 +      struct erofs_map_blocks map;
  
        z_erofs_vle_owned_workgrp_t owned_head;
  
  
  #define VLE_FRONTEND_INIT(__i) { \
        .inode = __i, \
 -      .m_iter = { \
 -              { .m_llen = 0, .m_plen = 0 }, \
 +      .map = { \
 +              .m_llen = 0, \
 +              .m_plen = 0, \
                .mpage = NULL \
        }, \
        .builder = VLE_WORK_BUILDER_INIT(), \
@@@ -693,11 -681,12 +693,11 @@@ static int z_erofs_do_read_page(struct 
  {
        struct super_block *const sb = fe->inode->i_sb;
        struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
 -      struct erofs_map_blocks_iter *const m = &fe->m_iter;
 -      struct erofs_map_blocks *const map = &m->map;
 +      struct erofs_map_blocks *const map = &fe->map;
        struct z_erofs_vle_work_builder *const builder = &fe->builder;
        const loff_t offset = page_offset(page);
  
 -      bool tight = builder_is_followed(builder);
 +      bool tight = builder_is_hooked(builder);
        struct z_erofs_vle_work *work = builder->work;
  
        enum z_erofs_cache_alloctype cache_strategy;
@@@ -715,12 -704,8 +715,12 @@@ repeat
  
        /* lucky, within the range of the current map_blocks */
        if (offset + cur >= map->m_la &&
 -              offset + cur < map->m_la + map->m_llen)
 +              offset + cur < map->m_la + map->m_llen) {
 +              /* didn't get a valid unzip work previously (very rare) */
 +              if (!builder->work)
 +                      goto restart_now;
                goto hitted;
 +      }
  
        /* go ahead the next map_blocks */
        debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
  
        map->m_la = offset + cur;
        map->m_llen = 0;
 -      err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
 +      err = z_erofs_map_blocks_iter(fe->inode, map, 0);
        if (unlikely(err))
                goto err_out;
  
 +restart_now:
        if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
                goto hitted;
  
                                 map->m_plen / PAGE_SIZE,
                                 cache_strategy, page_pool, GFP_KERNEL);
  
 -      tight &= builder_is_followed(builder);
 +      tight &= builder_is_hooked(builder);
        work = builder->work;
  hitted:
        cur = end - min_t(unsigned int, offset + end - map->m_la, end);
                        (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
                                Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
  
 +      if (cur)
 +              tight &= builder_is_followed(builder);
 +
  retry:
        err = z_erofs_vle_work_add_page(builder, page, page_type);
        /* should allocate an additional staging page for pagevec */
@@@ -849,8 -830,9 +849,9 @@@ static inline void z_erofs_vle_read_end
  #ifdef EROFS_FS_HAS_MANAGED_CACHE
        struct address_space *mc = NULL;
  #endif
+       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
                bool cachemngd = false;
  
@@@ -1011,10 -993,11 +1012,10 @@@ repeat
        if (llen > grp->llen)
                llen = grp->llen;
  
 -      err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
 -              clusterpages, pages, llen, work->pageofs,
 -              z_erofs_onlinepage_endio);
 +      err = z_erofs_vle_unzip_fast_percpu(compressed_pages, clusterpages,
 +                                          pages, llen, work->pageofs);
        if (err != -ENOTSUPP)
 -              goto out_percpu;
 +              goto out;
  
        if (sparsemem_pages >= nr_pages)
                goto skip_allocpage;
@@@ -1035,25 -1018,8 +1036,25 @@@ skip_allocpage
        erofs_vunmap(vout, nr_pages);
  
  out:
 +      /* must handle all compressed pages before endding pages */
 +      for (i = 0; i < clusterpages; ++i) {
 +              page = compressed_pages[i];
 +
 +#ifdef EROFS_FS_HAS_MANAGED_CACHE
 +              if (page->mapping == MNGD_MAPPING(sbi))
 +                      continue;
 +#endif
 +              /* recycle all individual staging pages */
 +              (void)z_erofs_gather_if_stagingpage(page_pool, page);
 +
 +              WRITE_ONCE(compressed_pages[i], NULL);
 +      }
 +
        for (i = 0; i < nr_pages; ++i) {
                page = pages[i];
 +              if (!page)
 +                      continue;
 +
                DBG_BUGON(!page->mapping);
  
                /* recycle all individual staging pages */
                z_erofs_onlinepage_endio(page);
        }
  
 -out_percpu:
 -      for (i = 0; i < clusterpages; ++i) {
 -              page = compressed_pages[i];
 -
 -#ifdef EROFS_FS_HAS_MANAGED_CACHE
 -              if (page->mapping == MNGD_MAPPING(sbi))
 -                      continue;
 -#endif
 -              /* recycle all individual staging pages */
 -              (void)z_erofs_gather_if_stagingpage(page_pool, page);
 -
 -              WRITE_ONCE(compressed_pages[i], NULL);
 -      }
 -
        if (pages == z_pagemap_global)
                mutex_unlock(&z_pagemap_global_lock);
        else if (unlikely(pages != pages_onstack))
@@@ -1505,8 -1485,8 +1506,8 @@@ static int z_erofs_vle_normalaccess_rea
  
        z_erofs_submit_and_unzip(&f, &pagepool, true);
  out:
 -      if (f.m_iter.mpage)
 -              put_page(f.m_iter.mpage);
 +      if (f.map.mpage)
 +              put_page(f.map.mpage);
  
        /* clean up the remaining free pages */
        put_pages_list(&pagepool);
@@@ -1576,8 -1556,8 +1577,8 @@@ static int z_erofs_vle_normalaccess_rea
  
        z_erofs_submit_and_unzip(&f, &pagepool, sync);
  
 -      if (f.m_iter.mpage)
 -              put_page(f.m_iter.mpage);
 +      if (f.map.mpage)
 +              put_page(f.map.mpage);
  
        /* clean up the remaining free pages */
        put_pages_list(&pagepool);
@@@ -1722,14 -1702,14 +1723,14 @@@ vle_get_logical_extent_head(const struc
  
  int z_erofs_map_blocks_iter(struct inode *inode,
        struct erofs_map_blocks *map,
 -      struct page **mpage_ret, int flags)
 +      int flags)
  {
        void *kaddr;
        const struct vle_map_blocks_iter_ctx ctx = {
                .inode = inode,
                .sb = inode->i_sb,
                .clusterbits = EROFS_I_SB(inode)->clusterbits,
 -              .mpage_ret = mpage_ret,
 +              .mpage_ret = &map->mpage,
                .kaddr_ret = &kaddr
        };
        const unsigned int clustersize = 1 << ctx.clusterbits;
  
        /* initialize `pblk' to keep gcc from printing foolish warnings */
        erofs_blk_t mblk, pblk = 0;
 -      struct page *mpage = *mpage_ret;
 +      struct page *mpage = map->mpage;
        struct z_erofs_vle_decompressed_index *di;
        unsigned int cluster_type, logical_cluster_ofs;
        int err = 0;
                        err = PTR_ERR(mpage);
                        goto out;
                }
 -              *mpage_ret = mpage;
 +              map->mpage = mpage;
        } else {
                lock_page(mpage);
                DBG_BUGON(!PageUptodate(mpage));
                /* get the correspoinding first chunk */
                err = vle_get_logical_extent_head(&ctx, lcn, &ofs,
                                                  &pblk, &map->m_flags);
 -              mpage = *mpage_ret;
 +              mpage = map->mpage;
  
                if (unlikely(err)) {
                        if (mpage)
diff --combined fs/btrfs/compression.c
index eb8e20b740d6e245ec303da68ab335ce21fcd14a,6896ea60c843229b412bf705028172d652cad1ca..4f2a8ae0aa421f35fc5fc5ea97e598273ffc10ba
@@@ -162,13 -162,14 +162,14 @@@ csum_failed
        } else {
                int i;
                struct bio_vec *bvec;
+               struct bvec_iter_all iter_all;
  
                /*
                 * we have verified the checksum already, set page
                 * checked so the end_io handlers know about it
                 */
                ASSERT(!bio_flagged(bio, BIO_CLONED));
-               bio_for_each_segment_all(bvec, cb->orig_bio, i)
+               bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
                        SetPageChecked(bvec->bv_page);
  
                bio_endio(cb->orig_bio);
@@@ -730,28 -731,6 +731,28 @@@ struct heuristic_ws 
        struct list_head list;
  };
  
 +static struct workspace_manager heuristic_wsm;
 +
 +static void heuristic_init_workspace_manager(void)
 +{
 +      btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
 +}
 +
 +static void heuristic_cleanup_workspace_manager(void)
 +{
 +      btrfs_cleanup_workspace_manager(&heuristic_wsm);
 +}
 +
 +static struct list_head *heuristic_get_workspace(unsigned int level)
 +{
 +      return btrfs_get_workspace(&heuristic_wsm, level);
 +}
 +
 +static void heuristic_put_workspace(struct list_head *ws)
 +{
 +      btrfs_put_workspace(&heuristic_wsm, ws);
 +}
 +
  static void free_heuristic_ws(struct list_head *ws)
  {
        struct heuristic_ws *workspace;
        kfree(workspace);
  }
  
 -static struct list_head *alloc_heuristic_ws(void)
 +static struct list_head *alloc_heuristic_ws(unsigned int level)
  {
        struct heuristic_ws *ws;
  
@@@ -791,59 -770,65 +792,59 @@@ fail
        return ERR_PTR(-ENOMEM);
  }
  
 -struct workspaces_list {
 -      struct list_head idle_ws;
 -      spinlock_t ws_lock;
 -      /* Number of free workspaces */
 -      int free_ws;
 -      /* Total number of allocated workspaces */
 -      atomic_t total_ws;
 -      /* Waiters for a free workspace */
 -      wait_queue_head_t ws_wait;
 +const struct btrfs_compress_op btrfs_heuristic_compress = {
 +      .init_workspace_manager = heuristic_init_workspace_manager,
 +      .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
 +      .get_workspace = heuristic_get_workspace,
 +      .put_workspace = heuristic_put_workspace,
 +      .alloc_workspace = alloc_heuristic_ws,
 +      .free_workspace = free_heuristic_ws,
  };
  
 -static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 -
 -static struct workspaces_list btrfs_heuristic_ws;
 -
  static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 +      /* The heuristic is represented as compression type 0 */
 +      &btrfs_heuristic_compress,
        &btrfs_zlib_compress,
        &btrfs_lzo_compress,
        &btrfs_zstd_compress,
  };
  
 -void __init btrfs_init_compress(void)
 +void btrfs_init_workspace_manager(struct workspace_manager *wsm,
 +                                const struct btrfs_compress_op *ops)
  {
        struct list_head *workspace;
 -      int i;
  
 -      INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
 -      spin_lock_init(&btrfs_heuristic_ws.ws_lock);
 -      atomic_set(&btrfs_heuristic_ws.total_ws, 0);
 -      init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
 +      wsm->ops = ops;
  
 -      workspace = alloc_heuristic_ws();
 +      INIT_LIST_HEAD(&wsm->idle_ws);
 +      spin_lock_init(&wsm->ws_lock);
 +      atomic_set(&wsm->total_ws, 0);
 +      init_waitqueue_head(&wsm->ws_wait);
 +
 +      /*
 +       * Preallocate one workspace for each compression type so we can
 +       * guarantee forward progress in the worst case
 +       */
 +      workspace = wsm->ops->alloc_workspace(0);
        if (IS_ERR(workspace)) {
                pr_warn(
 -      "BTRFS: cannot preallocate heuristic workspace, will try later\n");
 +      "BTRFS: cannot preallocate compression workspace, will try later\n");
        } else {
 -              atomic_set(&btrfs_heuristic_ws.total_ws, 1);
 -              btrfs_heuristic_ws.free_ws = 1;
 -              list_add(workspace, &btrfs_heuristic_ws.idle_ws);
 +              atomic_set(&wsm->total_ws, 1);
 +              wsm->free_ws = 1;
 +              list_add(workspace, &wsm->idle_ws);
        }
 +}
  
 -      for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 -              INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
 -              spin_lock_init(&btrfs_comp_ws[i].ws_lock);
 -              atomic_set(&btrfs_comp_ws[i].total_ws, 0);
 -              init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
 +void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
 +{
 +      struct list_head *ws;
  
 -              /*
 -               * Preallocate one workspace for each compression type so
 -               * we can guarantee forward progress in the worst case
 -               */
 -              workspace = btrfs_compress_op[i]->alloc_workspace();
 -              if (IS_ERR(workspace)) {
 -                      pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
 -              } else {
 -                      atomic_set(&btrfs_comp_ws[i].total_ws, 1);
 -                      btrfs_comp_ws[i].free_ws = 1;
 -                      list_add(workspace, &btrfs_comp_ws[i].idle_ws);
 -              }
 +      while (!list_empty(&wsman->idle_ws)) {
 +              ws = wsman->idle_ws.next;
 +              list_del(ws);
 +              wsman->ops->free_workspace(ws);
 +              atomic_dec(&wsman->total_ws);
        }
  }
  
   * Preallocation makes a forward progress guarantee and we do not return
   * errors.
   */
 -static struct list_head *__find_workspace(int type, bool heuristic)
 +struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
 +                                    unsigned int level)
  {
        struct list_head *workspace;
        int cpus = num_online_cpus();
 -      int idx = type - 1;
        unsigned nofs_flag;
        struct list_head *idle_ws;
        spinlock_t *ws_lock;
        wait_queue_head_t *ws_wait;
        int *free_ws;
  
 -      if (heuristic) {
 -              idle_ws  = &btrfs_heuristic_ws.idle_ws;
 -              ws_lock  = &btrfs_heuristic_ws.ws_lock;
 -              total_ws = &btrfs_heuristic_ws.total_ws;
 -              ws_wait  = &btrfs_heuristic_ws.ws_wait;
 -              free_ws  = &btrfs_heuristic_ws.free_ws;
 -      } else {
 -              idle_ws  = &btrfs_comp_ws[idx].idle_ws;
 -              ws_lock  = &btrfs_comp_ws[idx].ws_lock;
 -              total_ws = &btrfs_comp_ws[idx].total_ws;
 -              ws_wait  = &btrfs_comp_ws[idx].ws_wait;
 -              free_ws  = &btrfs_comp_ws[idx].free_ws;
 -      }
 +      idle_ws  = &wsm->idle_ws;
 +      ws_lock  = &wsm->ws_lock;
 +      total_ws = &wsm->total_ws;
 +      ws_wait  = &wsm->ws_wait;
 +      free_ws  = &wsm->free_ws;
  
  again:
        spin_lock(ws_lock);
         * context of btrfs_compress_bio/btrfs_compress_pages
         */
        nofs_flag = memalloc_nofs_save();
 -      if (heuristic)
 -              workspace = alloc_heuristic_ws();
 -      else
 -              workspace = btrfs_compress_op[idx]->alloc_workspace();
 +      workspace = wsm->ops->alloc_workspace(level);
        memalloc_nofs_restore(nofs_flag);
  
        if (IS_ERR(workspace)) {
        return workspace;
  }
  
 -static struct list_head *find_workspace(int type)
 +static struct list_head *get_workspace(int type, int level)
  {
 -      return __find_workspace(type, false);
 +      return btrfs_compress_op[type]->get_workspace(level);
  }
  
  /*
   * put a workspace struct back on the list or free it if we have enough
   * idle ones sitting around
   */
 -static void __free_workspace(int type, struct list_head *workspace,
 -                           bool heuristic)
 +void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
  {
 -      int idx = type - 1;
        struct list_head *idle_ws;
        spinlock_t *ws_lock;
        atomic_t *total_ws;
        wait_queue_head_t *ws_wait;
        int *free_ws;
  
 -      if (heuristic) {
 -              idle_ws  = &btrfs_heuristic_ws.idle_ws;
 -              ws_lock  = &btrfs_heuristic_ws.ws_lock;
 -              total_ws = &btrfs_heuristic_ws.total_ws;
 -              ws_wait  = &btrfs_heuristic_ws.ws_wait;
 -              free_ws  = &btrfs_heuristic_ws.free_ws;
 -      } else {
 -              idle_ws  = &btrfs_comp_ws[idx].idle_ws;
 -              ws_lock  = &btrfs_comp_ws[idx].ws_lock;
 -              total_ws = &btrfs_comp_ws[idx].total_ws;
 -              ws_wait  = &btrfs_comp_ws[idx].ws_wait;
 -              free_ws  = &btrfs_comp_ws[idx].free_ws;
 -      }
 +      idle_ws  = &wsm->idle_ws;
 +      ws_lock  = &wsm->ws_lock;
 +      total_ws = &wsm->total_ws;
 +      ws_wait  = &wsm->ws_wait;
 +      free_ws  = &wsm->free_ws;
  
        spin_lock(ws_lock);
        if (*free_ws <= num_online_cpus()) {
 -              list_add(workspace, idle_ws);
 +              list_add(ws, idle_ws);
                (*free_ws)++;
                spin_unlock(ws_lock);
                goto wake;
        }
        spin_unlock(ws_lock);
  
 -      if (heuristic)
 -              free_heuristic_ws(workspace);
 -      else
 -              btrfs_compress_op[idx]->free_workspace(workspace);
 +      wsm->ops->free_workspace(ws);
        atomic_dec(total_ws);
  wake:
        cond_wake_up(ws_wait);
  }
  
 -static void free_workspace(int type, struct list_head *ws)
 +static void put_workspace(int type, struct list_head *ws)
  {
 -      return __free_workspace(type, ws, false);
 -}
 -
 -/*
 - * cleanup function for module exit
 - */
 -static void free_workspaces(void)
 -{
 -      struct list_head *workspace;
 -      int i;
 -
 -      while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
 -              workspace = btrfs_heuristic_ws.idle_ws.next;
 -              list_del(workspace);
 -              free_heuristic_ws(workspace);
 -              atomic_dec(&btrfs_heuristic_ws.total_ws);
 -      }
 -
 -      for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 -              while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
 -                      workspace = btrfs_comp_ws[i].idle_ws.next;
 -                      list_del(workspace);
 -                      btrfs_compress_op[i]->free_workspace(workspace);
 -                      atomic_dec(&btrfs_comp_ws[i].total_ws);
 -              }
 -      }
 +      return btrfs_compress_op[type]->put_workspace(ws);
  }
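/*
 * Illustrative sketch (not part of the diff): the get_workspace()/put_workspace()
 * wrappers above simply dispatch through a per-type ops table, with the heuristic
 * occupying slot 0.  A minimal, self-contained userspace model of that dispatch,
 * with every name (demo_ops, demo_get, ...) invented for the example:
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_ops {
	const char *name;
	void *(*get_workspace)(unsigned int level);
	void (*put_workspace)(void *ws);
};

static void *demo_get(unsigned int level)
{
	(void)level;
	/* The kernel pulls from a per-manager idle list; a plain allocation
	 * is enough for the sketch. */
	return malloc(64);
}

static void demo_put(void *ws)
{
	free(ws);
}

/* Slot 0 models the heuristic "compression type", the rest model zlib/lzo/zstd. */
static const struct demo_ops demo_table[] = {
	{ "heuristic", demo_get, demo_put },
	{ "zlib",      demo_get, demo_put },
	{ "lzo",       demo_get, demo_put },
	{ "zstd",      demo_get, demo_put },
};

int main(void)
{
	unsigned int type = 1;	/* zlib */
	void *ws = demo_table[type].get_workspace(3);

	printf("got a %s workspace\n", demo_table[type].name);
	demo_table[type].put_workspace(ws);
	return 0;
}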
  
  /*
@@@ -1003,17 -1037,18 +1004,17 @@@ int btrfs_compress_pages(unsigned int t
                         unsigned long *total_in,
                         unsigned long *total_out)
  {
 +      int type = btrfs_compress_type(type_level);
 +      int level = btrfs_compress_level(type_level);
        struct list_head *workspace;
        int ret;
 -      int type = type_level & 0xF;
 -
 -      workspace = find_workspace(type);
  
 -      btrfs_compress_op[type - 1]->set_level(workspace, type_level);
 -      ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 +      workspace = get_workspace(type, level);
 +      ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
                                                      start, pages,
                                                      out_pages,
                                                      total_in, total_out);
 -      free_workspace(type, workspace);
 +      put_workspace(type, workspace);
        return ret;
  }
  
@@@ -1037,9 -1072,9 +1038,9 @@@ static int btrfs_decompress_bio(struct 
        int ret;
        int type = cb->compress_type;
  
 -      workspace = find_workspace(type);
 -      ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
 -      free_workspace(type, workspace);
 +      workspace = get_workspace(type, 0);
 +      ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
 +      put_workspace(type, workspace);
  
        return ret;
  }
@@@ -1055,29 -1090,19 +1056,29 @@@ int btrfs_decompress(int type, unsigne
        struct list_head *workspace;
        int ret;
  
 -      workspace = find_workspace(type);
 -
 -      ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
 +      workspace = get_workspace(type, 0);
 +      ret = btrfs_compress_op[type]->decompress(workspace, data_in,
                                                  dest_page, start_byte,
                                                  srclen, destlen);
 +      put_workspace(type, workspace);
  
 -      free_workspace(type, workspace);
        return ret;
  }
  
 +void __init btrfs_init_compress(void)
 +{
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
 +              btrfs_compress_op[i]->init_workspace_manager();
 +}
 +
  void __cold btrfs_exit_compress(void)
  {
 -      free_workspaces();
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
 +              btrfs_compress_op[i]->cleanup_workspace_manager();
  }
  
  /*
@@@ -1488,7 -1513,7 +1489,7 @@@ static void heuristic_collect_sample(st
   */
  int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
  {
 -      struct list_head *ws_list = __find_workspace(0, true);
 +      struct list_head *ws_list = get_workspace(0, 0);
        struct heuristic_ws *ws;
        u32 i;
        u8 byte;
        }
  
  out:
 -      __free_workspace(0, ws_list, true);
 +      put_workspace(0, ws_list);
        return ret;
  }
  
 -unsigned int btrfs_compress_str2level(const char *str)
 +/*
 + * Convert the compression suffix (e.g. after "zlib", starting with ":") to a
 + * level; an unrecognized string will set the default level
 + */
 +unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
  {
 -      if (strncmp(str, "zlib", 4) != 0)
 +      unsigned int level = 0;
 +      int ret;
 +
 +      if (!type)
                return 0;
  
 -      /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
 -      if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
 -              return str[5] - '0';
 +      if (str[0] == ':') {
 +              ret = kstrtouint(str + 1, 10, &level);
 +              if (ret)
 +                      level = 0;
 +      }
 +
 +      level = btrfs_compress_op[type]->set_level(level);
  
 -      return BTRFS_ZLIB_DEFAULT_LEVEL;
 +      return level;
  }
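The new btrfs_compress_str2level() above parses an optional ":<number>" suffix and falls back to the type's default when the suffix is missing or malformed. A rough, self-contained userspace model of that parsing (strtoul standing in for kstrtouint, and an invented demo_clamp_level() standing in for the per-type ->set_level() callback) might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the per-type ->set_level() callback: clamp to a sane range. */
static unsigned int demo_clamp_level(unsigned int level)
{
	if (level == 0)
		return 3;		/* pretend 3 is the type's default */
	return level > 9 ? 9 : level;
}

/* Parse the part after the algorithm name, e.g. ":7" in "zlib:7". */
static unsigned int demo_str2level(const char *str)
{
	unsigned int level = 0;

	if (str[0] == ':') {
		char *end;
		unsigned long v = strtoul(str + 1, &end, 10);

		if (*end == '\0' && v <= 9)
			level = (unsigned int)v;
		/* malformed suffix: keep level == 0 and use the default */
	}
	return demo_clamp_level(level);
}

int main(void)
{
	printf("%u %u %u\n", demo_str2level(":7"),
	       demo_str2level(":abc"), demo_str2level(""));
	return 0;
}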
diff --combined fs/btrfs/disk-io.c
index 5216e7b3f9ada29308a1c6fc5cf75b816032aded,ca1b7da6dd1b927e54fe84ef12c04ec56197875f..f0cdb53f3e2dc86f0dee6080d3b719482214170d
@@@ -17,7 -17,6 +17,7 @@@
  #include <linux/semaphore.h>
  #include <linux/error-injection.h>
  #include <linux/crc32c.h>
 +#include <linux/sched/mm.h>
  #include <asm/unaligned.h>
  #include "ctree.h"
  #include "disk-io.h"
@@@ -342,7 -341,7 +342,7 @@@ static int verify_parent_transid(struc
  
        if (need_lock) {
                btrfs_tree_read_lock(eb);
 -              btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 +              btrfs_set_lock_blocking_read(eb);
        }
  
        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@@ -833,9 -832,10 +833,10 @@@ static blk_status_t btree_csum_one_bio(
        struct bio_vec *bvec;
        struct btrfs_root *root;
        int i, ret = 0;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
                ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
                if (ret)
@@@ -1121,7 -1121,7 +1122,7 @@@ void clean_tree_block(struct btrfs_fs_i
                                                 -buf->len,
                                                 fs_info->dirty_metadata_batch);
                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
 -                      btrfs_set_lock_blocking(buf);
 +                      btrfs_set_lock_blocking_write(buf);
                        clear_extent_buffer_dirty(buf);
                }
        }
@@@ -1176,7 -1176,6 +1177,7 @@@ static void __setup_root(struct btrfs_r
        INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->ordered_extents);
        INIT_LIST_HEAD(&root->ordered_root);
 +      INIT_LIST_HEAD(&root->reloc_dirty_list);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->inode_lock);
        root->anon_dev = 0;
  
        spin_lock_init(&root->root_item_lock);
 +      btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
  }
  
  static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@@ -1261,17 -1259,10 +1262,17 @@@ struct btrfs_root *btrfs_create_tree(st
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root;
        struct btrfs_key key;
 +      unsigned int nofs_flag;
        int ret = 0;
        uuid_le uuid = NULL_UUID_LE;
  
 +      /*
 +       * We're holding a transaction handle, so use a NOFS memory allocation
 +       * context to avoid deadlock if reclaim happens.
 +       */
 +      nofs_flag = memalloc_nofs_save();
        root = btrfs_alloc_root(fs_info, GFP_KERNEL);
 +      memalloc_nofs_restore(nofs_flag);
        if (!root)
                return ERR_PTR(-ENOMEM);
  
@@@ -1717,7 -1708,9 +1718,7 @@@ static int cleaner_kthread(void *arg
                        goto sleep;
                }
  
 -              mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(fs_info);
 -              mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
  
                again = btrfs_clean_one_deleted_snapshot(root);
                mutex_unlock(&fs_info->cleaner_mutex);
@@@ -2109,7 -2102,7 +2110,7 @@@ static void btrfs_init_scrub(struct btr
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);
        init_waitqueue_head(&fs_info->scrub_pause_wait);
 -      fs_info->scrub_workers_refcnt = 0;
 +      refcount_set(&fs_info->scrub_workers_refcnt, 0);
  }
  
  static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@@ -2674,6 -2667,7 +2675,6 @@@ int open_ctree(struct super_block *sb
        mutex_init(&fs_info->delete_unused_bgs_mutex);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
 -      mutex_init(&fs_info->cleaner_delayed_iput_mutex);
        seqlock_init(&fs_info->profiles_lock);
  
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        atomic_set(&fs_info->defrag_running, 0);
        atomic_set(&fs_info->qgroup_op_seq, 0);
        atomic_set(&fs_info->reada_works_cnt, 0);
 +      atomic_set(&fs_info->nr_delayed_iputs, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
        init_waitqueue_head(&fs_info->transaction_wait);
        init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
 +      init_waitqueue_head(&fs_info->delayed_iputs_wait);
  
        INIT_LIST_HEAD(&fs_info->pinned_chunks);
  
@@@ -4247,9 -4239,16 +4248,9 @@@ static int btrfs_destroy_delayed_refs(s
  
                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
 -              if (!mutex_trylock(&head->mutex)) {
 -                      refcount_inc(&head->refs);
 -                      spin_unlock(&delayed_refs->lock);
 -
 -                      mutex_lock(&head->mutex);
 -                      mutex_unlock(&head->mutex);
 -                      btrfs_put_delayed_ref_head(head);
 -                      spin_lock(&delayed_refs->lock);
 +              if (btrfs_delayed_ref_lock(delayed_refs, head))
                        continue;
 -              }
 +
                spin_lock(&head->lock);
                while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
                        ref = rb_entry(n, struct btrfs_delayed_ref_node,
                if (head->must_insert_reserved)
                        pin_bytes = true;
                btrfs_free_delayed_extent_op(head->extent_op);
 -              delayed_refs->num_heads--;
 -              if (head->processing == 0)
 -                      delayed_refs->num_heads_ready--;
 -              atomic_dec(&delayed_refs->num_entries);
 -              rb_erase_cached(&head->href_node, &delayed_refs->href_root);
 -              RB_CLEAR_NODE(&head->href_node);
 +              btrfs_delete_ref_head(delayed_refs, head);
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                mutex_unlock(&head->mutex);
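The btree_csum_one_bio() hunk above shows the call-site pattern used throughout this merge for the multi-page bvec work: every bio_for_each_segment_all() caller now declares a struct bvec_iter_all and passes it as an extra argument so the iterator can step through the individual pages of a multi-page bvec. Schematically, a converted completion handler has this shape (a hypothetical handler in kernel style, not compilable on its own):

static void example_end_io(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;	/* new: per-page iteration state */
	int i;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	/* iter_all lets the macro walk every page inside a multi-page bvec */
	bio_for_each_segment_all(bvec, bio, i, iter_all)
		SetPageChecked(bvec->bv_page);

	bio_endio(bio);
}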
diff --combined fs/btrfs/extent_io.c
index ca259c75bbcd1a32f462cfada377362ddfa2c8ca,4ed58c9a94a9916959e304d28f77e9002b64e8df..ab705183d749709f004e28c2f02b56243f1b23ff
@@@ -147,38 -147,7 +147,39 @@@ static int add_extent_changeset(struct 
        return ret;
  }
  
 -static void flush_write_bio(struct extent_page_data *epd);
 +static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 +                                     unsigned long bio_flags)
 +{
 +      blk_status_t ret = 0;
 +      struct bio_vec *bvec = bio_last_bvec_all(bio);
-       struct page *page = bvec->bv_page;
++      struct bio_vec bv;
 +      struct extent_io_tree *tree = bio->bi_private;
 +      u64 start;
 +
-       start = page_offset(page) + bvec->bv_offset;
++      mp_bvec_last_segment(bvec, &bv);
++      start = page_offset(bv.bv_page) + bv.bv_offset;
 +
 +      bio->bi_private = NULL;
 +
 +      if (tree->ops)
 +              ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 +                                         mirror_num, bio_flags, start);
 +      else
 +              btrfsic_submit_bio(bio);
 +
 +      return blk_status_to_errno(ret);
 +}
 +
 +static void flush_write_bio(struct extent_page_data *epd)
 +{
 +      if (epd->bio) {
 +              int ret;
 +
 +              ret = submit_one_bio(epd->bio, 0, 0);
 +              BUG_ON(ret < 0); /* -ENOMEM */
 +              epd->bio = NULL;
 +      }
 +}
  
  int __init extent_io_init(void)
  {
@@@ -312,8 -281,8 +313,8 @@@ do_insert
  }
  
  static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 -                                    struct rb_node **prev_ret,
                                      struct rb_node **next_ret,
 +                                    struct rb_node **prev_ret,
                                      struct rb_node ***p_ret,
                                      struct rb_node **parent_ret)
  {
        if (parent_ret)
                *parent_ret = prev;
  
 -      if (prev_ret) {
 +      if (next_ret) {
                orig_prev = prev;
                while (prev && offset > prev_entry->end) {
                        prev = rb_next(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
 -              *prev_ret = prev;
 +              *next_ret = prev;
                prev = orig_prev;
        }
  
 -      if (next_ret) {
 +      if (prev_ret) {
                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                while (prev && offset < prev_entry->start) {
                        prev = rb_prev(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
 -              *next_ret = prev;
 +              *prev_ret = prev;
        }
        return NULL;
  }
@@@ -369,12 -338,12 +370,12 @@@ tree_search_for_insert(struct extent_io
                       struct rb_node ***p_ret,
                       struct rb_node **parent_ret)
  {
 -      struct rb_node *prev = NULL;
 +      struct rb_node *next = NULL;
        struct rb_node *ret;
  
 -      ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
 +      ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
        if (!ret)
 -              return prev;
 +              return next;
        return ret;
  }
  
@@@ -616,6 -585,7 +617,6 @@@ int __clear_extent_bit(struct extent_io
  
        if (delete)
                bits |= ~EXTENT_CTLBITS;
 -      bits |= EXTENT_FIRST_DELALLOC;
  
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
@@@ -880,6 -850,7 +881,6 @@@ __set_extent_bit(struct extent_io_tree 
  
        btrfs_debug_check_extent_io_range(tree, start, end);
  
 -      bits |= EXTENT_FIRST_DELALLOC;
  again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
@@@ -2379,7 -2350,7 +2380,7 @@@ static int bio_readpage_error(struct bi
        int read_mode = 0;
        blk_status_t status;
        int ret;
-       unsigned failed_bio_pages = bio_pages_all(failed_bio);
+       unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
  
        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
  
@@@ -2451,9 -2422,10 +2452,10 @@@ static void end_bio_extent_writepage(st
        u64 start;
        u64 end;
        int i;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
                struct inode *inode = page->mapping->host;
                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@@ -2522,9 -2494,10 +2524,10 @@@ static void end_bio_extent_readpage(str
        int mirror;
        int ret;
        int i;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
                struct inode *inode = page->mapping->host;
                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@@ -2721,6 -2694,29 +2724,6 @@@ struct bio *btrfs_bio_clone_partial(str
        return bio;
  }
  
 -static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 -                                     unsigned long bio_flags)
 -{
 -      blk_status_t ret = 0;
 -      struct bio_vec *bvec = bio_last_bvec_all(bio);
 -      struct bio_vec bv;
 -      struct extent_io_tree *tree = bio->bi_private;
 -      u64 start;
 -
 -      mp_bvec_last_segment(bvec, &bv);
 -      start = page_offset(bv.bv_page) + bv.bv_offset;
 -
 -      bio->bi_private = NULL;
 -
 -      if (tree->ops)
 -              ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 -                                         mirror_num, bio_flags, start);
 -      else
 -              btrfsic_submit_bio(bio);
 -
 -      return blk_status_to_errno(ret);
 -}
 -
  /*
   * @opf:      bio REQ_OP_* and REQ_* flags as one value
   * @tree:     tree so we can call our merge_bio hook
@@@ -3641,9 -3637,10 +3644,10 @@@ static void end_bio_extent_buffer_write
        struct bio_vec *bvec;
        struct extent_buffer *eb;
        int i, done;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
  
                eb = (struct extent_buffer *)page->private;
@@@ -4014,6 -4011,17 +4018,6 @@@ retry
        return ret;
  }
  
 -static void flush_write_bio(struct extent_page_data *epd)
 -{
 -      if (epd->bio) {
 -              int ret;
 -
 -              ret = submit_one_bio(epd->bio, 0, 0);
 -              BUG_ON(ret < 0); /* -ENOMEM */
 -              epd->bio = NULL;
 -      }
 -}
 -
  int extent_write_full_page(struct page *page, struct writeback_control *wbc)
  {
        int ret;
@@@ -4255,7 -4263,8 +4259,7 @@@ static struct extent_map *get_extent_sk
                if (len == 0)
                        break;
                len = ALIGN(len, sectorsize);
 -              em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
 -                              len, 0);
 +              em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
                if (IS_ERR_OR_NULL(em))
                        return em;
  
diff --combined fs/btrfs/inode.c
index 3f180b857e202bc628a65ff0955606f880193de5,7ade5769f6915acfa1acfbb492db1d926c4df594..82fdda8ff5ab82b5298c4b72859e697d8bd1a3d5
@@@ -453,6 -453,7 +453,6 @@@ static noinline void compress_file_rang
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
 -      u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);
  
 -      actual_end = min_t(u64, isize, end + 1);
 +      actual_end = min_t(u64, i_size_read(inode), end + 1);
  again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@@ -713,9 -714,9 +713,9 @@@ static void free_async_extent_pages(str
   * queued.  We walk all the async extents created by compress_file_range
   * and send them down to the disk.
   */
 -static noinline void submit_compressed_extents(struct inode *inode,
 -                                            struct async_cow *async_cow)
 +static noinline void submit_compressed_extents(struct async_cow *async_cow)
  {
 +      struct inode *inode = async_cow->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
@@@ -1165,14 -1166,8 +1165,14 @@@ static noinline void async_cow_submit(s
            5 * SZ_1M)
                cond_wake_up_nomb(&fs_info->async_submit_wait);
  
 +      /*
 +       * ->inode could be NULL if async_cow_start has failed to compress,
 +       * in which case we don't have anything to submit, yet we need to
 +       * always adjust ->async_delalloc_pages as it's paired with the init
 +       * happening in cow_file_range_async
 +       */
        if (async_cow->inode)
 -              submit_compressed_extents(async_cow->inode, async_cow);
 +              submit_compressed_extents(async_cow);
  }
  
  static noinline void async_cow_free(struct btrfs_work *work)
@@@ -1199,12 -1194,7 +1199,12 @@@ static int cow_file_range_async(struct 
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
 -              async_cow->inode = igrab(inode);
 +              /*
 +               * igrab is called higher up in the call chain, take only the
 +               * lightweight reference for the callback lifetime
 +               */
 +              ihold(inode);
 +              async_cow->inode = inode;
                async_cow->fs_info = fs_info;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
@@@ -1596,10 -1586,11 +1596,10 @@@ static inline int need_force_cow(struc
   * Function to process delayed allocation (create CoW) for ranges which are
   * being touched for the first time.
   */
 -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
 +int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
                u64 start, u64 end, int *page_started, unsigned long *nr_written,
                struct writeback_control *wbc)
  {
 -      struct inode *inode = private_data;
        int ret;
        int force_cow = need_force_cow(inode, start, end);
        unsigned int write_flags = wbc_to_write_flags(wbc);
@@@ -3256,7 -3247,6 +3256,7 @@@ void btrfs_add_delayed_iput(struct inod
        if (atomic_add_unless(&inode->i_count, -1, 1))
                return;
  
 +      atomic_inc(&fs_info->nr_delayed_iputs);
        spin_lock(&fs_info->delayed_iput_lock);
        ASSERT(list_empty(&binode->delayed_iput));
        list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@@@ -3277,32 -3267,11 +3277,32 @@@ void btrfs_run_delayed_iputs(struct btr
                list_del_init(&inode->delayed_iput);
                spin_unlock(&fs_info->delayed_iput_lock);
                iput(&inode->vfs_inode);
 +              if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
 +                      wake_up(&fs_info->delayed_iputs_wait);
                spin_lock(&fs_info->delayed_iput_lock);
        }
        spin_unlock(&fs_info->delayed_iput_lock);
  }
  
 +/**
 + * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
 + * @fs_info - the fs_info for this fs
 + * @return - EINTR if we were killed, 0 if nothing's pending
 + *
 + * This will wait on any delayed iputs that are currently running with KILLABLE
 + * set.  Once they are all done running we will return, unless we are killed in
 + * which case we return EINTR. This helps in user operations like fallocate etc
 + * that might get blocked on the iputs.
 + */
 +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
 +{
 +      int ret = wait_event_killable(fs_info->delayed_iputs_wait,
 +                      atomic_read(&fs_info->nr_delayed_iputs) == 0);
 +      if (ret)
 +              return -EINTR;
 +      return 0;
 +}
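/*
 * Userspace model (not kernel code) of the pairing above: one side bumps a
 * pending counter and later drops it, waking a wait queue when it reaches
 * zero, while btrfs_wait_on_delayed_iputs()-style callers sleep until that
 * happens.  The pthread-based names below are invented for the sketch.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static int nr_pending;

static void add_pending(void)			/* ~ btrfs_add_delayed_iput() */
{
	pthread_mutex_lock(&lock);
	nr_pending++;
	pthread_mutex_unlock(&lock);
}

static void complete_pending(void)		/* ~ btrfs_run_delayed_iputs() */
{
	pthread_mutex_lock(&lock);
	if (--nr_pending == 0)
		pthread_cond_broadcast(&waitq);	/* wake_up(&delayed_iputs_wait) */
	pthread_mutex_unlock(&lock);
}

static void wait_for_pending(void)		/* ~ btrfs_wait_on_delayed_iputs() */
{
	pthread_mutex_lock(&lock);
	while (nr_pending != 0)			/* wait until the count hits zero */
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	add_pending();
	complete_pending();
	wait_for_pending();
	printf("no delayed work pending\n");
	return 0;
}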
 +
  /*
   * This creates an orphan entry for the given inode in case something goes wrong
   * in the middle of an unlink.
@@@ -5293,15 -5262,13 +5293,15 @@@ static struct btrfs_trans_handle *evict
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 +      u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
        int failures = 0;
  
        for (;;) {
                struct btrfs_trans_handle *trans;
                int ret;
  
 -              ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
 +              ret = btrfs_block_rsv_refill(root, rsv,
 +                                           rsv->size + delayed_refs_extra,
                                             BTRFS_RESERVE_FLUSH_LIMIT);
  
                if (ret && ++failures > 2) {
                        return ERR_PTR(-ENOSPC);
                }
  
 +              /*
 +               * Evict can generate a large amount of delayed refs without
 +               * having a way to add space back since we exhaust our temporary
 +               * block rsv.  We aren't allowed to do FLUSH_ALL in this case
 +               * because we could deadlock with so many things in the flushing
 +               * code, so we have to try and hold some extra space to
 +               * compensate for our delayed ref generation.  If we can't get
 +               * that space then we need to see if we can steal our minimum from
 +               * the global reserve.  We will be ratelimited by the amount of
 +               * space we have for the delayed refs rsv, so we'll end up
 +               * committing and trying again.
 +               */
                trans = btrfs_join_transaction(root);
 -              if (IS_ERR(trans) || !ret)
 +              if (IS_ERR(trans) || !ret) {
 +                      if (!IS_ERR(trans)) {
 +                              trans->block_rsv = &fs_info->trans_block_rsv;
 +                              trans->bytes_reserved = delayed_refs_extra;
 +                              btrfs_block_rsv_migrate(rsv, trans->block_rsv,
 +                                                      delayed_refs_extra, 1);
 +                      }
                        return trans;
 +              }
  
                /*
                 * Try to steal from the global reserve if there is space for
@@@ -6783,7 -6731,7 +6783,7 @@@ struct extent_map *btrfs_get_extent(str
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
 -      u32 found_type;
 +      u8 extent_type;
        struct btrfs_path *path = NULL;
        struct btrfs_root *root = inode->root;
        struct btrfs_file_extent_item *item;
        if (ret < 0) {
                err = ret;
                goto out;
 -      }
 -
 -      if (ret != 0) {
 +      } else if (ret > 0) {
                if (path->slots[0] == 0)
                        goto not_found;
                path->slots[0]--;
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_file_extent_item);
 -      /* are we inside the extent that was found? */
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 -      found_type = found_key.type;
        if (found_key.objectid != objectid ||
 -          found_type != BTRFS_EXTENT_DATA_KEY) {
 +          found_key.type != BTRFS_EXTENT_DATA_KEY) {
                /*
                 * If we backup past the first extent we want to move forward
                 * and see if there is an extent in front of us, otherwise we'll
                goto next;
        }
  
 -      found_type = btrfs_file_extent_type(leaf, item);
 +      extent_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
 -      if (found_type == BTRFS_FILE_EXTENT_REG ||
 -          found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 +      if (extent_type == BTRFS_FILE_EXTENT_REG ||
 +          extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
                       btrfs_file_extent_num_bytes(leaf, item);
  
                trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
                                                       extent_start);
 -      } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 +      } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                size_t size;
  
                size = btrfs_file_extent_ram_bytes(leaf, item);
@@@ -6888,9 -6840,9 +6888,9 @@@ next
                        if (ret < 0) {
                                err = ret;
                                goto out;
 -                      }
 -                      if (ret > 0)
 +                      } else if (ret > 0) {
                                goto not_found;
 +                      }
                        leaf = path->nodes[0];
                }
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                        goto not_found;
                if (start > found_key.offset)
                        goto next;
 +
 +              /* New extent overlaps with existing one */
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
 -              goto not_found_em;
 +              em->block_start = EXTENT_MAP_HOLE;
 +              goto insert;
        }
  
        btrfs_extent_item_to_extent_map(inode, path, item,
                        new_inline, em);
  
 -      if (found_type == BTRFS_FILE_EXTENT_REG ||
 -          found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 +      if (extent_type == BTRFS_FILE_EXTENT_REG ||
 +          extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                goto insert;
 -      } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 +      } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                unsigned long ptr;
                char *map;
                size_t size;
@@@ -6967,6 -6916,7 +6967,6 @@@ not_found
        em->start = start;
        em->orig_start = start;
        em->len = len;
 -not_found_em:
        em->block_start = EXTENT_MAP_HOLE;
  insert:
        btrfs_release_path(path);
  }
  
  struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 -              struct page *page,
 -              size_t pg_offset, u64 start, u64 len,
 -              int create)
 +                                         u64 start, u64 len)
  {
        struct extent_map *em;
        struct extent_map *hole_em = NULL;
 -      u64 range_start = start;
 +      u64 delalloc_start = start;
        u64 end;
 -      u64 found;
 -      u64 found_end;
 +      u64 delalloc_len;
 +      u64 delalloc_end;
        int err = 0;
  
 -      em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
 +      em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em))
                return em;
        /*
        em = NULL;
  
        /* ok, we didn't find anything, lets look for delalloc */
 -      found = count_range_bits(&inode->io_tree, &range_start,
 +      delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
                                 end, len, EXTENT_DELALLOC, 1);
 -      found_end = range_start + found;
 -      if (found_end < range_start)
 -              found_end = (u64)-1;
 +      delalloc_end = delalloc_start + delalloc_len;
 +      if (delalloc_end < delalloc_start)
 +              delalloc_end = (u64)-1;
  
        /*
 -       * we didn't find anything useful, return
 -       * the original results from get_extent()
 +       * We didn't find anything useful, return the original results from
 +       * get_extent()
         */
 -      if (range_start > end || found_end <= start) {
 +      if (delalloc_start > end || delalloc_end <= start) {
                em = hole_em;
                hole_em = NULL;
                goto out;
        }
  
 -      /* adjust the range_start to make sure it doesn't
 -       * go backwards from the start they passed in
 +      /*
 +       * Adjust the delalloc_start to make sure it doesn't go backwards from
 +       * the start they passed in
         */
 -      range_start = max(start, range_start);
 -      found = found_end - range_start;
 +      delalloc_start = max(start, delalloc_start);
 +      delalloc_len = delalloc_end - delalloc_start;
  
 -      if (found > 0) {
 -              u64 hole_start = start;
 -              u64 hole_len = len;
 +      if (delalloc_len > 0) {
 +              u64 hole_start;
 +              u64 hole_len;
 +              const u64 hole_end = extent_map_end(hole_em);
  
                em = alloc_extent_map();
                if (!em) {
                        err = -ENOMEM;
                        goto out;
                }
 +              em->bdev = NULL;
 +
 +              ASSERT(hole_em);
                /*
 -               * when btrfs_get_extent can't find anything it
 -               * returns one huge hole
 +               * When btrfs_get_extent can't find anything it returns one
 +               * huge hole
                 *
 -               * make sure what it found really fits our range, and
 -               * adjust to make sure it is based on the start from
 -               * the caller
 +               * Make sure what it found really fits our range, and adjust to
 +               * make sure it is based on the start from the caller
                 */
 -              if (hole_em) {
 -                      u64 calc_end = extent_map_end(hole_em);
 -
 -                      if (calc_end <= start || (hole_em->start > end)) {
 -                              free_extent_map(hole_em);
 -                              hole_em = NULL;
 -                      } else {
 -                              hole_start = max(hole_em->start, start);
 -                              hole_len = calc_end - hole_start;
 -                      }
 +              if (hole_end <= start || hole_em->start > end) {
 +                     free_extent_map(hole_em);
 +                     hole_em = NULL;
 +              } else {
 +                     hole_start = max(hole_em->start, start);
 +                     hole_len = hole_end - hole_start;
                }
 -              em->bdev = NULL;
 -              if (hole_em && range_start > hole_start) {
 -                      /* our hole starts before our delalloc, so we
 -                       * have to return just the parts of the hole
 -                       * that go until  the delalloc starts
 +
 +              if (hole_em && delalloc_start > hole_start) {
 +                      /*
 +                       * Our hole starts before our delalloc, so we have to
 +                       * return just the parts of the hole that go until the
 +                       * delalloc starts
                         */
 -                      em->len = min(hole_len,
 -                                    range_start - hole_start);
 +                      em->len = min(hole_len, delalloc_start - hole_start);
                        em->start = hole_start;
                        em->orig_start = hole_start;
                        /*
 -                       * don't adjust block start at all,
 -                       * it is fixed at EXTENT_MAP_HOLE
 +                       * Don't adjust block start at all, it is fixed at
 +                       * EXTENT_MAP_HOLE
                         */
                        em->block_start = hole_em->block_start;
                        em->block_len = hole_len;
                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                } else {
 -                      em->start = range_start;
 -                      em->len = found;
 -                      em->orig_start = range_start;
 +                      /*
 +                       * Hole is out of passed range or it starts after
 +                       * delalloc range
 +                       */
 +                      em->start = delalloc_start;
 +                      em->len = delalloc_len;
 +                      em->orig_start = delalloc_start;
                        em->block_start = EXTENT_MAP_DELALLOC;
 -                      em->block_len = found;
 +                      em->block_len = delalloc_len;
                }
        } else {
                return hole_em;
@@@ -7829,6 -7777,7 +7829,7 @@@ static void btrfs_retry_endio_nocsum(st
        struct bio_vec *bvec;
        struct extent_io_tree *io_tree, *failure_tree;
        int i;
+       struct bvec_iter_all iter_all;
  
        if (bio->bi_status)
                goto end;
  
        done->uptodate = 1;
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i)
+       bio_for_each_segment_all(bvec, bio, i, iter_all)
                clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
                                 io_tree, done->start, bvec->bv_page,
                                 btrfs_ino(BTRFS_I(inode)), 0);
@@@ -7919,6 -7868,7 +7920,7 @@@ static void btrfs_retry_endio(struct bi
        int uptodate;
        int ret;
        int i;
+       struct bvec_iter_all iter_all;
  
        if (bio->bi_status)
                goto end;
        failure_tree = &BTRFS_I(inode)->io_failure_tree;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
                                             bvec->bv_offset, done->start,
                                             bvec->bv_len);
@@@ -9962,6 -9912,7 +9964,6 @@@ static struct btrfs_delalloc_work *btrf
        init_completion(&work->completion);
        INIT_LIST_HEAD(&work->list);
        work->inode = inode;
 -      WARN_ON_ONCE(!inode);
        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
                        btrfs_run_delalloc_work, NULL, NULL);
  
diff --combined fs/gfs2/lops.c
index 2295042bc6259887c87bfcb786e5d4bfc511662c,15deefeaafd0d68f912c4b4733aa5dee936da43d..8722c60b11feb478fe4dbf560f8afcdcda4e2dac
@@@ -17,7 -17,9 +17,7 @@@
  #include <linux/bio.h>
  #include <linux/fs.h>
  #include <linux/list_sort.h>
 -#include <linux/blkdev.h>
  
 -#include "bmap.h"
  #include "dir.h"
  #include "gfs2.h"
  #include "incore.h"
@@@ -168,7 -170,8 +168,8 @@@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp
   * that is pinned in the pagecache.
   */
  
- static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+                                 struct bio_vec *bvec,
                                  blk_status_t error)
  {
        struct buffer_head *bh, *next;
  /**
   * gfs2_end_log_write - end of i/o to the log
   * @bio: The bio
 + * @error: Status of i/o request
   *
   * Each bio_vec contains either data from the pagecache or data
   * relating to the log itself. Here we iterate over the bio_vec
@@@ -207,6 -209,7 +208,7 @@@ static void gfs2_end_log_write(struct b
        struct bio_vec *bvec;
        struct page *page;
        int i;
+       struct bvec_iter_all iter_all;
  
        if (bio->bi_status) {
                fs_err(sdp, "Error %d writing to journal, jid=%u\n",
                wake_up(&sdp->sd_logd_waitq);
        }
  
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                page = bvec->bv_page;
                if (page_has_buffers(page))
                        gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
  /**
   * gfs2_log_submit_bio - Submit any pending log bio
   * @biop: Address of the bio pointer
 - * @opf: REQ_OP | op_flags
 + * @op: REQ_OP
 + * @op_flags: req_flag_bits
   *
   * Submit any pending part-built or full bio to the block device. If
   * there is no pending bio, then this is a no-op.
   */
  
 -void gfs2_log_submit_bio(struct bio **biop, int opf)
 +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags)
  {
        struct bio *bio = *biop;
        if (bio) {
                struct gfs2_sbd *sdp = bio->bi_private;
                atomic_inc(&sdp->sd_log_in_flight);
 -              bio->bi_opf = opf;
 +              bio_set_op_attrs(bio, op, op_flags);
                submit_bio(bio);
                *biop = NULL;
        }
@@@ -304,7 -306,7 +306,7 @@@ static struct bio *gfs2_log_get_bio(str
                nblk >>= sdp->sd_fsb2bb_shift;
                if (blkno == nblk && !flush)
                        return bio;
 -              gfs2_log_submit_bio(biop, op);
 +              gfs2_log_submit_bio(biop, op, 0);
        }
  
        *biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
@@@ -375,6 -377,185 +377,6 @@@ void gfs2_log_write_page(struct gfs2_sb
                       gfs2_log_bmap(sdp));
  }
  
 -/**
 - * gfs2_end_log_read - end I/O callback for reads from the log
 - * @bio: The bio
 - *
 - * Simply unlock the pages in the bio. The main thread will wait on them and
 - * process them in order as necessary.
 - */
 -
 -static void gfs2_end_log_read(struct bio *bio)
 -{
 -      struct page *page;
 -      struct bio_vec *bvec;
 -      int i;
 -      struct bvec_iter_all iter_all;
 -
 -      bio_for_each_segment_all(bvec, bio, i, iter_all) {
 -              page = bvec->bv_page;
 -              if (bio->bi_status) {
 -                      int err = blk_status_to_errno(bio->bi_status);
 -
 -                      SetPageError(page);
 -                      mapping_set_error(page->mapping, err);
 -              }
 -              unlock_page(page);
 -      }
 -
 -      bio_put(bio);
 -}
 -
 -/**
 - * gfs2_jhead_pg_srch - Look for the journal head in a given page.
 - * @jd: The journal descriptor
 - * @page: The page to look in
 - *
 - * Returns: 1 if found, 0 otherwise.
 - */
 -
 -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
 -                            struct gfs2_log_header_host *head,
 -                            struct page *page)
 -{
 -      struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 -      struct gfs2_log_header_host uninitialized_var(lh);
 -      void *kaddr = kmap_atomic(page);
 -      unsigned int offset;
 -      bool ret = false;
 -
 -      for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
 -              if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
 -                      if (lh.lh_sequence > head->lh_sequence)
 -                              *head = lh;
 -                      else {
 -                              ret = true;
 -                              break;
 -                      }
 -              }
 -      }
 -      kunmap_atomic(kaddr);
 -      return ret;
 -}
 -
 -/**
 - * gfs2_jhead_process_page - Search/cleanup a page
 - * @jd: The journal descriptor
 - * @index: Index of the page to look into
 - * @done: If set, perform only cleanup, else search and set if found.
 - *
 - * Find the page with 'index' in the journal's mapping. Search the page for
 - * the journal head if requested (cleanup == false). Release refs on the
 - * page so the page cache can reclaim it (put_page() twice). We grabbed a
 - * reference on this page two times, first when we did a find_or_create_page()
 - * to obtain the page to add it to the bio and second when we do a
 - * find_get_page() here to get the page to wait on while I/O on it is being
 - * completed.
 - * This function is also used to free up a page we might've grabbed but not
 - * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
 - * submitted the I/O, but we already found the jhead so we only need to drop
 - * our references to the page.
 - */
 -
 -static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
 -                                  struct gfs2_log_header_host *head,
 -                                  bool *done)
 -{
 -      struct page *page;
 -
 -      page = find_get_page(jd->jd_inode->i_mapping, index);
 -      wait_on_page_locked(page);
 -
 -      if (PageError(page))
 -              *done = true;
 -
 -      if (!*done)
 -              *done = gfs2_jhead_pg_srch(jd, head, page);
 -
 -      put_page(page); /* Once for find_get_page */
 -      put_page(page); /* Once more for find_or_create_page */
 -}
 -
 -/**
 - * gfs2_find_jhead - find the head of a log
 - * @jd: The journal descriptor
 - * @head: The log descriptor for the head of the log is returned here
 - *
 - * Do a search of a journal by reading it in large chunks using bios and find
 - * the valid log entry with the highest sequence number.  (i.e. the log head)
 - *
 - * Returns: 0 on success, errno otherwise
 - */
 -
 -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
 -{
 -      struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 -      struct address_space *mapping = jd->jd_inode->i_mapping;
 -      struct gfs2_journal_extent *je;
 -      u32 block, read_idx = 0, submit_idx = 0, index = 0;
 -      int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
 -      int blocks_per_page = 1 << shift, sz, ret = 0;
 -      struct bio *bio = NULL;
 -      struct page *page;
 -      bool done = false;
 -      errseq_t since;
 -
 -      memset(head, 0, sizeof(*head));
 -      if (list_empty(&jd->extent_list))
 -              gfs2_map_journal_extents(sdp, jd);
 -
 -      since = filemap_sample_wb_err(mapping);
 -      list_for_each_entry(je, &jd->extent_list, list) {
 -              for (block = 0; block < je->blocks; block += blocks_per_page) {
 -                      index = (je->lblock + block) >> shift;
 -
 -                      page = find_or_create_page(mapping, index, GFP_NOFS);
 -                      if (!page) {
 -                              ret = -ENOMEM;
 -                              done = true;
 -                              goto out;
 -                      }
 -
 -                      if (bio) {
 -                              sz = bio_add_page(bio, page, PAGE_SIZE, 0);
 -                              if (sz == PAGE_SIZE)
 -                                      goto page_added;
 -                              submit_idx = index;
 -                              submit_bio(bio);
 -                              bio = NULL;
 -                      }
 -
 -                      bio = gfs2_log_alloc_bio(sdp,
 -                                               je->dblock + (index << shift),
 -                                               gfs2_end_log_read);
 -                      bio->bi_opf = REQ_OP_READ;
 -                      sz = bio_add_page(bio, page, PAGE_SIZE, 0);
 -                      gfs2_assert_warn(sdp, sz == PAGE_SIZE);
 -
 -page_added:
 -                      if (submit_idx <= read_idx + BIO_MAX_PAGES) {
 -                              /* Keep at least one bio in flight */
 -                              continue;
 -                      }
 -
 -                      gfs2_jhead_process_page(jd, read_idx++, head, &done);
 -                      if (done)
 -                              goto out;  /* found */
 -              }
 -      }
 -
 -out:
 -      if (bio)
 -              submit_bio(bio);
 -      while (read_idx <= index)
 -              gfs2_jhead_process_page(jd, read_idx++, head, &done);
 -
 -      if (!ret)
 -              ret = filemap_check_wb_err(mapping, since);
 -
 -      return ret;
 -}
 -
  static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
                                      u32 ld_length, u32 ld_data1)
  {
diff --combined fs/xfs/xfs_aops.c
index 7b8bb6bde981028ad692fa07c87b0e4911bf3436,55f3e194a8522b748196128a8aee06e1e0686be6..3619e9e8d359e839b8ac88b633a52eedff360301
@@@ -28,8 -28,7 +28,8 @@@
   */
  struct xfs_writepage_ctx {
        struct xfs_bmbt_irec    imap;
 -      unsigned int            io_type;
 +      int                     fork;
 +      unsigned int            data_seq;
        unsigned int            cow_seq;
        struct xfs_ioend        *ioend;
  };
@@@ -63,7 -62,7 +63,7 @@@ xfs_find_daxdev_for_inode
  static void
  xfs_finish_page_writeback(
        struct inode            *inode,
-       struct bio_vec          *bvec,
+       struct bio_vec  *bvec,
        int                     error)
  {
        struct iomap_page       *iop = to_iomap_page(bvec->bv_page);
@@@ -99,6 -98,7 +99,7 @@@ xfs_destroy_ioend
        for (bio = &ioend->io_inline_bio; bio; bio = next) {
                struct bio_vec  *bvec;
                int             i;
+               struct bvec_iter_all iter_all;
  
                /*
                 * For the last bio, bi_private points to the ioend, so we
                        next = bio->bi_private;
  
                /* walk each page on bio, ending page IO on them */
-               bio_for_each_segment_all(bvec, bio, i)
+               bio_for_each_segment_all(bvec, bio, i, iter_all)
                        xfs_finish_page_writeback(inode, bvec, error);
                bio_put(bio);
        }
@@@ -256,20 -256,30 +257,20 @@@ xfs_end_io
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
 -              switch (ioend->io_type) {
 -              case XFS_IO_COW:
 +              if (ioend->io_fork == XFS_COW_FORK)
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
 -                      break;
 -              }
 -
                goto done;
        }
  
        /*
 -       * Success:  commit the COW or unwritten blocks if needed.
 +       * Success: commit the COW or unwritten blocks if needed.
         */
 -      switch (ioend->io_type) {
 -      case XFS_IO_COW:
 +      if (ioend->io_fork == XFS_COW_FORK)
                error = xfs_reflink_end_cow(ip, offset, size);
 -              break;
 -      case XFS_IO_UNWRITTEN:
 -              /* writeback should never update isize */
 +      else if (ioend->io_state == XFS_EXT_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
 -              break;
 -      default:
 +      else
                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
 -              break;
 -      }
  
  done:
        if (ioend->io_append_trans)
@@@ -284,8 -294,7 +285,8 @@@ xfs_end_bio
        struct xfs_ioend        *ioend = bio->bi_private;
        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
  
 -      if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 +      if (ioend->io_fork == XFS_COW_FORK ||
 +          ioend->io_state == XFS_EXT_UNWRITTEN)
                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
        else if (ioend->io_append_trans)
                queue_work(mp->m_data_workqueue, &ioend->io_work);
                xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
  }
  
 +/*
 + * Fast revalidation of the cached writeback mapping. Return true if the current
 + * mapping is valid, false otherwise.
 + */
 +static bool
 +xfs_imap_valid(
 +      struct xfs_writepage_ctx        *wpc,
 +      struct xfs_inode                *ip,
 +      xfs_fileoff_t                   offset_fsb)
 +{
 +      if (offset_fsb < wpc->imap.br_startoff ||
 +          offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
 +              return false;
 +      /*
 +       * If this is a COW mapping, it is sufficient to check that the mapping
 +       * covers the offset. Be careful to check this first because the caller
 +       * can revalidate a COW mapping without updating the data seqno.
 +       */
 +      if (wpc->fork == XFS_COW_FORK)
 +              return true;
 +
 +      /*
 +       * This is not a COW mapping. Check the sequence number of the data fork
 +       * because concurrent changes could have invalidated the extent. Check
 +       * the COW fork because concurrent changes since the last time we
 +       * checked (and found nothing at this offset) could have added
 +       * overlapping blocks.
 +       */
 +      if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
 +              return false;
 +      if (xfs_inode_has_cow_data(ip) &&
 +          wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
 +              return false;
 +      return true;
 +}
 +
 +/*
 + * Pass in a delalloc extent and convert it to real extents, return the real
 + * extent that maps offset_fsb in wpc->imap.
 + *
 + * The current page is held locked so nothing could have removed the block
 + * backing offset_fsb, although it could have moved from the COW to the data
 + * fork by another thread.
 + */
 +static int
 +xfs_convert_blocks(
 +      struct xfs_writepage_ctx *wpc,
 +      struct xfs_inode        *ip,
 +      xfs_fileoff_t           offset_fsb)
 +{
 +      int                     error;
 +
 +      /*
 +       * Attempt to allocate whatever delalloc extent currently backs
 +       * offset_fsb and put the result into wpc->imap.  Allocate in a loop
 +       * because it may take several attempts to allocate real blocks for a
 +       * contiguous delalloc extent if free space is sufficiently fragmented.
 +       */
 +      do {
 +              error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
 +                              &wpc->imap, wpc->fork == XFS_COW_FORK ?
 +                                      &wpc->cow_seq : &wpc->data_seq);
 +              if (error)
 +                      return error;
 +      } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
 +
 +      return 0;
 +}
 +
  STATIC int
  xfs_map_blocks(
        struct xfs_writepage_ctx *wpc,
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
 -      xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
 +      xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
 +      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb = NULLFILEOFF;
        struct xfs_bmbt_irec    imap;
 -      int                     whichfork = XFS_DATA_FORK;
        struct xfs_iext_cursor  icur;
 -      bool                    imap_valid;
 +      int                     retries = 0;
        int                     error = 0;
  
 -      /*
 -       * We have to make sure the cached mapping is within EOF to protect
 -       * against eofblocks trimming on file release leaving us with a stale
 -       * mapping. Otherwise, a page for a subsequent file extending buffered
 -       * write could get picked up by this writeback cycle and written to the
 -       * wrong blocks.
 -       *
 -       * Note that what we really want here is a generic mapping invalidation
 -       * mechanism to protect us from arbitrary extent modifying contexts, not
 -       * just eofblocks.
 -       */
 -      xfs_trim_extent_eof(&wpc->imap, ip);
 +      if (XFS_FORCED_SHUTDOWN(mp))
 +              return -EIO;
  
        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * against concurrent updates and provides a memory barrier on the way
         * out that ensures that we always see the current value.
         */
 -      imap_valid = offset_fsb >= wpc->imap.br_startoff &&
 -                   offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
 -      if (imap_valid &&
 -          (!xfs_inode_has_cow_data(ip) ||
 -           wpc->io_type == XFS_IO_COW ||
 -           wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
 +      if (xfs_imap_valid(wpc, ip, offset_fsb))
                return 0;
  
 -      if (XFS_FORCED_SHUTDOWN(mp))
 -              return -EIO;
 -
        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
 +retry:
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));
 -      ASSERT(offset <= mp->m_super->s_maxbytes);
 -
 -      if (offset > mp->m_super->s_maxbytes - count)
 -              count = mp->m_super->s_maxbytes - offset;
 -      end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
  
        /*
         * Check if this offset is covered by a COW extent, and if yes use
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
 -              /*
 -               * Truncate can race with writeback since writeback doesn't
 -               * take the iolock and truncate decreases the file size before
 -               * it starts truncating the pages between new_size and old_size.
 -               * Therefore, we can end up in the situation where writeback
 -               * gets a CoW fork mapping but the truncate makes the mapping
 -               * invalid and we end up in here trying to get a new mapping.
 -               * bail out here so that we simply never get a valid mapping
 -               * and so we drop the write altogether.  The page truncation
 -               * will kill the contents anyway.
 -               */
 -              if (offset > i_size_read(inode)) {
 -                      wpc->io_type = XFS_IO_HOLE;
 -                      return 0;
 -              }
 -              whichfork = XFS_COW_FORK;
 -              wpc->io_type = XFS_IO_COW;
 +
 +              wpc->fork = XFS_COW_FORK;
                goto allocate_blocks;
        }
  
        /*
 -       * Map valid and no COW extent in the way?  We're done.
 +       * No COW extent overlap. Revalidate now that we may have updated
 +       * ->cow_seq. If the data mapping is still valid, we're done.
         */
 -      if (imap_valid) {
 +      if (xfs_imap_valid(wpc, ip, offset_fsb)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
 +      wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
 +      wpc->fork = XFS_DATA_FORK;
 +
 +      /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
 -              /* landed in a hole or beyond EOF */
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
 -              wpc->io_type = XFS_IO_HOLE;
 -      } else {
 -              /*
 -               * Truncate to the next COW extent if there is one.  This is the
 -               * only opportunity to do this because we can skip COW fork
 -               * lookups for the subsequent blocks in the mapping; however,
 -               * the requirement to treat the COW range separately remains.
 -               */
 -              if (cow_fsb != NULLFILEOFF &&
 -                  cow_fsb < imap.br_startoff + imap.br_blockcount)
 -                      imap.br_blockcount = cow_fsb - imap.br_startoff;
 -
 -              if (isnullstartblock(imap.br_startblock)) {
 -                      /* got a delalloc extent */
 -                      wpc->io_type = XFS_IO_DELALLOC;
 -                      goto allocate_blocks;
 -              }
 -
 -              if (imap.br_state == XFS_EXT_UNWRITTEN)
 -                      wpc->io_type = XFS_IO_UNWRITTEN;
 -              else
 -                      wpc->io_type = XFS_IO_OVERWRITE;
 +              imap.br_state = XFS_EXT_NORM;
        }
  
 +      /*
 +       * Truncate to the next COW extent if there is one.  This is the only
 +       * opportunity to do this because we can skip COW fork lookups for the
 +       * subsequent blocks in the mapping; however, the requirement to treat
 +       * the COW range separately remains.
 +       */
 +      if (cow_fsb != NULLFILEOFF &&
 +          cow_fsb < imap.br_startoff + imap.br_blockcount)
 +              imap.br_blockcount = cow_fsb - imap.br_startoff;
 +
 +      /* got a delalloc extent? */
 +      if (imap.br_startblock != HOLESTARTBLOCK &&
 +          isnullstartblock(imap.br_startblock))
 +              goto allocate_blocks;
 +
        wpc->imap = imap;
 -      xfs_trim_extent_eof(&wpc->imap, ip);
 -      trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
 +      trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
        return 0;
  allocate_blocks:
 -      error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
 -                      &wpc->cow_seq);
 -      if (error)
 +      error = xfs_convert_blocks(wpc, ip, offset_fsb);
 +      if (error) {
 +              /*
 +               * If we failed to find the extent in the COW fork we might have
 +               * raced with a COW to data fork conversion or truncate.
 +               * Restart the lookup to catch the extent in the data fork for
 +               * the former case, but prevent additional retries to avoid
 +               * looping forever for the latter case.
 +               */
 +              if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
 +                      goto retry;
 +              ASSERT(error != -EAGAIN);
                return error;
 -      ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
 -             imap.br_startoff + imap.br_blockcount <= cow_fsb);
 -      wpc->imap = imap;
 -      xfs_trim_extent_eof(&wpc->imap, ip);
 -      trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
 +      }
 +
 +      /*
 +       * Due to merging the return real extent might be larger than the
 +       * original delalloc one.  Trim the return extent to the next COW
 +       * boundary again to force a re-lookup.
 +       */
 +      if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
 +          cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
 +              wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
 +
 +      ASSERT(wpc->imap.br_startoff <= offset_fsb);
 +      ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
 +      trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
        return 0;
  }
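
The rewritten xfs_map_blocks() above no longer trims the cached mapping to EOF; xfs_imap_valid() instead compares the per-fork sequence counters (data_seq, cow_seq) recorded when the mapping was cached against the live values, and discards the cache as soon as either fork has been modified. The same pattern can be sketched outside the kernel: remember the sequence number a mapping was derived from, and trust the cache only while the counter has not moved. Everything below is a hypothetical plain-C11 illustration, not XFS or block-layer API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Writer side: bump the sequence number on every extent-tree change. */
struct extent_tree {
	atomic_uint	seq;
	/* ... extent records ... */
};

/* Reader side: the cached mapping remembers the sequence it was built from. */
struct cached_map {
	uint64_t	start;		/* first block covered by the mapping */
	uint64_t	len;		/* length in blocks */
	unsigned int	seen_seq;	/* tree seq when the mapping was cached */
};

static void cache_map(struct cached_map *c, struct extent_tree *t,
		      uint64_t start, uint64_t len)
{
	c->start = start;
	c->len = len;
	c->seen_seq = atomic_load(&t->seq);
}

/* Usable only if it covers @blk and nothing changed the tree since caching. */
static bool cached_map_valid(struct cached_map *c, struct extent_tree *t,
			     uint64_t blk)
{
	if (blk < c->start || blk >= c->start + c->len)
		return false;
	return c->seen_seq == atomic_load(&t->seq);
}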
  
@@@ -525,7 -487,7 +526,7 @@@ xfs_submit_ioend
        int                     status)
  {
        /* Convert CoW extents to regular */
 -      if (!status && ioend->io_type == XFS_IO_COW) {
 +      if (!status && ioend->io_fork == XFS_COW_FORK) {
                /*
                 * Yuk. This can do memory allocation, but is not a
                 * transactional operation so everything is done in GFP_KERNEL
  
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
 -          ioend->io_type != XFS_IO_UNWRITTEN &&
 +          (ioend->io_fork == XFS_COW_FORK ||
 +           ioend->io_state != XFS_EXT_UNWRITTEN) &&
            xfs_ioend_is_append(ioend) &&
            !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
  static struct xfs_ioend *
  xfs_alloc_ioend(
        struct inode            *inode,
 -      unsigned int            type,
 +      int                     fork,
 +      xfs_exntst_t            state,
        xfs_off_t               offset,
        struct block_device     *bdev,
        sector_t                sector)
  
        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
        INIT_LIST_HEAD(&ioend->io_list);
 -      ioend->io_type = type;
 +      ioend->io_fork = fork;
 +      ioend->io_state = state;
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
@@@ -650,23 -609,21 +651,23 @@@ xfs_add_to_ioend
        sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
                ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
  
 -      if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 +      if (!wpc->ioend ||
 +          wpc->fork != wpc->ioend->io_fork ||
 +          wpc->imap.br_state != wpc->ioend->io_state ||
            sector != bio_end_sector(wpc->ioend->io_bio) ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
 -              wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
 -                              bdev, sector);
 +              wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
 +                              wpc->imap.br_state, offset, bdev, sector);
        }
  
-       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
                if (iop)
                        atomic_inc(&iop->write_count);
                if (bio_full(wpc->ioend->io_bio))
                        xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-               __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+               bio_add_page(wpc->ioend->io_bio, page, len, poff);
        }
  
        wpc->ioend->io_size += len;
@@@ -767,7 -724,7 +768,7 @@@ xfs_writepage_map
                error = xfs_map_blocks(wpc, inode, file_offset);
                if (error)
                        break;
 -              if (wpc->io_type == XFS_IO_HOLE)
 +              if (wpc->imap.br_startblock == HOLESTARTBLOCK)
                        continue;
                xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
                                 &submit_list);
@@@ -962,7 -919,9 +963,7 @@@ xfs_vm_writepage
        struct page             *page,
        struct writeback_control *wbc)
  {
 -      struct xfs_writepage_ctx wpc = {
 -              .io_type = XFS_IO_HOLE,
 -      };
 +      struct xfs_writepage_ctx wpc = { };
        int                     ret;
  
        ret = xfs_do_writepage(page, wbc, &wpc);
@@@ -976,7 -935,9 +977,7 @@@ xfs_vm_writepages
        struct address_space    *mapping,
        struct writeback_control *wbc)
  {
 -      struct xfs_writepage_ctx wpc = {
 -              .io_type = XFS_IO_HOLE,
 -      };
 +      struct xfs_writepage_ctx wpc = { };
        int                     ret;
  
        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@@ -1023,7 -984,7 +1024,7 @@@ xfs_vm_bmap
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
 -      if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 +      if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_iomap_ops);
  }
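
The xfs_add_to_ioend() hunk above keeps appending pages to the open ioend only while the new block matches it in fork, extent state, on-disk sector contiguity and file-offset contiguity; any mismatch queues the current ioend and allocates a fresh one keyed on the (fork, state) pair instead of the old io_type. The shape of that check, reduced to a stand-alone helper with hypothetical names (not the XFS code itself):

#include <stdbool.h>
#include <stdint.h>

/* One in-flight batch of contiguous writeback blocks (illustrative only). */
struct wb_batch {
	int		fork;		/* data vs COW fork the blocks came from */
	int		state;		/* written vs unwritten extent state */
	uint64_t	next_sector;	/* disk sector just past the batch */
	uint64_t	next_offset;	/* file offset just past the batch */
};

/* Extend the open batch only if everything lines up; otherwise start a new one. */
static bool wb_batch_can_extend(const struct wb_batch *b, int fork, int state,
				uint64_t sector, uint64_t offset)
{
	return b && b->fork == fork && b->state == state &&
	       sector == b->next_sector && offset == b->next_offset;
}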
diff --combined fs/xfs/xfs_file.c
index 770cc2edf777f4bb3ef6089986d5d49f75788ee4,60c2da41f0fc2890006d6bc9b9eb6dcb955b85b0..1f2e2845eb76c2c78a932c913057e1028cec2f05
@@@ -507,7 -507,7 +507,7 @@@ xfs_file_dio_aio_write
                 * We can't properly handle unaligned direct I/O to reflink
                 * files yet, as we can't unshare a partial block.
                 */
 -              if (xfs_is_reflink_inode(ip)) {
 +              if (xfs_is_cow_inode(ip)) {
                        trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
                        return -EREMCHG;
                }
@@@ -872,27 -872,14 +872,27 @@@ xfs_file_fallocate
                                goto out_unlock;
                }
  
 -              if (mode & FALLOC_FL_ZERO_RANGE)
 +              if (mode & FALLOC_FL_ZERO_RANGE) {
                        error = xfs_zero_file_space(ip, offset, len);
 -              else {
 -                      if (mode & FALLOC_FL_UNSHARE_RANGE) {
 -                              error = xfs_reflink_unshare(ip, offset, len);
 -                              if (error)
 -                                      goto out_unlock;
 +              } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
 +                      error = xfs_reflink_unshare(ip, offset, len);
 +                      if (error)
 +                              goto out_unlock;
 +
 +                      if (!xfs_is_always_cow_inode(ip)) {
 +                              error = xfs_alloc_file_space(ip, offset, len,
 +                                              XFS_BMAPI_PREALLOC);
                        }
 +              } else {
 +                      /*
 +                       * If always_cow mode we can't use preallocations and
 +                       * thus should not create them.
 +                       */
 +                      if (xfs_is_always_cow_inode(ip)) {
 +                              error = -EOPNOTSUPP;
 +                              goto out_unlock;
 +                      }
 +
                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
                }
@@@ -1081,10 -1068,10 +1081,10 @@@ xfs_file_llseek
        default:
                return generic_file_llseek(file, offset, whence);
        case SEEK_HOLE:
 -              offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
 +              offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
                break;
        case SEEK_DATA:
 -              offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
 +              offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
                break;
        }
  
@@@ -1216,6 -1203,7 +1216,7 @@@ const struct file_operations xfs_file_o
        .write_iter     = xfs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
+       .iopoll         = iomap_dio_iopoll,
        .unlocked_ioctl = xfs_file_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
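
xfs_file_operations now sets .iopoll = iomap_dio_iopoll, which pairs with the new ->iopoll hook and the ki_cookie field added to struct kiocb in include/linux/fs.h below: the submitter records the queue cookie at submission time, and completion can then be driven by polling the request queue rather than sleeping on an interrupt. A block-device style ->iopoll could look roughly like this; treat it as a sketch that assumes ki_cookie was filled in by the direct I/O submission path, not as code quoted from this merge:

#include <linux/blkdev.h>
#include <linux/fs.h>

static int example_bdev_iopoll(struct kiocb *kiocb, bool spin)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	/* ki_cookie holds the cookie returned when the bio was submitted. */
	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}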
diff --combined include/linux/fs.h
index 2cc540805a02e6d99ca50dd0a6607a0240737b28,dedcc2e9265cb44282425a3440d75df67d84256f..7442329a0011d62197e7b761c0fe3945847bc2cd
@@@ -37,9 -37,6 +37,9 @@@
  #include <linux/uuid.h>
  #include <linux/errseq.h>
  #include <linux/ioprio.h>
 +#include <linux/fs_types.h>
 +#include <linux/build_bug.h>
 +#include <linux/stddef.h>
  
  #include <asm/byteorder.h>
  #include <uapi/linux/fs.h>
@@@ -307,19 -304,14 +307,20 @@@ enum rw_hint 
  
  struct kiocb {
        struct file             *ki_filp;
 +
 +      /* The 'ki_filp' pointer is shared in a union for aio */
 +      randomized_struct_fields_start
 +
        loff_t                  ki_pos;
        void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
        void                    *private;
        int                     ki_flags;
        u16                     ki_hint;
        u16                     ki_ioprio; /* See linux/ioprio.h */
 -} __randomize_layout;
+       unsigned int            ki_cookie; /* for ->iopoll */
 +
 +      randomized_struct_fields_end
 +};
  
  static inline bool is_sync_kiocb(struct kiocb *kiocb)
  {
@@@ -1709,6 -1701,22 +1710,6 @@@ int fiemap_fill_next_extent(struct fiem
                            u64 phys, u64 len, u32 flags);
  int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
  
 -/*
 - * File types
 - *
 - * NOTE! These match bits 12..15 of stat.st_mode
 - * (ie "(i_mode >> 12) & 15").
 - */
 -#define DT_UNKNOWN    0
 -#define DT_FIFO               1
 -#define DT_CHR                2
 -#define DT_DIR                4
 -#define DT_BLK                6
 -#define DT_REG                8
 -#define DT_LNK                10
 -#define DT_SOCK               12
 -#define DT_WHT                14
 -
  /*
   * This is the "filldir" function type, used by readdir() to let
   * the kernel specify what kind of dirent layout it wants to have.
@@@ -1780,6 -1788,7 +1781,7 @@@ struct file_operations 
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+       int (*iopoll)(struct kiocb *kiocb, bool spin);
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
@@@ -2078,7 -2087,7 +2080,7 @@@ static inline void init_sync_kiocb(stru
   * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
   *                    synchronize competing switching instances and to tell
   *                    wb stat updates to grab the i_pages lock.  See
 - *                    inode_switch_wb_work_fn() for details.
 + *                    inode_switch_wbs_work_fn() for details.
   *
   * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
   *                    and work dirs among overlayfs mounts.
@@@ -2480,7 -2489,6 +2482,7 @@@ struct filename 
        struct audit_names      *aname;
        const char              iname[];
  };
 +static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
  
  extern long vfs_truncate(const struct path *, loff_t);
  extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
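
The static_assert() added to struct filename above documents, at compile time, that the inline iname[] flexible array starts on a long-aligned boundary; reordering the members would now break the build instead of silently misaligning the name. The idiom in isolation, using C11 _Static_assert in place of the kernel wrapper and a hypothetical struct:

#include <stddef.h>

struct name_buf {
	long		refcnt;
	const char	*uptr;
	const char	name[];		/* inline flexible array */
};

/* Build fails if the flexible array ever loses its long alignment. */
_Static_assert(offsetof(struct name_buf, name) % sizeof(long) == 0,
	       "name[] must start long-aligned");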