Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block
author    Linus Torvalds <[email protected]>
          Fri, 8 Mar 2019 22:12:17 +0000 (14:12 -0800)
committer Linus Torvalds <[email protected]>
          Fri, 8 Mar 2019 22:12:17 +0000 (14:12 -0800)
Pull block layer updates from Jens Axboe:
 "Not a huge amount of changes in this round, the biggest one is that we
  finally have Mings multi-page bvec support merged. Apart from that,
  this pull request contains:

   - Small series that avoids quiescing the queue for sysfs changes that
     match what we currently have (Aleksei)

   - Series of bcache fixes (via Coly)

   - Series of lightnvm fixes (via Mathias)

   - NVMe pull request from Christoph. Nothing major, just SPDX/license
     cleanups, a round-robin multipath policy (Hannes), and little fixes
     (Bart, Chaitanya).

   - BFQ series (Paolo)

   - Save blk-mq cpu -> hw queue mapping, removing a pointer indirection
     for the fast path (Jianchao)

   - fops->iopoll() added for async IO polling; this is a feature that
     the upcoming io_uring interface will use (Christoph, me)

   - Partition scan loop fixes (Dongli)

   - mtip32xx conversion away from the managed resource API (Christoph)

   - cdrom registration race fix (Guenter)

   - MD pull from Song, two minor fixes.

   - Various documentation fixes (Marcos)

   - Multi-page bvec feature (sketched below). This brings a lot of nice
     improvements with it, like more efficient splitting and support for
     larger IOs without growing the bvec table size. (Ming)

   - Various little fixes to core and drivers"
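
The multi-page bvec item above is what drives the recurring bio_for_each_segment_all()
change visible in the per-file diffs below (dm-crypt, raid1, erofs, btrfs): a single
bio_vec can now cover several pages, so the per-page iterator takes an extra
struct bvec_iter_all to keep handing callers one page at a time. A minimal sketch of
the updated driver-side pattern follows; my_end_io() and its page handling are
hypothetical, only the iterator signature comes from this merge.

#include <linux/bio.h>
#include <linux/mm.h>

/* Hypothetical bio completion handler updated for multi-page bvecs. */
static void my_end_io(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;	/* new: per-page iteration state */
	int i;

	/* was: bio_for_each_segment_all(bvec, bio, i) */
	bio_for_each_segment_all(bvec, bio, i, iter_all) {
		struct page *page = bvec->bv_page;

		/* per-page completion work, e.g. dropping the reference */
		put_page(page);
	}
	bio_put(bio);
}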

* tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits)
  block: fix updating bio's front segment size
  block: Replace function name in string with __func__
  nbd: propagate genlmsg_reply return code
  floppy: remove set but not used variable 'q'
  null_blk: fix checking for REQ_FUA
  block: fix NULL pointer dereference in register_disk
  fs: fix guard_bio_eod to check for real EOD errors
  blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map
  block: optimize bvec iteration in bvec_iter_advance
  block: introduce mp_bvec_for_each_page() for iterating over page
  block: optimize blk_bio_segment_split for single-page bvec
  block: optimize __blk_segment_map_sg() for single-page bvec
  block: introduce bvec_nth_page()
  iomap: wire up the iopoll method
  block: add bio_set_polled() helper
  block: wire up block device iopoll method
  fs: add an iopoll method to struct file_operations
  loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
  loop: do not print warn message if partition scan is successful
  block: bounce: make sure that bvec table is updated
  ...
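
Several shortlog entries above ("fs: add an iopoll method to struct file_operations",
"block: wire up block device iopoll method", "block: add bio_set_polled() helper") are
the plumbing that io_uring's polled I/O builds on. A rough sketch of how the pieces fit
together, assuming the 5.1 signatures; treat it as an illustration rather than the
merged code.

/*
 * struct file_operations gains a polling hook:
 *
 *	int (*iopoll)(struct kiocb *kiocb, bool spin);
 *
 * The block device implementation simply polls the hardware queue named by
 * the cookie that was stashed in the kiocb at submission time:
 */
static int blkdev_iopoll(struct kiocb *kiocb, bool spin)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}

/*
 * Submitters mark bios for polled completion with the new helper; async
 * callers also get REQ_NOWAIT so polled submission cannot block:
 */
static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
{
	bio->bi_opf |= REQ_HIPRI;
	if (!is_sync_kiocb(kiocb))
		bio->bi_opf |= REQ_NOWAIT;
}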

20 files changed:
block/blk-mq.c
drivers/ata/libata-scsi.c
drivers/block/floppy.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/rbd.c
drivers/md/dm-crypt.c
drivers/md/raid1.c
drivers/mmc/core/queue.c
drivers/nvme/host/pci.c
drivers/scsi/scsi_lib.c
drivers/staging/erofs/data.c
drivers/staging/erofs/unzip_vle.c
fs/btrfs/compression.c
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/gfs2/lops.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
include/linux/fs.h

diff --combined block/blk-mq.c
index 9437a5eb07cff63062ed459afc0b6b90e685e6f7,fa024bce2b38ad211d55e0155a74ba590651e3f5..4e502db8b10c8667ca59604150c951ecac40f671
@@@ -364,7 -364,7 +364,7 @@@ static struct request *blk_mq_get_reque
        }
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-                                               data->ctx->cpu);
+                                               data->ctx);
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;
  
@@@ -737,20 -737,12 +737,20 @@@ static void blk_mq_requeue_work(struct 
        spin_unlock_irq(&q->requeue_lock);
  
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 -              if (!(rq->rq_flags & RQF_SOFTBARRIER))
 +              if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
                        continue;
  
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
 -              blk_mq_sched_insert_request(rq, true, false, false);
 +              /*
 +               * If RQF_DONTPREP, rq has contained some driver specific
 +               * data, so insert it to hctx dispatch list to avoid any
 +               * merge.
 +               */
 +              if (rq->rq_flags & RQF_DONTPREP)
 +                      blk_mq_request_bypass_insert(rq, false);
 +              else
 +                      blk_mq_sched_insert_request(rq, true, false, false);
        }
  
        while (!list_empty(&rq_list)) {
@@@ -2069,7 -2061,7 +2069,7 @@@ struct blk_mq_tags *blk_mq_alloc_rq_map
        struct blk_mq_tags *tags;
        int node;
  
-       node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+       node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
  
@@@ -2125,7 -2117,7 +2125,7 @@@ int blk_mq_alloc_rqs(struct blk_mq_tag_
        size_t rq_size, left;
        int node;
  
-       node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+       node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
  
@@@ -2424,7 -2416,7 +2424,7 @@@ static void blk_mq_map_swqueue(struct r
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {
-               hctx_idx = set->map[0].mq_map[i];
+               hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
                if (!set->tags[hctx_idx] &&
                    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
                         * case, remap the current ctx to hctx[0] which
                         * is guaranteed to always have tags allocated
                         */
-                       set->map[0].mq_map[i] = 0;
+                       set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
                }
  
                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
-                       if (!set->map[j].nr_queues)
+                       if (!set->map[j].nr_queues) {
+                               ctx->hctxs[j] = blk_mq_map_queue_type(q,
+                                               HCTX_TYPE_DEFAULT, i);
                                continue;
+                       }
  
                        hctx = blk_mq_map_queue_type(q, j, i);
+                       ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
                         * mapped this one already. This can happen if
                         */
                        BUG_ON(!hctx->nr_ctx);
                }
+               for (; j < HCTX_MAX_TYPES; j++)
+                       ctx->hctxs[j] = blk_mq_map_queue_type(q,
+                                       HCTX_TYPE_DEFAULT, i);
        }
  
        mutex_unlock(&q->sysfs_lock);
@@@ -2734,7 -2733,7 +2741,7 @@@ static void blk_mq_realloc_hw_ctxs(stru
                int node;
                struct blk_mq_hw_ctx *hctx;
  
-               node = blk_mq_hw_queue_to_node(&set->map[0], i);
+               node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
                /*
                 * If the hw queue has been mapped to another numa node,
                 * we need to realloc the hctx. If allocation fails, fallback
@@@ -2838,9 -2837,6 +2845,6 @@@ struct request_queue *blk_mq_init_alloc
            set->map[HCTX_TYPE_POLL].nr_queues)
                blk_queue_flag_set(QUEUE_FLAG_POLL, q);
  
-       if (!(set->flags & BLK_MQ_F_SG_MERGE))
-               blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
        q->sg_reserved_size = INT_MAX;
  
        INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
@@@ -2968,7 -2964,7 +2972,7 @@@ static int blk_mq_update_queue_map(stru
                return set->ops->map_queues(set);
        } else {
                BUG_ON(set->nr_maps > 1);
-               return blk_mq_map_queues(&set->map[0]);
+               return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
        }
  }
  
@@@ -3090,6 -3086,9 +3094,9 @@@ int blk_mq_update_nr_requests(struct re
        if (!set)
                return -EINVAL;
  
+       if (q->nr_requests == nr)
+               return 0;
        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);
  
@@@ -3235,7 -3234,7 +3242,7 @@@ fallback
                        pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
                                        nr_hw_queues, prev_nr_hw_queues);
                        set->nr_hw_queues = prev_nr_hw_queues;
-                       blk_mq_map_queues(&set->map[0]);
+                       blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
                        goto fallback;
                }
                blk_mq_map_swqueue(q);
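
The blk-mq.c hunks above implement the "save blk-mq cpu -> hw queue mapping" item from
the pull message: blk_mq_map_swqueue() now caches the resolved hctx pointer in
ctx->hctxs[type], and blk_mq_map_queue() takes the software context instead of a CPU
number. Roughly, the fast-path lookup reduces to the sketch below (illustrative; the
merged helper lives in block/blk-mq.h and may differ in detail).

static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
						     unsigned int cmd_flags,
						     struct blk_mq_ctx *ctx)
{
	enum hctx_type type = HCTX_TYPE_DEFAULT;

	if (cmd_flags & REQ_HIPRI)
		type = HCTX_TYPE_POLL;
	else if ((cmd_flags & REQ_OP_MASK) == REQ_OP_READ)
		type = HCTX_TYPE_READ;

	/* direct per-cpu array load, no set->map[type].mq_map[cpu] hop */
	return ctx->hctxs[type];
}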
diff --combined drivers/ata/libata-scsi.c
index c2adfd8486c41f687e0b449c5b21b91a81f5c845,dfe66d00dd5b87e699fe2f1d5a159f779e50bb25..21d1ce20e1a9055b48efd8fbf482b2105a0ac45f
@@@ -1318,8 -1318,6 +1318,6 @@@ static int ata_scsi_dev_config(struct s
                scsi_change_queue_depth(sdev, depth);
        }
  
-       blk_queue_flush_queueable(q, false);
        if (dev->flags & ATA_DFLAG_TRUSTED)
                sdev->security_supported = 1;
  
@@@ -2990,7 -2988,7 +2988,7 @@@ static unsigned int atapi_xlat(struct a
         * This inconsistency confuses several controllers which
         * perform PIO using DMA such as Intel AHCIs and sil3124/32.
         * These controllers use actual number of transferred bytes to
 -       * update DMA poitner and transfer of 4n+2 bytes make those
 +       * update DMA pointer and transfer of 4n+2 bytes make those
         * controller push DMA pointer by 4n+4 bytes because SATA data
         * FISes are aligned to 4 bytes.  This causes data corruption
         * and buffer overrun.
diff --combined drivers/block/floppy.c
index 55481b40df9a5ee57c11a05462bc03d1d46d3682,04d47683eddd63cbecdff1a19ea2902f1ab2c751..95f608d1a098a2c0d0d312e1bd6be82517bfabaf
@@@ -2230,7 -2230,6 +2230,6 @@@ static void floppy_end_request(struct r
  static void request_done(int uptodate)
  {
        struct request *req = current_req;
-       struct request_queue *q;
        int block;
        char msg[sizeof("request done ") + sizeof(int) * 3];
  
                return;
        }
  
-       q = req->q;
        if (uptodate) {
                /* maintain values for invalidation on geometry
                 * change */
@@@ -4075,7 -4072,7 +4072,7 @@@ static unsigned int floppy_check_events
  
        if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
                if (lock_fdc(drive))
 -                      return -EINTR;
 +                      return 0;
                poll_drive(false, 0);
                process_fd_request();
        }
diff --combined drivers/block/mtip32xx/mtip32xx.c
index 2f3ee4d6af827645248f2d49f6e2de033782b044,9a6f40cd8df6be434f4954f5107234e0eedf3125..83302ecdc8db5ea3627ba7e0c2fb9fde83ab5336
@@@ -40,7 -40,6 +40,7 @@@
  #include <linux/export.h>
  #include <linux/debugfs.h>
  #include <linux/prefetch.h>
 +#include <linux/numa.h>
  #include "mtip32xx.h"
  
  #define HW_CMD_SLOT_SZ                (MTIP_MAX_COMMAND_SLOTS * 32)
@@@ -1416,7 -1415,7 +1416,7 @@@ static blk_status_t mtip_send_trim(stru
        WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
  
        /* Allocate a DMA buffer for the trim structure */
-       buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
+       buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
                                                                GFP_KERNEL);
        if (!buf)
                return BLK_STS_RESOURCE;
                                        MTIP_TRIM_TIMEOUT_MS) < 0)
                ret = BLK_STS_IOERR;
  
-       dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
+       dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
        return ret;
  }
  
@@@ -1656,7 -1655,7 +1656,7 @@@ static int exec_drive_command(struct mt
                if (!user_buffer)
                        return -EFAULT;
  
-               buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+               buf = dma_alloc_coherent(&port->dd->pdev->dev,
                                ATA_SECT_SIZE * xfer_sz,
                                &dma_addr,
                                GFP_KERNEL);
        }
  exit_drive_command:
        if (buf)
-               dmam_free_coherent(&port->dd->pdev->dev,
+               dma_free_coherent(&port->dd->pdev->dev,
                                ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
        return rv;
  }
@@@ -2838,11 -2837,11 +2838,11 @@@ static void mtip_dma_free(struct driver
        struct mtip_port *port = dd->port;
  
        if (port->block1)
-               dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+               dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
                                        port->block1, port->block1_dma);
  
        if (port->command_list) {
-               dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+               dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
                                port->command_list, port->command_list_dma);
        }
  }
@@@ -2861,7 -2860,7 +2861,7 @@@ static int mtip_dma_alloc(struct driver
  
        /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
        port->block1 =
-               dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+               dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
                                        &port->block1_dma, GFP_KERNEL);
        if (!port->block1)
                return -ENOMEM;
  
        /* Allocate dma memory for command list */
        port->command_list =
-               dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+               dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
                                        &port->command_list_dma, GFP_KERNEL);
        if (!port->command_list) {
-               dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+               dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
                                        port->block1, port->block1_dma);
                port->block1 = NULL;
                port->block1_dma = 0;
@@@ -3057,13 -3056,8 +3057,8 @@@ static int mtip_hw_init(struct driver_d
        mtip_start_port(dd->port);
  
        /* Setup the ISR and enable interrupts. */
-       rv = devm_request_irq(&dd->pdev->dev,
-                               dd->pdev->irq,
-                               mtip_irq_handler,
-                               IRQF_SHARED,
-                               dev_driver_string(&dd->pdev->dev),
-                               dd);
+       rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
+                        dev_driver_string(&dd->pdev->dev), dd);
        if (rv) {
                dev_err(&dd->pdev->dev,
                        "Unable to allocate IRQ %d\n", dd->pdev->irq);
@@@ -3091,7 -3085,7 +3086,7 @@@ out3
  
        /* Release the IRQ. */
        irq_set_affinity_hint(dd->pdev->irq, NULL);
-       devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+       free_irq(dd->pdev->irq, dd);
  
  out2:
        mtip_deinit_port(dd->port);
@@@ -3146,7 -3140,7 +3141,7 @@@ static int mtip_hw_exit(struct driver_d
  
        /* Release the IRQ. */
        irq_set_affinity_hint(dd->pdev->irq, NULL);
-       devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+       free_irq(dd->pdev->irq, dd);
        msleep(1000);
  
        /* Free dma regions */
@@@ -3610,8 -3604,8 +3605,8 @@@ static void mtip_free_cmd(struct blk_mq
        if (!cmd->command)
                return;
  
-       dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
-                               cmd->command, cmd->command_dma);
+       dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
+                         cmd->command_dma);
  }
  
  static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
        struct driver_data *dd = set->driver_data;
        struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
  
-       cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+       cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
                        &cmd->command_dma, GFP_KERNEL);
        if (!cmd->command)
                return -ENOMEM;
@@@ -4019,9 -4013,9 +4014,9 @@@ static int get_least_used_cpu_on_node(i
  /* Helper for selecting a node in round robin mode */
  static inline int mtip_get_next_rr_node(void)
  {
 -      static int next_node = -1;
 +      static int next_node = NUMA_NO_NODE;
  
 -      if (next_node == -1) {
 +      if (next_node == NUMA_NO_NODE) {
                next_node = first_online_node;
                return next_node;
        }
diff --combined drivers/block/rbd.c
index 282e2e82d84974726b93e9752e2003a3ca7c5491,abe9e1c8922742b826afce126cf890c2d04e2814..74088d8dbaf357b9ecf46b02051dabf96e2d0a10
@@@ -428,13 -428,14 +428,13 @@@ static bool single_major = true
  module_param(single_major, bool, 0444);
  MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
  
 -static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 -                     size_t count);
 -static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 -                        size_t count);
 -static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
 -                                  size_t count);
 -static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
 -                                     size_t count);
 +static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
 +static ssize_t remove_store(struct bus_type *bus, const char *buf,
 +                          size_t count);
 +static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
 +                                    size_t count);
 +static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
 +                                       size_t count);
  static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
  
  static int rbd_dev_id_to_minor(int dev_id)
@@@ -463,16 -464,16 +463,16 @@@ static bool rbd_is_lock_owner(struct rb
        return is_lock_owner;
  }
  
 -static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
 +static ssize_t supported_features_show(struct bus_type *bus, char *buf)
  {
        return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
  }
  
 -static BUS_ATTR(add, 0200, NULL, rbd_add);
 -static BUS_ATTR(remove, 0200, NULL, rbd_remove);
 -static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
 -static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
 -static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
 +static BUS_ATTR_WO(add);
 +static BUS_ATTR_WO(remove);
 +static BUS_ATTR_WO(add_single_major);
 +static BUS_ATTR_WO(remove_single_major);
 +static BUS_ATTR_RO(supported_features);
  
  static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
@@@ -3987,7 -3988,7 +3987,7 @@@ static int rbd_init_disk(struct rbd_dev
        rbd_dev->tag_set.ops = &rbd_mq_ops;
        rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
        rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-       rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        rbd_dev->tag_set.nr_hw_queues = 1;
        rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
  
@@@ -5933,7 -5934,9 +5933,7 @@@ err_out_args
        goto out;
  }
  
 -static ssize_t rbd_add(struct bus_type *bus,
 -                     const char *buf,
 -                     size_t count)
 +static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
  {
        if (single_major)
                return -EINVAL;
        return do_rbd_add(bus, buf, count);
  }
  
 -static ssize_t rbd_add_single_major(struct bus_type *bus,
 -                                  const char *buf,
 -                                  size_t count)
 +static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
 +                                    size_t count)
  {
        return do_rbd_add(bus, buf, count);
  }
@@@ -6045,7 -6049,9 +6045,7 @@@ static ssize_t do_rbd_remove(struct bus
        return count;
  }
  
 -static ssize_t rbd_remove(struct bus_type *bus,
 -                        const char *buf,
 -                        size_t count)
 +static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
  {
        if (single_major)
                return -EINVAL;
        return do_rbd_remove(bus, buf, count);
  }
  
 -static ssize_t rbd_remove_single_major(struct bus_type *bus,
 -                                     const char *buf,
 -                                     size_t count)
 +static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
 +                                       size_t count)
  {
        return do_rbd_remove(bus, buf, count);
  }
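
The renames in the rbd hunks above (rbd_add() -> add_store(), rbd_supported_features_show()
-> supported_features_show(), and so on) exist to satisfy BUS_ATTR_WO()/BUS_ATTR_RO(),
which derive both the attribute and its callback names from a single token. Approximately
(per <linux/device.h>; reproduced here only to explain the naming requirement):

#define BUS_ATTR_RO(_name) \
	struct bus_attribute bus_attr_##_name = __ATTR_RO(_name)
#define BUS_ATTR_WO(_name) \
	struct bus_attribute bus_attr_##_name = __ATTR_WO(_name)

/*
 * __ATTR_RO()/__ATTR_WO() point .show at _name##_show and .store at
 * _name##_store, so BUS_ATTR_WO(add) only resolves if a function literally
 * named add_store() exists -- hence rbd_add() becoming add_store() above.
 */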
diff --combined drivers/md/dm-crypt.c
index dd538e6b27480c5583731cab60738adb4064f1c0,9a29037f56158f4664c28fa90ef3a733edaf533a..dd6565798778055f3a519c12a716f417631f8f9d
@@@ -932,7 -932,7 +932,7 @@@ static int dm_crypt_integrity_io_alloc(
        if (IS_ERR(bip))
                return PTR_ERR(bip);
  
 -      tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
 +      tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
  
        bip->bip_iter.bi_size = tag_len;
        bip->bip_iter.bi_sector = io->cc->start + io->sector;
@@@ -1447,8 -1447,9 +1447,9 @@@ static void crypt_free_buffer_pages(str
  {
        unsigned int i;
        struct bio_vec *bv;
+       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bv, clone, i) {
+       bio_for_each_segment_all(bv, clone, i, iter_all) {
                BUG_ON(!bv->bv_page);
                mempool_free(bv->bv_page, &cc->page_pool);
        }
diff --combined drivers/md/raid1.c
index fa47249fa3e42819a76f2931963cebec4accda40,88c61d3090b0521f1d3cb6f47192beffef90c8a6..fdf451aac369041c6fccccfca4da74d121862025
@@@ -1603,11 -1603,9 +1603,9 @@@ static void raid1_error(struct mddev *m
                return;
        }
        set_bit(Blocked, &rdev->flags);
-       if (test_and_clear_bit(In_sync, &rdev->flags)) {
+       if (test_and_clear_bit(In_sync, &rdev->flags))
                mddev->degraded++;
-               set_bit(Faulty, &rdev->flags);
-       } else
-               set_bit(Faulty, &rdev->flags);
+       set_bit(Faulty, &rdev->flags);
        spin_unlock_irqrestore(&conf->device_lock, flags);
        /*
         * if recovery is running, make sure it aborts.
@@@ -1863,20 -1861,6 +1861,20 @@@ static void end_sync_read(struct bio *b
                reschedule_retry(r1_bio);
  }
  
 +static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
 +{
 +      sector_t sync_blocks = 0;
 +      sector_t s = r1_bio->sector;
 +      long sectors_to_go = r1_bio->sectors;
 +
 +      /* make sure these bits don't get cleared. */
 +      do {
 +              md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
 +              s += sync_blocks;
 +              sectors_to_go -= sync_blocks;
 +      } while (sectors_to_go > 0);
 +}
 +
  static void end_sync_write(struct bio *bio)
  {
        int uptodate = !bio->bi_status;
        struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
  
        if (!uptodate) {
 -              sector_t sync_blocks = 0;
 -              sector_t s = r1_bio->sector;
 -              long sectors_to_go = r1_bio->sectors;
 -              /* make sure these bits doesn't get cleared. */
 -              do {
 -                      md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
 -                      s += sync_blocks;
 -                      sectors_to_go -= sync_blocks;
 -              } while (sectors_to_go > 0);
 +              abort_sync_write(mddev, r1_bio);
                set_bit(WriteErrorSeen, &rdev->flags);
                if (!test_and_set_bit(WantReplacement, &rdev->flags))
                        set_bit(MD_RECOVERY_NEEDED, &
@@@ -2120,13 -2112,14 +2118,14 @@@ static void process_checks(struct r1bi
                struct page **spages = get_resync_pages(sbio)->pages;
                struct bio_vec *bi;
                int page_len[RESYNC_PAGES] = { 0 };
+               struct bvec_iter_all iter_all;
  
                if (sbio->bi_end_io != end_sync_read)
                        continue;
                /* Now we can 'fixup' the error value */
                sbio->bi_status = 0;
  
-               bio_for_each_segment_all(bi, sbio, j)
+               bio_for_each_segment_all(bi, sbio, j, iter_all)
                        page_len[j] = bi->bv_len;
  
                if (!status) {
@@@ -2178,10 -2171,8 +2177,10 @@@ static void sync_request_write(struct m
                     (i == r1_bio->read_disk ||
                      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
                        continue;
 -              if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
 +              if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
 +                      abort_sync_write(mddev, r1_bio);
                        continue;
 +              }
  
                bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
                if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
diff --combined drivers/mmc/core/queue.c
index 15a45ec6518d75c3fd8e602c313ebe77d3aa7376,cc19e71c71d469815c49d7b550586b206d9e9899..7c364a9c4eeb4bb6f3fa7f45913279078262a30e
@@@ -355,7 -355,6 +355,7 @@@ static void mmc_setup_queue(struct mmc_
  {
        struct mmc_host *host = card->host;
        u64 limit = BLK_BOUNCE_HIGH;
 +      unsigned block_size = 512;
  
        if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
                limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
        blk_queue_max_hw_sectors(mq->queue,
                min(host->max_blk_count, host->max_req_size / 512));
        blk_queue_max_segments(mq->queue, host->max_segs);
 -      blk_queue_max_segment_size(mq->queue, host->max_seg_size);
 +
 +      if (mmc_card_mmc(card))
 +              block_size = card->ext_csd.data_sector_size;
 +
 +      blk_queue_logical_block_size(mq->queue, block_size);
 +      blk_queue_max_segment_size(mq->queue,
 +                      round_down(host->max_seg_size, block_size));
  
        INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler);
        INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work);
@@@ -417,8 -410,7 +417,7 @@@ int mmc_init_queue(struct mmc_queue *mq
        else
                mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
        mq->tag_set.numa_node = NUMA_NO_NODE;
-       mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
-                           BLK_MQ_F_BLOCKING;
+       mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
        mq->tag_set.nr_hw_queues = 1;
        mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
        mq->tag_set.driver_data = mq;
diff --combined drivers/nvme/host/pci.c
index e905861186e35230256a9b66e906be54e72c8f84,f54718b63637dbfcdb3984cc7fcf7a69048be484..92bad1c810acda473bdf5d92e96960a3107e19e6
@@@ -1,15 -1,7 +1,7 @@@
+ // SPDX-License-Identifier: GPL-2.0
  /*
   * NVM Express device driver
   * Copyright (c) 2011-2014, Intel Corporation.
-  *
-  * This program is free software; you can redistribute it and/or modify it
-  * under the terms and conditions of the GNU General Public License,
-  * version 2, as published by the Free Software Foundation.
-  *
-  * This program is distributed in the hope it will be useful, but WITHOUT
-  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  * more details.
   */
  
  #include <linux/aer.h>
@@@ -157,6 -149,8 +149,8 @@@ static int queue_count_set(const char *
        int n = 0, ret;
  
        ret = kstrtoint(val, 10, &n);
+       if (ret)
+               return ret;
        if (n > num_possible_cpus())
                n = num_possible_cpus();
  
@@@ -2041,52 -2035,53 +2035,52 @@@ static int nvme_setup_host_mem(struct n
        return ret;
  }
  
 -/* irq_queues covers admin queue */
 -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
 +/*
 + * nirqs is the number of interrupts available for write and read
 + * queues. The core already reserved an interrupt for the admin queue.
 + */
 +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
  {
 -      unsigned int this_w_queues = write_queues;
 -
 -      WARN_ON(!irq_queues);
 -
 -      /*
 -       * Setup read/write queue split, assign admin queue one independent
 -       * irq vector if irq_queues is > 1.
 -       */
 -      if (irq_queues <= 2) {
 -              dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 -              dev->io_queues[HCTX_TYPE_READ] = 0;
 -              return;
 -      }
 +      struct nvme_dev *dev = affd->priv;
 +      unsigned int nr_read_queues;
  
        /*
 -       * If 'write_queues' is set, ensure it leaves room for at least
 -       * one read queue and one admin queue
 +       * If there is no interupt available for queues, ensure that
 +       * the default queue is set to 1. The affinity set size is
 +       * also set to one, but the irq core ignores it for this case.
 +       *
 +       * If only one interrupt is available or 'write_queue' == 0, combine
 +       * write and read queues.
 +       *
 +       * If 'write_queues' > 0, ensure it leaves room for at least one read
 +       * queue.
         */
 -      if (this_w_queues >= irq_queues)
 -              this_w_queues = irq_queues - 2;
 -
 -      /*
 -       * If 'write_queues' is set to zero, reads and writes will share
 -       * a queue set.
 -       */
 -      if (!this_w_queues) {
 -              dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1;
 -              dev->io_queues[HCTX_TYPE_READ] = 0;
 +      if (!nrirqs) {
 +              nrirqs = 1;
 +              nr_read_queues = 0;
 +      } else if (nrirqs == 1 || !write_queues) {
 +              nr_read_queues = 0;
 +      } else if (write_queues >= nrirqs) {
 +              nr_read_queues = 1;
        } else {
 -              dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
 -              dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1;
 +              nr_read_queues = nrirqs - write_queues;
        }
 +
 +      dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
 +      affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
 +      dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
 +      affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
 +      affd->nr_sets = nr_read_queues ? 2 : 1;
  }
  
  static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
  {
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 -      int irq_sets[2];
        struct irq_affinity affd = {
 -              .pre_vectors = 1,
 -              .nr_sets = ARRAY_SIZE(irq_sets),
 -              .sets = irq_sets,
 +              .pre_vectors    = 1,
 +              .calc_sets      = nvme_calc_irq_sets,
 +              .priv           = dev,
        };
 -      int result = 0;
        unsigned int irq_queues, this_p_queues;
  
        /*
        }
        dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
  
 -      /*
 -       * For irq sets, we have to ask for minvec == maxvec. This passes
 -       * any reduction back to us, so we can adjust our queue counts and
 -       * IRQ vector needs.
 -       */
 -      do {
 -              nvme_calc_io_queues(dev, irq_queues);
 -              irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
 -              irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
 -              if (!irq_sets[1])
 -                      affd.nr_sets = 1;
 +      /* Initialize for the single interrupt case */
 +      dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 +      dev->io_queues[HCTX_TYPE_READ] = 0;
  
 -              /*
 -               * If we got a failure and we're down to asking for just
 -               * 1 + 1 queues, just ask for a single vector. We'll share
 -               * that between the single IO queue and the admin queue.
 -               * Otherwise, we assign one independent vector to admin queue.
 -               */
 -              if (irq_queues > 1)
 -                      irq_queues = irq_sets[0] + irq_sets[1] + 1;
 -
 -              result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
 -                              irq_queues,
 -                              PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 -
 -              /*
 -               * Need to reduce our vec counts. If we get ENOSPC, the
 -               * platform should support mulitple vecs, we just need
 -               * to decrease our ask. If we get EINVAL, the platform
 -               * likely does not. Back down to ask for just one vector.
 -               */
 -              if (result == -ENOSPC) {
 -                      irq_queues--;
 -                      if (!irq_queues)
 -                              return result;
 -                      continue;
 -              } else if (result == -EINVAL) {
 -                      irq_queues = 1;
 -                      continue;
 -              } else if (result <= 0)
 -                      return -EIO;
 -              break;
 -      } while (1);
 -
 -      return result;
 +      return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 +                            PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
  }
  
  static void nvme_disable_io_queues(struct nvme_dev *dev)
@@@ -2520,15 -2554,15 +2514,15 @@@ static void nvme_reset_work(struct work
        mutex_lock(&dev->shutdown_lock);
        result = nvme_pci_enable(dev);
        if (result)
 -              goto out;
 +              goto out_unlock;
  
        result = nvme_pci_configure_admin_queue(dev);
        if (result)
 -              goto out;
 +              goto out_unlock;
  
        result = nvme_alloc_admin_tags(dev);
        if (result)
 -              goto out;
 +              goto out_unlock;
  
        /*
         * Limit the max command size to prevent iod->sg allocations going
        nvme_start_ctrl(&dev->ctrl);
        return;
  
 + out_unlock:
 +      mutex_unlock(&dev->shutdown_lock);
   out:
        nvme_remove_dead_ctrl(dev, result);
  }
@@@ -2984,7 -3016,6 +2978,7 @@@ static struct pci_driver nvme_driver = 
  
  static int __init nvme_init(void)
  {
 +      BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
        return pci_register_driver(&nvme_driver);
  }
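
The nvme/pci.c hunks above drop the driver's retry loop around
pci_alloc_irq_vectors_affinity() in favour of the new ->calc_sets callback: the driver
asks for the full vector range once, and the IRQ core calls back into
nvme_calc_irq_sets() with however many vectors it can actually grant. A rough sketch of
the interface involved (field names per the 5.1 irq_affinity rework; details may differ):

struct irq_affinity {
	unsigned int	pre_vectors;	/* vectors kept out of spreading (admin queue) */
	unsigned int	post_vectors;
	unsigned int	nr_sets;	/* number of affinity sets (write/read) */
	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
	void		(*calc_sets)(struct irq_affinity *affd, unsigned int nvecs);
	void		*priv;		/* driver cookie, here the nvme_dev */
};

/*
 * With ->calc_sets wired up, a single call covers the whole range:
 *
 *	pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 *			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 */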
  
diff --combined drivers/scsi/scsi_lib.c
index a6828391d6b3777873782221c98885c4b8f41885,6cadbe945bdb559e81b549e80f11d1e095c2e617..ca5fd3ae81f848222cf43c9aa629efd56f404ba5
@@@ -655,7 -655,6 +655,7 @@@ static blk_status_t scsi_result_to_blk_
                set_host_byte(cmd, DID_OK);
                return BLK_STS_TARGET;
        case DID_NEXUS_FAILURE:
 +              set_host_byte(cmd, DID_OK);
                return BLK_STS_NEXUS;
        case DID_ALLOC_FAILURE:
                set_host_byte(cmd, DID_OK);
@@@ -1900,7 -1899,7 +1900,7 @@@ int scsi_mq_setup_tags(struct Scsi_Hos
        shost->tag_set.queue_depth = shost->can_queue;
        shost->tag_set.cmd_size = cmd_size;
        shost->tag_set.numa_node = NUMA_NO_NODE;
-       shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        shost->tag_set.flags |=
                BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
        shost->tag_set.driver_data = shost;
@@@ -2598,6 -2597,7 +2598,6 @@@ void scsi_device_resume(struct scsi_dev
         * device deleted during suspend)
         */
        mutex_lock(&sdev->state_mutex);
 -      WARN_ON_ONCE(!sdev->quiesced_by);
        sdev->quiesced_by = NULL;
        blk_clear_pm_only(sdev->request_queue);
        if (sdev->sdev_state == SDEV_QUIESCE)
diff --combined drivers/staging/erofs/data.c
index 9c471f08ffd4112a97b7ef06897e965ed043a3d8,4871ba7b7d9ac7a826aec3258f52d25f76b402e6..526e0dbea5b5714618b463cb3eab98b0895e99f6
@@@ -20,8 -20,9 +20,9 @@@ static inline void read_endio(struct bi
        int i;
        struct bio_vec *bvec;
        const blk_status_t err = bio->bi_status;
+       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
  
                /* page is already locked */
@@@ -165,16 -166,43 +166,16 @@@ err_out
        return err;
  }
  
 -#ifdef CONFIG_EROFS_FS_ZIP
 -extern int z_erofs_map_blocks_iter(struct inode *,
 -                                 struct erofs_map_blocks *,
 -                                 struct page **, int);
 -#endif
 -
 -int erofs_map_blocks_iter(struct inode *inode,
 -                        struct erofs_map_blocks *map,
 -                        struct page **mpage_ret, int flags)
 -{
 -      /* by default, reading raw data never use erofs_map_blocks_iter */
 -      if (unlikely(!is_inode_layout_compression(inode))) {
 -              if (*mpage_ret)
 -                      put_page(*mpage_ret);
 -              *mpage_ret = NULL;
 -
 -              return erofs_map_blocks(inode, map, flags);
 -      }
 -
 -#ifdef CONFIG_EROFS_FS_ZIP
 -      return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags);
 -#else
 -      /* data compression is not available */
 -      return -ENOTSUPP;
 -#endif
 -}
 -
  int erofs_map_blocks(struct inode *inode,
                     struct erofs_map_blocks *map, int flags)
  {
        if (unlikely(is_inode_layout_compression(inode))) {
 -              struct page *mpage = NULL;
 -              int err;
 +              int err = z_erofs_map_blocks_iter(inode, map, flags);
  
 -              err = erofs_map_blocks_iter(inode, map, &mpage, flags);
 -              if (mpage)
 -                      put_page(mpage);
 +              if (map->mpage) {
 +                      put_page(map->mpage);
 +                      map->mpage = NULL;
 +              }
                return err;
        }
        return erofs_map_blocks_flatmode(inode, map, flags);
diff --combined drivers/staging/erofs/unzip_vle.c
index 02f34a83147d21874121a8eee5a39068cfb812d2,c057c5616b1ddd0feccc069449c29ef34878e3d9..8715bc50e09c16d44ece32baa474eb7d9bc5ab8f
@@@ -107,30 -107,15 +107,30 @@@ enum z_erofs_vle_work_role 
        Z_EROFS_VLE_WORK_SECONDARY,
        Z_EROFS_VLE_WORK_PRIMARY,
        /*
 -       * The current work has at least been linked with the following
 -       * processed chained works, which means if the processing page
 -       * is the tail partial page of the work, the current work can
 -       * safely use the whole page, as illustrated below:
 -       * +--------------+-------------------------------------------+
 -       * |  tail page   |      head page (of the previous work)     |
 -       * +--------------+-------------------------------------------+
 -       *   /\  which belongs to the current work
 -       * [  (*) this page can be used for the current work itself.  ]
 +       * The current work was the tail of an exist chain, and the previous
 +       * processed chained works are all decided to be hooked up to it.
 +       * A new chain should be created for the remaining unprocessed works,
 +       * therefore different from Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
 +       * the next work cannot reuse the whole page in the following scenario:
 +       *  ________________________________________________________________
 +       * |      tail (partial) page     |       head (partial) page       |
 +       * |  (belongs to the next work)  |  (belongs to the current work)  |
 +       * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
 +       */
 +      Z_EROFS_VLE_WORK_PRIMARY_HOOKED,
 +      /*
 +       * The current work has been linked with the processed chained works,
 +       * and could be also linked with the potential remaining works, which
 +       * means if the processing page is the tail partial page of the work,
 +       * the current work can safely use the whole page (since the next work
 +       * is under control) for in-place decompression, as illustrated below:
 +       *  ________________________________________________________________
 +       * |  tail (partial) page  |          head (partial) page           |
 +       * | (of the current work) |         (of the previous work)         |
 +       * |  PRIMARY_FOLLOWED or  |                                        |
 +       * |_____PRIMARY_HOOKED____|____________PRIMARY_FOLLOWED____________|
 +       *
 +       * [  (*) the above page can be used for the current work itself.  ]
         */
        Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
        Z_EROFS_VLE_WORK_MAX
@@@ -253,9 -238,14 +253,9 @@@ int erofs_try_to_free_cached_page(struc
  {
        struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
        const unsigned int clusterpages = erofs_clusterpages(sbi);
 -
 -      struct z_erofs_vle_workgroup *grp;
 +      struct z_erofs_vle_workgroup *const grp = (void *)page_private(page);
        int ret = 0;    /* 0 - busy */
  
 -      /* prevent the workgroup from being freed */
 -      rcu_read_lock();
 -      grp = (void *)page_private(page);
 -
        if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
                unsigned int i;
  
                        }
                }
                erofs_workgroup_unfreeze(&grp->obj, 1);
 -      }
 -      rcu_read_unlock();
  
 -      if (ret) {
 -              ClearPagePrivate(page);
 -              put_page(page);
 +              if (ret) {
 +                      ClearPagePrivate(page);
 +                      put_page(page);
 +              }
        }
        return ret;
  }
@@@ -324,10 -315,10 +324,10 @@@ static int z_erofs_vle_work_add_page
        return ret ? 0 : -EAGAIN;
  }
  
 -static inline bool try_to_claim_workgroup(
 -      struct z_erofs_vle_workgroup *grp,
 -      z_erofs_vle_owned_workgrp_t *owned_head,
 -      bool *hosted)
 +static enum z_erofs_vle_work_role
 +try_to_claim_workgroup(struct z_erofs_vle_workgroup *grp,
 +                     z_erofs_vle_owned_workgrp_t *owned_head,
 +                     bool *hosted)
  {
        DBG_BUGON(*hosted == true);
  
@@@ -341,9 -332,6 +341,9 @@@ retry
  
                *owned_head = &grp->next;
                *hosted = true;
 +              /* lucky, I am the followee :) */
 +              return Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
 +
        } else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
                /*
                 * type 2, link to the end of a existing open chain,
                if (cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
                            *owned_head) != Z_EROFS_VLE_WORKGRP_TAIL)
                        goto retry;
 -
                *owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
 -      } else
 -              return false;   /* :( better luck next time */
 +              return Z_EROFS_VLE_WORK_PRIMARY_HOOKED;
 +      }
  
 -      return true;    /* lucky, I am the followee :) */
 +      return Z_EROFS_VLE_WORK_PRIMARY; /* :( better luck next time */
  }
  
  struct z_erofs_vle_work_finder {
@@@ -435,9 -424,12 +435,9 @@@ z_erofs_vle_work_lookup(const struct z_
        *f->hosted = false;
        if (!primary)
                *f->role = Z_EROFS_VLE_WORK_SECONDARY;
 -      /* claim the workgroup if possible */
 -      else if (try_to_claim_workgroup(grp, f->owned_head, f->hosted))
 -              *f->role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
 -      else
 -              *f->role = Z_EROFS_VLE_WORK_PRIMARY;
 -
 +      else    /* claim the workgroup if possible */
 +              *f->role = try_to_claim_workgroup(grp, f->owned_head,
 +                                                f->hosted);
        return work;
  }
  
@@@ -501,9 -493,6 +501,9 @@@ z_erofs_vle_work_register(const struct 
        return work;
  }
  
 +#define builder_is_hooked(builder) \
 +      ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_HOOKED)
 +
  #define builder_is_followed(builder) \
        ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
  
@@@ -550,7 -539,7 +550,7 @@@ repeat
        if (unlikely(work == ERR_PTR(-EAGAIN)))
                goto repeat;
  
 -      if (unlikely(IS_ERR(work)))
 +      if (IS_ERR(work))
                return PTR_ERR(work);
  got_it:
        z_erofs_pagevec_ctor_init(&builder->vector,
@@@ -600,7 -589,7 +600,7 @@@ static void __z_erofs_vle_work_release(
        erofs_workgroup_put(&grp->obj);
  }
  
 -void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
 +static void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
  {
        struct z_erofs_vle_workgroup *grp =
                z_erofs_vle_work_workgroup(work, true);
@@@ -647,7 -636,7 +647,7 @@@ struct z_erofs_vle_frontend 
        struct inode *const inode;
  
        struct z_erofs_vle_work_builder builder;
 -      struct erofs_map_blocks_iter m_iter;
 +      struct erofs_map_blocks map;
  
        z_erofs_vle_owned_workgrp_t owned_head;
  
  
  #define VLE_FRONTEND_INIT(__i) { \
        .inode = __i, \
 -      .m_iter = { \
 -              { .m_llen = 0, .m_plen = 0 }, \
 +      .map = { \
 +              .m_llen = 0, \
 +              .m_plen = 0, \
                .mpage = NULL \
        }, \
        .builder = VLE_WORK_BUILDER_INIT(), \
@@@ -693,11 -681,12 +693,11 @@@ static int z_erofs_do_read_page(struct 
  {
        struct super_block *const sb = fe->inode->i_sb;
        struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
 -      struct erofs_map_blocks_iter *const m = &fe->m_iter;
 -      struct erofs_map_blocks *const map = &m->map;
 +      struct erofs_map_blocks *const map = &fe->map;
        struct z_erofs_vle_work_builder *const builder = &fe->builder;
        const loff_t offset = page_offset(page);
  
 -      bool tight = builder_is_followed(builder);
 +      bool tight = builder_is_hooked(builder);
        struct z_erofs_vle_work *work = builder->work;
  
        enum z_erofs_cache_alloctype cache_strategy;
@@@ -715,12 -704,8 +715,12 @@@ repeat
  
        /* lucky, within the range of the current map_blocks */
        if (offset + cur >= map->m_la &&
 -              offset + cur < map->m_la + map->m_llen)
 +              offset + cur < map->m_la + map->m_llen) {
 +              /* didn't get a valid unzip work previously (very rare) */
 +              if (!builder->work)
 +                      goto restart_now;
                goto hitted;
 +      }
  
        /* go ahead the next map_blocks */
        debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
  
        map->m_la = offset + cur;
        map->m_llen = 0;
 -      err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
 +      err = z_erofs_map_blocks_iter(fe->inode, map, 0);
        if (unlikely(err))
                goto err_out;
  
 +restart_now:
        if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
                goto hitted;
  
                                 map->m_plen / PAGE_SIZE,
                                 cache_strategy, page_pool, GFP_KERNEL);
  
 -      tight &= builder_is_followed(builder);
 +      tight &= builder_is_hooked(builder);
        work = builder->work;
  hitted:
        cur = end - min_t(unsigned int, offset + end - map->m_la, end);
                        (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
                                Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
  
 +      if (cur)
 +              tight &= builder_is_followed(builder);
 +
  retry:
        err = z_erofs_vle_work_add_page(builder, page, page_type);
        /* should allocate an additional staging page for pagevec */
@@@ -849,8 -830,9 +849,9 @@@ static inline void z_erofs_vle_read_end
  #ifdef EROFS_FS_HAS_MANAGED_CACHE
        struct address_space *mc = NULL;
  #endif
+       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
                bool cachemngd = false;
  
@@@ -1011,10 -993,11 +1012,10 @@@ repeat
        if (llen > grp->llen)
                llen = grp->llen;
  
 -      err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
 -              clusterpages, pages, llen, work->pageofs,
 -              z_erofs_onlinepage_endio);
 +      err = z_erofs_vle_unzip_fast_percpu(compressed_pages, clusterpages,
 +                                          pages, llen, work->pageofs);
        if (err != -ENOTSUPP)
 -              goto out_percpu;
 +              goto out;
  
        if (sparsemem_pages >= nr_pages)
                goto skip_allocpage;
@@@ -1035,25 -1018,8 +1036,25 @@@ skip_allocpage
        erofs_vunmap(vout, nr_pages);
  
  out:
 +      /* must handle all compressed pages before endding pages */
 +      for (i = 0; i < clusterpages; ++i) {
 +              page = compressed_pages[i];
 +
 +#ifdef EROFS_FS_HAS_MANAGED_CACHE
 +              if (page->mapping == MNGD_MAPPING(sbi))
 +                      continue;
 +#endif
 +              /* recycle all individual staging pages */
 +              (void)z_erofs_gather_if_stagingpage(page_pool, page);
 +
 +              WRITE_ONCE(compressed_pages[i], NULL);
 +      }
 +
        for (i = 0; i < nr_pages; ++i) {
                page = pages[i];
 +              if (!page)
 +                      continue;
 +
                DBG_BUGON(!page->mapping);
  
                /* recycle all individual staging pages */
                z_erofs_onlinepage_endio(page);
        }
  
 -out_percpu:
 -      for (i = 0; i < clusterpages; ++i) {
 -              page = compressed_pages[i];
 -
 -#ifdef EROFS_FS_HAS_MANAGED_CACHE
 -              if (page->mapping == MNGD_MAPPING(sbi))
 -                      continue;
 -#endif
 -              /* recycle all individual staging pages */
 -              (void)z_erofs_gather_if_stagingpage(page_pool, page);
 -
 -              WRITE_ONCE(compressed_pages[i], NULL);
 -      }
 -
        if (pages == z_pagemap_global)
                mutex_unlock(&z_pagemap_global_lock);
        else if (unlikely(pages != pages_onstack))
@@@ -1505,8 -1485,8 +1506,8 @@@ static int z_erofs_vle_normalaccess_rea
  
        z_erofs_submit_and_unzip(&f, &pagepool, true);
  out:
 -      if (f.m_iter.mpage)
 -              put_page(f.m_iter.mpage);
 +      if (f.map.mpage)
 +              put_page(f.map.mpage);
  
        /* clean up the remaining free pages */
        put_pages_list(&pagepool);
@@@ -1576,8 -1556,8 +1577,8 @@@ static int z_erofs_vle_normalaccess_rea
  
        z_erofs_submit_and_unzip(&f, &pagepool, sync);
  
 -      if (f.m_iter.mpage)
 -              put_page(f.m_iter.mpage);
 +      if (f.map.mpage)
 +              put_page(f.map.mpage);
  
        /* clean up the remaining free pages */
        put_pages_list(&pagepool);
@@@ -1722,14 -1702,14 +1723,14 @@@ vle_get_logical_extent_head(const struc
  
  int z_erofs_map_blocks_iter(struct inode *inode,
        struct erofs_map_blocks *map,
 -      struct page **mpage_ret, int flags)
 +      int flags)
  {
        void *kaddr;
        const struct vle_map_blocks_iter_ctx ctx = {
                .inode = inode,
                .sb = inode->i_sb,
                .clusterbits = EROFS_I_SB(inode)->clusterbits,
 -              .mpage_ret = mpage_ret,
 +              .mpage_ret = &map->mpage,
                .kaddr_ret = &kaddr
        };
        const unsigned int clustersize = 1 << ctx.clusterbits;
  
        /* initialize `pblk' to keep gcc from printing foolish warnings */
        erofs_blk_t mblk, pblk = 0;
 -      struct page *mpage = *mpage_ret;
 +      struct page *mpage = map->mpage;
        struct z_erofs_vle_decompressed_index *di;
        unsigned int cluster_type, logical_cluster_ofs;
        int err = 0;
                        err = PTR_ERR(mpage);
                        goto out;
                }
 -              *mpage_ret = mpage;
 +              map->mpage = mpage;
        } else {
                lock_page(mpage);
                DBG_BUGON(!PageUptodate(mpage));
                /* get the correspoinding first chunk */
                err = vle_get_logical_extent_head(&ctx, lcn, &ofs,
                                                  &pblk, &map->m_flags);
 -              mpage = *mpage_ret;
 +              mpage = map->mpage;
  
                if (unlikely(err)) {
                        if (mpage)
diff --combined fs/btrfs/compression.c
index eb8e20b740d6e245ec303da68ab335ce21fcd14a,6896ea60c843229b412bf705028172d652cad1ca..4f2a8ae0aa421f35fc5fc5ea97e598273ffc10ba
@@@ -162,13 -162,14 +162,14 @@@ csum_failed
        } else {
                int i;
                struct bio_vec *bvec;
+               struct bvec_iter_all iter_all;
  
                /*
                 * we have verified the checksum already, set page
                 * checked so the end_io handlers know about it
                 */
                ASSERT(!bio_flagged(bio, BIO_CLONED));
-               bio_for_each_segment_all(bvec, cb->orig_bio, i)
+               bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
                        SetPageChecked(bvec->bv_page);
  
                bio_endio(cb->orig_bio);
@@@ -730,28 -731,6 +731,28 @@@ struct heuristic_ws 
        struct list_head list;
  };
  
 +static struct workspace_manager heuristic_wsm;
 +
 +static void heuristic_init_workspace_manager(void)
 +{
 +      btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
 +}
 +
 +static void heuristic_cleanup_workspace_manager(void)
 +{
 +      btrfs_cleanup_workspace_manager(&heuristic_wsm);
 +}
 +
 +static struct list_head *heuristic_get_workspace(unsigned int level)
 +{
 +      return btrfs_get_workspace(&heuristic_wsm, level);
 +}
 +
 +static void heuristic_put_workspace(struct list_head *ws)
 +{
 +      btrfs_put_workspace(&heuristic_wsm, ws);
 +}
 +
  static void free_heuristic_ws(struct list_head *ws)
  {
        struct heuristic_ws *workspace;
        kfree(workspace);
  }
  
 -static struct list_head *alloc_heuristic_ws(void)
 +static struct list_head *alloc_heuristic_ws(unsigned int level)
  {
        struct heuristic_ws *ws;
  
@@@ -791,59 -770,65 +792,59 @@@ fail
        return ERR_PTR(-ENOMEM);
  }
  
 -struct workspaces_list {
 -      struct list_head idle_ws;
 -      spinlock_t ws_lock;
 -      /* Number of free workspaces */
 -      int free_ws;
 -      /* Total number of allocated workspaces */
 -      atomic_t total_ws;
 -      /* Waiters for a free workspace */
 -      wait_queue_head_t ws_wait;
 +const struct btrfs_compress_op btrfs_heuristic_compress = {
 +      .init_workspace_manager = heuristic_init_workspace_manager,
 +      .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
 +      .get_workspace = heuristic_get_workspace,
 +      .put_workspace = heuristic_put_workspace,
 +      .alloc_workspace = alloc_heuristic_ws,
 +      .free_workspace = free_heuristic_ws,
  };
  
 -static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 -
 -static struct workspaces_list btrfs_heuristic_ws;
 -
  static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 +      /* The heuristic is represented as compression type 0 */
 +      &btrfs_heuristic_compress,
        &btrfs_zlib_compress,
        &btrfs_lzo_compress,
        &btrfs_zstd_compress,
  };
  
 -void __init btrfs_init_compress(void)
 +void btrfs_init_workspace_manager(struct workspace_manager *wsm,
 +                                const struct btrfs_compress_op *ops)
  {
        struct list_head *workspace;
 -      int i;
  
 -      INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
 -      spin_lock_init(&btrfs_heuristic_ws.ws_lock);
 -      atomic_set(&btrfs_heuristic_ws.total_ws, 0);
 -      init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
 +      wsm->ops = ops;
  
 -      workspace = alloc_heuristic_ws();
 +      INIT_LIST_HEAD(&wsm->idle_ws);
 +      spin_lock_init(&wsm->ws_lock);
 +      atomic_set(&wsm->total_ws, 0);
 +      init_waitqueue_head(&wsm->ws_wait);
 +
 +      /*
 +       * Preallocate one workspace for each compression type so we can
 +       * guarantee forward progress in the worst case
 +       */
 +      workspace = wsm->ops->alloc_workspace(0);
        if (IS_ERR(workspace)) {
                pr_warn(
 -      "BTRFS: cannot preallocate heuristic workspace, will try later\n");
 +      "BTRFS: cannot preallocate compression workspace, will try later\n");
        } else {
 -              atomic_set(&btrfs_heuristic_ws.total_ws, 1);
 -              btrfs_heuristic_ws.free_ws = 1;
 -              list_add(workspace, &btrfs_heuristic_ws.idle_ws);
 +              atomic_set(&wsm->total_ws, 1);
 +              wsm->free_ws = 1;
 +              list_add(workspace, &wsm->idle_ws);
        }
 +}
  
 -      for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 -              INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
 -              spin_lock_init(&btrfs_comp_ws[i].ws_lock);
 -              atomic_set(&btrfs_comp_ws[i].total_ws, 0);
 -              init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
 +void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
 +{
 +      struct list_head *ws;
  
 -              /*
 -               * Preallocate one workspace for each compression type so
 -               * we can guarantee forward progress in the worst case
 -               */
 -              workspace = btrfs_compress_op[i]->alloc_workspace();
 -              if (IS_ERR(workspace)) {
 -                      pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
 -              } else {
 -                      atomic_set(&btrfs_comp_ws[i].total_ws, 1);
 -                      btrfs_comp_ws[i].free_ws = 1;
 -                      list_add(workspace, &btrfs_comp_ws[i].idle_ws);
 -              }
 +      while (!list_empty(&wsman->idle_ws)) {
 +              ws = wsman->idle_ws.next;
 +              list_del(ws);
 +              wsman->ops->free_workspace(ws);
 +              atomic_dec(&wsman->total_ws);
        }
  }
  
   * Preallocation makes a forward progress guarantee and we do not return
   * errors.
   */
 -static struct list_head *__find_workspace(int type, bool heuristic)
 +struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
 +                                    unsigned int level)
  {
        struct list_head *workspace;
        int cpus = num_online_cpus();
 -      int idx = type - 1;
        unsigned nofs_flag;
        struct list_head *idle_ws;
        spinlock_t *ws_lock;
        wait_queue_head_t *ws_wait;
        int *free_ws;
  
 -      if (heuristic) {
 -              idle_ws  = &btrfs_heuristic_ws.idle_ws;
 -              ws_lock  = &btrfs_heuristic_ws.ws_lock;
 -              total_ws = &btrfs_heuristic_ws.total_ws;
 -              ws_wait  = &btrfs_heuristic_ws.ws_wait;
 -              free_ws  = &btrfs_heuristic_ws.free_ws;
 -      } else {
 -              idle_ws  = &btrfs_comp_ws[idx].idle_ws;
 -              ws_lock  = &btrfs_comp_ws[idx].ws_lock;
 -              total_ws = &btrfs_comp_ws[idx].total_ws;
 -              ws_wait  = &btrfs_comp_ws[idx].ws_wait;
 -              free_ws  = &btrfs_comp_ws[idx].free_ws;
 -      }
 +      idle_ws  = &wsm->idle_ws;
 +      ws_lock  = &wsm->ws_lock;
 +      total_ws = &wsm->total_ws;
 +      ws_wait  = &wsm->ws_wait;
 +      free_ws  = &wsm->free_ws;
  
  again:
        spin_lock(ws_lock);
         * context of btrfs_compress_bio/btrfs_compress_pages
         */
        nofs_flag = memalloc_nofs_save();
 -      if (heuristic)
 -              workspace = alloc_heuristic_ws();
 -      else
 -              workspace = btrfs_compress_op[idx]->alloc_workspace();
 +      workspace = wsm->ops->alloc_workspace(level);
        memalloc_nofs_restore(nofs_flag);
  
        if (IS_ERR(workspace)) {
        return workspace;
  }
  
 -static struct list_head *find_workspace(int type)
 +static struct list_head *get_workspace(int type, int level)
  {
 -      return __find_workspace(type, false);
 +      return btrfs_compress_op[type]->get_workspace(level);
  }
  
  /*
   * put a workspace struct back on the list or free it if we have enough
   * idle ones sitting around
   */
 -static void __free_workspace(int type, struct list_head *workspace,
 -                           bool heuristic)
 +void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
  {
 -      int idx = type - 1;
        struct list_head *idle_ws;
        spinlock_t *ws_lock;
        atomic_t *total_ws;
        wait_queue_head_t *ws_wait;
        int *free_ws;
  
 -      if (heuristic) {
 -              idle_ws  = &btrfs_heuristic_ws.idle_ws;
 -              ws_lock  = &btrfs_heuristic_ws.ws_lock;
 -              total_ws = &btrfs_heuristic_ws.total_ws;
 -              ws_wait  = &btrfs_heuristic_ws.ws_wait;
 -              free_ws  = &btrfs_heuristic_ws.free_ws;
 -      } else {
 -              idle_ws  = &btrfs_comp_ws[idx].idle_ws;
 -              ws_lock  = &btrfs_comp_ws[idx].ws_lock;
 -              total_ws = &btrfs_comp_ws[idx].total_ws;
 -              ws_wait  = &btrfs_comp_ws[idx].ws_wait;
 -              free_ws  = &btrfs_comp_ws[idx].free_ws;
 -      }
 +      idle_ws  = &wsm->idle_ws;
 +      ws_lock  = &wsm->ws_lock;
 +      total_ws = &wsm->total_ws;
 +      ws_wait  = &wsm->ws_wait;
 +      free_ws  = &wsm->free_ws;
  
        spin_lock(ws_lock);
        if (*free_ws <= num_online_cpus()) {
 -              list_add(workspace, idle_ws);
 +              list_add(ws, idle_ws);
                (*free_ws)++;
                spin_unlock(ws_lock);
                goto wake;
        }
        spin_unlock(ws_lock);
  
 -      if (heuristic)
 -              free_heuristic_ws(workspace);
 -      else
 -              btrfs_compress_op[idx]->free_workspace(workspace);
 +      wsm->ops->free_workspace(ws);
        atomic_dec(total_ws);
  wake:
        cond_wake_up(ws_wait);
  }
  
 -static void free_workspace(int type, struct list_head *ws)
 +static void put_workspace(int type, struct list_head *ws)
  {
 -      return __free_workspace(type, ws, false);
 -}
 -
 -/*
 - * cleanup function for module exit
 - */
 -static void free_workspaces(void)
 -{
 -      struct list_head *workspace;
 -      int i;
 -
 -      while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
 -              workspace = btrfs_heuristic_ws.idle_ws.next;
 -              list_del(workspace);
 -              free_heuristic_ws(workspace);
 -              atomic_dec(&btrfs_heuristic_ws.total_ws);
 -      }
 -
 -      for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
 -              while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
 -                      workspace = btrfs_comp_ws[i].idle_ws.next;
 -                      list_del(workspace);
 -                      btrfs_compress_op[i]->free_workspace(workspace);
 -                      atomic_dec(&btrfs_comp_ws[i].total_ws);
 -              }
 -      }
 +      return btrfs_compress_op[type]->put_workspace(ws);
  }
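/*
 * Illustrative sketch (not part of the diff): the get_workspace()/put_workspace()
 * wrappers above simply dispatch through a per-type ops table, with the heuristic
 * occupying slot 0.  A minimal, self-contained userspace model of that dispatch,
 * with every name (demo_ops, demo_get, ...) invented for the example:
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_ops {
	const char *name;
	void *(*get_workspace)(unsigned int level);
	void (*put_workspace)(void *ws);
};

static void *demo_get(unsigned int level)
{
	(void)level;
	/* The kernel pulls from a per-manager idle list; a plain allocation
	 * is enough for the sketch. */
	return malloc(64);
}

static void demo_put(void *ws)
{
	free(ws);
}

/* Slot 0 models the heuristic "compression type", the rest model zlib/lzo/zstd. */
static const struct demo_ops demo_table[] = {
	{ "heuristic", demo_get, demo_put },
	{ "zlib",      demo_get, demo_put },
	{ "lzo",       demo_get, demo_put },
	{ "zstd",      demo_get, demo_put },
};

int main(void)
{
	unsigned int type = 1;	/* zlib */
	void *ws = demo_table[type].get_workspace(3);

	printf("got a %s workspace\n", demo_table[type].name);
	demo_table[type].put_workspace(ws);
	return 0;
}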
  
  /*
@@@ -1003,17 -1037,18 +1004,17 @@@ int btrfs_compress_pages(unsigned int t
                         unsigned long *total_in,
                         unsigned long *total_out)
  {
 +      int type = btrfs_compress_type(type_level);
 +      int level = btrfs_compress_level(type_level);
        struct list_head *workspace;
        int ret;
 -      int type = type_level & 0xF;
 -
 -      workspace = find_workspace(type);
  
 -      btrfs_compress_op[type - 1]->set_level(workspace, type_level);
 -      ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 +      workspace = get_workspace(type, level);
 +      ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
                                                      start, pages,
                                                      out_pages,
                                                      total_in, total_out);
 -      free_workspace(type, workspace);
 +      put_workspace(type, workspace);
        return ret;
  }
  
@@@ -1037,9 -1072,9 +1038,9 @@@ static int btrfs_decompress_bio(struct 
        int ret;
        int type = cb->compress_type;
  
 -      workspace = find_workspace(type);
 -      ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
 -      free_workspace(type, workspace);
 +      workspace = get_workspace(type, 0);
 +      ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
 +      put_workspace(type, workspace);
  
        return ret;
  }
@@@ -1055,29 -1090,19 +1056,29 @@@ int btrfs_decompress(int type, unsigne
        struct list_head *workspace;
        int ret;
  
 -      workspace = find_workspace(type);
 -
 -      ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
 +      workspace = get_workspace(type, 0);
 +      ret = btrfs_compress_op[type]->decompress(workspace, data_in,
                                                  dest_page, start_byte,
                                                  srclen, destlen);
 +      put_workspace(type, workspace);
  
 -      free_workspace(type, workspace);
        return ret;
  }
  
 +void __init btrfs_init_compress(void)
 +{
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
 +              btrfs_compress_op[i]->init_workspace_manager();
 +}
 +
  void __cold btrfs_exit_compress(void)
  {
 -      free_workspaces();
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
 +              btrfs_compress_op[i]->cleanup_workspace_manager();
  }
  
  /*
@@@ -1488,7 -1513,7 +1489,7 @@@ static void heuristic_collect_sample(st
   */
  int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
  {
 -      struct list_head *ws_list = __find_workspace(0, true);
 +      struct list_head *ws_list = get_workspace(0, 0);
        struct heuristic_ws *ws;
        u32 i;
        u8 byte;
        }
  
  out:
 -      __free_workspace(0, ws_list, true);
 +      put_workspace(0, ws_list);
        return ret;
  }
  
 -unsigned int btrfs_compress_str2level(const char *str)
 +/*
 + * Convert the compression suffix (e.g. after "zlib", starting with ":") to a
 + * level; an unrecognized string will set the default level
 + */
 +unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
  {
 -      if (strncmp(str, "zlib", 4) != 0)
 +      unsigned int level = 0;
 +      int ret;
 +
 +      if (!type)
                return 0;
  
 -      /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
 -      if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
 -              return str[5] - '0';
 +      if (str[0] == ':') {
 +              ret = kstrtouint(str + 1, 10, &level);
 +              if (ret)
 +                      level = 0;
 +      }
 +
 +      level = btrfs_compress_op[type]->set_level(level);
  
 -      return BTRFS_ZLIB_DEFAULT_LEVEL;
 +      return level;
  }
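The new btrfs_compress_str2level() above parses an optional ":<number>" suffix and falls back to the type's default when the suffix is missing or malformed. A rough, self-contained userspace model of that parsing (strtoul standing in for kstrtouint, and an invented demo_clamp_level() standing in for the per-type ->set_level() callback) might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the per-type ->set_level() callback: clamp to a sane range. */
static unsigned int demo_clamp_level(unsigned int level)
{
	if (level == 0)
		return 3;		/* pretend 3 is the type's default */
	return level > 9 ? 9 : level;
}

/* Parse the part after the algorithm name, e.g. ":7" in "zlib:7". */
static unsigned int demo_str2level(const char *str)
{
	unsigned int level = 0;

	if (str[0] == ':') {
		char *end;
		unsigned long v = strtoul(str + 1, &end, 10);

		if (*end == '\0' && v <= 9)
			level = (unsigned int)v;
		/* malformed suffix: keep level == 0 and use the default */
	}
	return demo_clamp_level(level);
}

int main(void)
{
	printf("%u %u %u\n", demo_str2level(":7"),
	       demo_str2level(":abc"), demo_str2level(""));
	return 0;
}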
diff --combined fs/btrfs/disk-io.c
index 5216e7b3f9ada29308a1c6fc5cf75b816032aded,ca1b7da6dd1b927e54fe84ef12c04ec56197875f..f0cdb53f3e2dc86f0dee6080d3b719482214170d
@@@ -17,7 -17,6 +17,7 @@@
  #include <linux/semaphore.h>
  #include <linux/error-injection.h>
  #include <linux/crc32c.h>
 +#include <linux/sched/mm.h>
  #include <asm/unaligned.h>
  #include "ctree.h"
  #include "disk-io.h"
@@@ -342,7 -341,7 +342,7 @@@ static int verify_parent_transid(struc
  
        if (need_lock) {
                btrfs_tree_read_lock(eb);
 -              btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 +              btrfs_set_lock_blocking_read(eb);
        }
  
        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@@ -833,9 -832,10 +833,10 @@@ static blk_status_t btree_csum_one_bio(
        struct bio_vec *bvec;
        struct btrfs_root *root;
        int i, ret = 0;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
                ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
                if (ret)
@@@ -1121,7 -1121,7 +1122,7 @@@ void clean_tree_block(struct btrfs_fs_i
                                                 -buf->len,
                                                 fs_info->dirty_metadata_batch);
                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
 -                      btrfs_set_lock_blocking(buf);
 +                      btrfs_set_lock_blocking_write(buf);
                        clear_extent_buffer_dirty(buf);
                }
        }
@@@ -1176,7 -1176,6 +1177,7 @@@ static void __setup_root(struct btrfs_r
        INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->ordered_extents);
        INIT_LIST_HEAD(&root->ordered_root);
 +      INIT_LIST_HEAD(&root->reloc_dirty_list);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->inode_lock);
        root->anon_dev = 0;
  
        spin_lock_init(&root->root_item_lock);
 +      btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
  }
  
  static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@@ -1261,17 -1259,10 +1262,17 @@@ struct btrfs_root *btrfs_create_tree(st
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root;
        struct btrfs_key key;
 +      unsigned int nofs_flag;
        int ret = 0;
        uuid_le uuid = NULL_UUID_LE;
  
 +      /*
 +       * We're holding a transaction handle, so use a NOFS memory allocation
 +       * context to avoid deadlock if reclaim happens.
 +       */
 +      nofs_flag = memalloc_nofs_save();
        root = btrfs_alloc_root(fs_info, GFP_KERNEL);
 +      memalloc_nofs_restore(nofs_flag);
        if (!root)
                return ERR_PTR(-ENOMEM);
  
@@@ -1717,7 -1708,9 +1718,7 @@@ static int cleaner_kthread(void *arg
                        goto sleep;
                }
  
 -              mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(fs_info);
 -              mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
  
                again = btrfs_clean_one_deleted_snapshot(root);
                mutex_unlock(&fs_info->cleaner_mutex);
@@@ -2109,7 -2102,7 +2110,7 @@@ static void btrfs_init_scrub(struct btr
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);
        init_waitqueue_head(&fs_info->scrub_pause_wait);
 -      fs_info->scrub_workers_refcnt = 0;
 +      refcount_set(&fs_info->scrub_workers_refcnt, 0);
  }
  
  static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@@ -2674,6 -2667,7 +2675,6 @@@ int open_ctree(struct super_block *sb
        mutex_init(&fs_info->delete_unused_bgs_mutex);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
 -      mutex_init(&fs_info->cleaner_delayed_iput_mutex);
        seqlock_init(&fs_info->profiles_lock);
  
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        atomic_set(&fs_info->defrag_running, 0);
        atomic_set(&fs_info->qgroup_op_seq, 0);
        atomic_set(&fs_info->reada_works_cnt, 0);
 +      atomic_set(&fs_info->nr_delayed_iputs, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
        init_waitqueue_head(&fs_info->transaction_wait);
        init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
 +      init_waitqueue_head(&fs_info->delayed_iputs_wait);
  
        INIT_LIST_HEAD(&fs_info->pinned_chunks);
  
@@@ -4247,9 -4239,16 +4248,9 @@@ static int btrfs_destroy_delayed_refs(s
  
                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
 -              if (!mutex_trylock(&head->mutex)) {
 -                      refcount_inc(&head->refs);
 -                      spin_unlock(&delayed_refs->lock);
 -
 -                      mutex_lock(&head->mutex);
 -                      mutex_unlock(&head->mutex);
 -                      btrfs_put_delayed_ref_head(head);
 -                      spin_lock(&delayed_refs->lock);
 +              if (btrfs_delayed_ref_lock(delayed_refs, head))
                        continue;
 -              }
 +
                spin_lock(&head->lock);
                while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
                        ref = rb_entry(n, struct btrfs_delayed_ref_node,
                if (head->must_insert_reserved)
                        pin_bytes = true;
                btrfs_free_delayed_extent_op(head->extent_op);
 -              delayed_refs->num_heads--;
 -              if (head->processing == 0)
 -                      delayed_refs->num_heads_ready--;
 -              atomic_dec(&delayed_refs->num_entries);
 -              rb_erase_cached(&head->href_node, &delayed_refs->href_root);
 -              RB_CLEAR_NODE(&head->href_node);
 +              btrfs_delete_ref_head(delayed_refs, head);
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                mutex_unlock(&head->mutex);
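The btree_csum_one_bio() hunk above shows the call-site pattern used throughout this merge for the multi-page bvec work: every bio_for_each_segment_all() caller now declares a struct bvec_iter_all and passes it as an extra argument so the iterator can step through the individual pages of a multi-page bvec. Schematically, a converted completion handler has this shape (a hypothetical handler in kernel style, not compilable on its own):

static void example_end_io(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;	/* new: per-page iteration state */
	int i;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	/* iter_all lets the macro walk every page inside a multi-page bvec */
	bio_for_each_segment_all(bvec, bio, i, iter_all)
		SetPageChecked(bvec->bv_page);

	bio_endio(bio);
}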
diff --combined fs/btrfs/extent_io.c
index ca259c75bbcd1a32f462cfada377362ddfa2c8ca,4ed58c9a94a9916959e304d28f77e9002b64e8df..ab705183d749709f004e28c2f02b56243f1b23ff
@@@ -147,38 -147,7 +147,39 @@@ static int add_extent_changeset(struct 
        return ret;
  }
  
 -static void flush_write_bio(struct extent_page_data *epd);
 +static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 +                                     unsigned long bio_flags)
 +{
 +      blk_status_t ret = 0;
 +      struct bio_vec *bvec = bio_last_bvec_all(bio);
-       struct page *page = bvec->bv_page;
++      struct bio_vec bv;
 +      struct extent_io_tree *tree = bio->bi_private;
 +      u64 start;
 +
-       start = page_offset(page) + bvec->bv_offset;
++      mp_bvec_last_segment(bvec, &bv);
++      start = page_offset(bv.bv_page) + bv.bv_offset;
 +
 +      bio->bi_private = NULL;
 +
 +      if (tree->ops)
 +              ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 +                                         mirror_num, bio_flags, start);
 +      else
 +              btrfsic_submit_bio(bio);
 +
 +      return blk_status_to_errno(ret);
 +}
 +
 +static void flush_write_bio(struct extent_page_data *epd)
 +{
 +      if (epd->bio) {
 +              int ret;
 +
 +              ret = submit_one_bio(epd->bio, 0, 0);
 +              BUG_ON(ret < 0); /* -ENOMEM */
 +              epd->bio = NULL;
 +      }
 +}
  
  int __init extent_io_init(void)
  {
@@@ -312,8 -281,8 +313,8 @@@ do_insert
  }
  
  static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 -                                    struct rb_node **prev_ret,
                                      struct rb_node **next_ret,
 +                                    struct rb_node **prev_ret,
                                      struct rb_node ***p_ret,
                                      struct rb_node **parent_ret)
  {
        if (parent_ret)
                *parent_ret = prev;
  
 -      if (prev_ret) {
 +      if (next_ret) {
                orig_prev = prev;
                while (prev && offset > prev_entry->end) {
                        prev = rb_next(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
 -              *prev_ret = prev;
 +              *next_ret = prev;
                prev = orig_prev;
        }
  
 -      if (next_ret) {
 +      if (prev_ret) {
                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                while (prev && offset < prev_entry->start) {
                        prev = rb_prev(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
 -              *next_ret = prev;
 +              *prev_ret = prev;
        }
        return NULL;
  }
@@@ -369,12 -338,12 +370,12 @@@ tree_search_for_insert(struct extent_io
                       struct rb_node ***p_ret,
                       struct rb_node **parent_ret)
  {
 -      struct rb_node *prev = NULL;
 +      struct rb_node *next = NULL;
        struct rb_node *ret;
  
 -      ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
 +      ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
        if (!ret)
 -              return prev;
 +              return next;
        return ret;
  }
  
@@@ -616,6 -585,7 +617,6 @@@ int __clear_extent_bit(struct extent_io
  
        if (delete)
                bits |= ~EXTENT_CTLBITS;
 -      bits |= EXTENT_FIRST_DELALLOC;
  
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
@@@ -880,6 -850,7 +881,6 @@@ __set_extent_bit(struct extent_io_tree 
  
        btrfs_debug_check_extent_io_range(tree, start, end);
  
 -      bits |= EXTENT_FIRST_DELALLOC;
  again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
@@@ -2379,7 -2350,7 +2380,7 @@@ static int bio_readpage_error(struct bi
        int read_mode = 0;
        blk_status_t status;
        int ret;
-       unsigned failed_bio_pages = bio_pages_all(failed_bio);
+       unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
  
        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
  
@@@ -2451,9 -2422,10 +2452,10 @@@ static void end_bio_extent_writepage(st
        u64 start;
        u64 end;
        int i;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
                struct inode *inode = page->mapping->host;
                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@@ -2522,9 -2494,10 +2524,10 @@@ static void end_bio_extent_readpage(str
        int mirror;
        int ret;
        int i;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
                struct inode *inode = page->mapping->host;
                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@@ -2721,6 -2694,29 +2724,6 @@@ struct bio *btrfs_bio_clone_partial(str
        return bio;
  }
  
 -static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 -                                     unsigned long bio_flags)
 -{
 -      blk_status_t ret = 0;
 -      struct bio_vec *bvec = bio_last_bvec_all(bio);
 -      struct bio_vec bv;
 -      struct extent_io_tree *tree = bio->bi_private;
 -      u64 start;
 -
 -      mp_bvec_last_segment(bvec, &bv);
 -      start = page_offset(bv.bv_page) + bv.bv_offset;
 -
 -      bio->bi_private = NULL;
 -
 -      if (tree->ops)
 -              ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 -                                         mirror_num, bio_flags, start);
 -      else
 -              btrfsic_submit_bio(bio);
 -
 -      return blk_status_to_errno(ret);
 -}
 -
  /*
   * @opf:      bio REQ_OP_* and REQ_* flags as one value
   * @tree:     tree so we can call our merge_bio hook
@@@ -3641,9 -3637,10 +3644,10 @@@ static void end_bio_extent_buffer_write
        struct bio_vec *bvec;
        struct extent_buffer *eb;
        int i, done;
+       struct bvec_iter_all iter_all;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                struct page *page = bvec->bv_page;
  
                eb = (struct extent_buffer *)page->private;
@@@ -4014,6 -4011,17 +4018,6 @@@ retry
        return ret;
  }
  
 -static void flush_write_bio(struct extent_page_data *epd)
 -{
 -      if (epd->bio) {
 -              int ret;
 -
 -              ret = submit_one_bio(epd->bio, 0, 0);
 -              BUG_ON(ret < 0); /* -ENOMEM */
 -              epd->bio = NULL;
 -      }
 -}
 -
  int extent_write_full_page(struct page *page, struct writeback_control *wbc)
  {
        int ret;
@@@ -4255,7 -4263,8 +4259,7 @@@ static struct extent_map *get_extent_sk
                if (len == 0)
                        break;
                len = ALIGN(len, sectorsize);
 -              em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
 -                              len, 0);
 +              em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
                if (IS_ERR_OR_NULL(em))
                        return em;
  
diff --combined fs/btrfs/inode.c
index 3f180b857e202bc628a65ff0955606f880193de5,7ade5769f6915acfa1acfbb492db1d926c4df594..82fdda8ff5ab82b5298c4b72859e697d8bd1a3d5
@@@ -453,6 -453,7 +453,6 @@@ static noinline void compress_file_rang
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
 -      u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);
  
 -      actual_end = min_t(u64, isize, end + 1);
 +      actual_end = min_t(u64, i_size_read(inode), end + 1);
  again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@@ -713,9 -714,9 +713,9 @@@ static void free_async_extent_pages(str
   * queued.  We walk all the async extents created by compress_file_range
   * and send them down to the disk.
   */
 -static noinline void submit_compressed_extents(struct inode *inode,
 -                                            struct async_cow *async_cow)
 +static noinline void submit_compressed_extents(struct async_cow *async_cow)
  {
 +      struct inode *inode = async_cow->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
@@@ -1165,14 -1166,8 +1165,14 @@@ static noinline void async_cow_submit(s
            5 * SZ_1M)
                cond_wake_up_nomb(&fs_info->async_submit_wait);
  
 +      /*
 +       * ->inode could be NULL if async_cow_start has failed to compress,
 +       * in which case we don't have anything to submit, yet we need to
 +       * always adjust ->async_delalloc_pages as it's paired with the init
 +       * happening in cow_file_range_async
 +       */
        if (async_cow->inode)
 -              submit_compressed_extents(async_cow->inode, async_cow);
 +              submit_compressed_extents(async_cow);
  }
  
  static noinline void async_cow_free(struct btrfs_work *work)
@@@ -1199,12 -1194,7 +1199,12 @@@ static int cow_file_range_async(struct 
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
 -              async_cow->inode = igrab(inode);
 +              /*
 +               * igrab is called higher up in the call chain, take only the
 +               * lightweight reference for the callback lifetime
 +               */
 +              ihold(inode);
 +              async_cow->inode = inode;
                async_cow->fs_info = fs_info;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
@@@ -1596,10 -1586,11 +1596,10 @@@ static inline int need_force_cow(struc
   * Function to process delayed allocation (create CoW) for ranges which are
   * being touched for the first time.
   */
 -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
 +int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
                u64 start, u64 end, int *page_started, unsigned long *nr_written,
                struct writeback_control *wbc)
  {
 -      struct inode *inode = private_data;
        int ret;
        int force_cow = need_force_cow(inode, start, end);
        unsigned int write_flags = wbc_to_write_flags(wbc);
@@@ -3256,7 -3247,6 +3256,7 @@@ void btrfs_add_delayed_iput(struct inod
        if (atomic_add_unless(&inode->i_count, -1, 1))
                return;
  
 +      atomic_inc(&fs_info->nr_delayed_iputs);
        spin_lock(&fs_info->delayed_iput_lock);
        ASSERT(list_empty(&binode->delayed_iput));
        list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@@@ -3277,32 -3267,11 +3277,32 @@@ void btrfs_run_delayed_iputs(struct btr
                list_del_init(&inode->delayed_iput);
                spin_unlock(&fs_info->delayed_iput_lock);
                iput(&inode->vfs_inode);
 +              if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
 +                      wake_up(&fs_info->delayed_iputs_wait);
                spin_lock(&fs_info->delayed_iput_lock);
        }
        spin_unlock(&fs_info->delayed_iput_lock);
  }
  
 +/**
 + * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
 + * @fs_info - the fs_info for this fs
 + * @return - EINTR if we were killed, 0 if nothing's pending
 + *
 + * This will wait on any delayed iputs that are currently running with KILLABLE
 + * set.  Once they are all done running we will return, unless we are killed in
 + * which case we return EINTR. This helps in user operations like fallocate etc
 + * that might get blocked on the iputs.
 + */
 +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
 +{
 +      int ret = wait_event_killable(fs_info->delayed_iputs_wait,
 +                      atomic_read(&fs_info->nr_delayed_iputs) == 0);
 +      if (ret)
 +              return -EINTR;
 +      return 0;
 +}
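/*
 * Userspace model (not kernel code) of the pairing above: one side bumps a
 * pending counter and later drops it, waking a wait queue when it reaches
 * zero, while btrfs_wait_on_delayed_iputs()-style callers sleep until that
 * happens.  The pthread-based names below are invented for the sketch.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static int nr_pending;

static void add_pending(void)			/* ~ btrfs_add_delayed_iput() */
{
	pthread_mutex_lock(&lock);
	nr_pending++;
	pthread_mutex_unlock(&lock);
}

static void complete_pending(void)		/* ~ btrfs_run_delayed_iputs() */
{
	pthread_mutex_lock(&lock);
	if (--nr_pending == 0)
		pthread_cond_broadcast(&waitq);	/* wake_up(&delayed_iputs_wait) */
	pthread_mutex_unlock(&lock);
}

static void wait_for_pending(void)		/* ~ btrfs_wait_on_delayed_iputs() */
{
	pthread_mutex_lock(&lock);
	while (nr_pending != 0)			/* wait until the count hits zero */
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	add_pending();
	complete_pending();
	wait_for_pending();
	printf("no delayed work pending\n");
	return 0;
}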
 +
  /*
   * This creates an orphan entry for the given inode in case something goes wrong
   * in the middle of an unlink.
@@@ -5293,15 -5262,13 +5293,15 @@@ static struct btrfs_trans_handle *evict
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 +      u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
        int failures = 0;
  
        for (;;) {
                struct btrfs_trans_handle *trans;
                int ret;
  
 -              ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
 +              ret = btrfs_block_rsv_refill(root, rsv,
 +                                           rsv->size + delayed_refs_extra,
                                             BTRFS_RESERVE_FLUSH_LIMIT);
  
                if (ret && ++failures > 2) {
                        return ERR_PTR(-ENOSPC);
                }
  
 +              /*
 +               * Evict can generate a large amount of delayed refs without
 +               * having a way to add space back since we exhaust our temporary
 +               * block rsv.  We aren't allowed to do FLUSH_ALL in this case
 +               * because we could deadlock with so many things in the flushing
 +               * code, so we have to try and hold some extra space to
 +               * compensate for our delayed ref generation.  If we can't get
 +               * that space then we need to see if we can steal our minimum from
 +               * the global reserve.  We will be ratelimited by the amount of
 +               * space we have for the delayed refs rsv, so we'll end up
 +               * committing and trying again.
 +               */
                trans = btrfs_join_transaction(root);
 -              if (IS_ERR(trans) || !ret)
 +              if (IS_ERR(trans) || !ret) {
 +                      if (!IS_ERR(trans)) {
 +                              trans->block_rsv = &fs_info->trans_block_rsv;
 +                              trans->bytes_reserved = delayed_refs_extra;
 +                              btrfs_block_rsv_migrate(rsv, trans->block_rsv,
 +                                                      delayed_refs_extra, 1);
 +                      }
                        return trans;
 +              }
  
                /*
                 * Try to steal from the global reserve if there is space for
@@@ -6783,7 -6731,7 +6783,7 @@@ struct extent_map *btrfs_get_extent(str
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
 -      u32 found_type;
 +      u8 extent_type;
        struct btrfs_path *path = NULL;
        struct btrfs_root *root = inode->root;
        struct btrfs_file_extent_item *item;
        if (ret < 0) {
                err = ret;
                goto out;
 -      }
 -
 -      if (ret != 0) {
 +      } else if (ret > 0) {
                if (path->slots[0] == 0)
                        goto not_found;
                path->slots[0]--;
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_file_extent_item);
 -      /* are we inside the extent that was found? */
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 -      found_type = found_key.type;
        if (found_key.objectid != objectid ||
 -          found_type != BTRFS_EXTENT_DATA_KEY) {
 +          found_key.type != BTRFS_EXTENT_DATA_KEY) {
                /*
                 * If we backup past the first extent we want to move forward
                 * and see if there is an extent in front of us, otherwise we'll
                goto next;
        }
  
 -      found_type = btrfs_file_extent_type(leaf, item);
 +      extent_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
 -      if (found_type == BTRFS_FILE_EXTENT_REG ||
 -          found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 +      if (extent_type == BTRFS_FILE_EXTENT_REG ||
 +          extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
                       btrfs_file_extent_num_bytes(leaf, item);
  
                trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
                                                       extent_start);
 -      } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 +      } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                size_t size;
  
                size = btrfs_file_extent_ram_bytes(leaf, item);
@@@ -6888,9 -6840,9 +6888,9 @@@ next
                        if (ret < 0) {
                                err = ret;
                                goto out;
 -                      }
 -                      if (ret > 0)
 +                      } else if (ret > 0) {
                                goto not_found;
 +                      }
                        leaf = path->nodes[0];
                }
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                        goto not_found;
                if (start > found_key.offset)
                        goto next;
 +
 +              /* New extent overlaps with existing one */
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
 -              goto not_found_em;
 +              em->block_start = EXTENT_MAP_HOLE;
 +              goto insert;
        }
  
        btrfs_extent_item_to_extent_map(inode, path, item,
                        new_inline, em);
  
 -      if (found_type == BTRFS_FILE_EXTENT_REG ||
 -          found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 +      if (extent_type == BTRFS_FILE_EXTENT_REG ||
 +          extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                goto insert;
 -      } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 +      } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                unsigned long ptr;
                char *map;
                size_t size;
@@@ -6967,6 -6916,7 +6967,6 @@@ not_found
        em->start = start;
        em->orig_start = start;
        em->len = len;
 -not_found_em:
        em->block_start = EXTENT_MAP_HOLE;
  insert:
        btrfs_release_path(path);
  }
  
  struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 -              struct page *page,
 -              size_t pg_offset, u64 start, u64 len,
 -              int create)
 +                                         u64 start, u64 len)
  {
        struct extent_map *em;
        struct extent_map *hole_em = NULL;
 -      u64 range_start = start;
 +      u64 delalloc_start = start;
        u64 end;
 -      u64 found;
 -      u64 found_end;
 +      u64 delalloc_len;
 +      u64 delalloc_end;
        int err = 0;
  
 -      em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
 +      em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em))
                return em;
        /*
        em = NULL;
  
        /* ok, we didn't find anything, lets look for delalloc */
 -      found = count_range_bits(&inode->io_tree, &range_start,
 +      delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
                                 end, len, EXTENT_DELALLOC, 1);
 -      found_end = range_start + found;
 -      if (found_end < range_start)
 -              found_end = (u64)-1;
 +      delalloc_end = delalloc_start + delalloc_len;
 +      if (delalloc_end < delalloc_start)
 +              delalloc_end = (u64)-1;
  
        /*
 -       * we didn't find anything useful, return
 -       * the original results from get_extent()
 +       * We didn't find anything useful, return the original results from
 +       * get_extent()
         */
 -      if (range_start > end || found_end <= start) {
 +      if (delalloc_start > end || delalloc_end <= start) {
                em = hole_em;
                hole_em = NULL;
                goto out;
        }
  
 -      /* adjust the range_start to make sure it doesn't
 -       * go backwards from the start they passed in
 +      /*
 +       * Adjust the delalloc_start to make sure it doesn't go backwards from
 +       * the start they passed in
         */
 -      range_start = max(start, range_start);
 -      found = found_end - range_start;
 +      delalloc_start = max(start, delalloc_start);
 +      delalloc_len = delalloc_end - delalloc_start;
  
 -      if (found > 0) {
 -              u64 hole_start = start;
 -              u64 hole_len = len;
 +      if (delalloc_len > 0) {
 +              u64 hole_start;
 +              u64 hole_len;
 +              const u64 hole_end = extent_map_end(hole_em);
  
                em = alloc_extent_map();
                if (!em) {
                        err = -ENOMEM;
                        goto out;
                }
 +              em->bdev = NULL;
 +
 +              ASSERT(hole_em);
                /*
 -               * when btrfs_get_extent can't find anything it
 -               * returns one huge hole
 +               * When btrfs_get_extent can't find anything it returns one
 +               * huge hole
                 *
 -               * make sure what it found really fits our range, and
 -               * adjust to make sure it is based on the start from
 -               * the caller
 +               * Make sure what it found really fits our range, and adjust to
 +               * make sure it is based on the start from the caller
                 */
 -              if (hole_em) {
 -                      u64 calc_end = extent_map_end(hole_em);
 -
 -                      if (calc_end <= start || (hole_em->start > end)) {
 -                              free_extent_map(hole_em);
 -                              hole_em = NULL;
 -                      } else {
 -                              hole_start = max(hole_em->start, start);
 -                              hole_len = calc_end - hole_start;
 -                      }
 +              if (hole_end <= start || hole_em->start > end) {
 +                     free_extent_map(hole_em);
 +                     hole_em = NULL;
 +              } else {
 +                     hole_start = max(hole_em->start, start);
 +                     hole_len = hole_end - hole_start;
                }
 -              em->bdev = NULL;
 -              if (hole_em && range_start > hole_start) {
 -                      /* our hole starts before our delalloc, so we
 -                       * have to return just the parts of the hole
 -                       * that go until  the delalloc starts
 +
 +              if (hole_em && delalloc_start > hole_start) {
 +                      /*
 +                       * Our hole starts before our delalloc, so we have to
 +                       * return just the parts of the hole that go until the
 +                       * delalloc starts
                         */
 -                      em->len = min(hole_len,
 -                                    range_start - hole_start);
 +                      em->len = min(hole_len, delalloc_start - hole_start);
                        em->start = hole_start;
                        em->orig_start = hole_start;
                        /*
 -                       * don't adjust block start at all,
 -                       * it is fixed at EXTENT_MAP_HOLE
 +                       * Don't adjust block start at all, it is fixed at
 +                       * EXTENT_MAP_HOLE
                         */
                        em->block_start = hole_em->block_start;
                        em->block_len = hole_len;
                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                } else {
 -                      em->start = range_start;
 -                      em->len = found;
 -                      em->orig_start = range_start;
 +                      /*
 +                       * Hole is out of passed range or it starts after
 +                       * delalloc range
 +                       */
 +                      em->start = delalloc_start;
 +                      em->len = delalloc_len;
 +                      em->orig_start = delalloc_start;
                        em->block_start = EXTENT_MAP_DELALLOC;
 -                      em->block_len = found;
 +                      em->block_len = delalloc_len;
                }
        } else {
                return hole_em;
@@@ -7829,6 -7777,7 +7829,7 @@@ static void btrfs_retry_endio_nocsum(st
        struct bio_vec *bvec;
        struct extent_io_tree *io_tree, *failure_tree;
        int i;
+       struct bvec_iter_all iter_all;
  
        if (bio->bi_status)
                goto end;
  
        done->uptodate = 1;
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i)
+       bio_for_each_segment_all(bvec, bio, i, iter_all)
                clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
                                 io_tree, done->start, bvec->bv_page,
                                 btrfs_ino(BTRFS_I(inode)), 0);
@@@ -7919,6 -7868,7 +7920,7 @@@ static void btrfs_retry_endio(struct bi
        int uptodate;
        int ret;
        int i;
+       struct bvec_iter_all iter_all;
  
        if (bio->bi_status)
                goto end;
        failure_tree = &BTRFS_I(inode)->io_failure_tree;
  
        ASSERT(!bio_flagged(bio, BIO_CLONED));
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
                                             bvec->bv_offset, done->start,
                                             bvec->bv_len);
@@@ -9962,6 -9912,7 +9964,6 @@@ static struct btrfs_delalloc_work *btrf
        init_completion(&work->completion);
        INIT_LIST_HEAD(&work->list);
        work->inode = inode;
 -      WARN_ON_ONCE(!inode);
        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
                        btrfs_run_delalloc_work, NULL, NULL);
  
diff --combined fs/gfs2/lops.c
index 2295042bc6259887c87bfcb786e5d4bfc511662c,15deefeaafd0d68f912c4b4733aa5dee936da43d..8722c60b11feb478fe4dbf560f8afcdcda4e2dac
@@@ -17,7 -17,9 +17,7 @@@
  #include <linux/bio.h>
  #include <linux/fs.h>
  #include <linux/list_sort.h>
 -#include <linux/blkdev.h>
  
 -#include "bmap.h"
  #include "dir.h"
  #include "gfs2.h"
  #include "incore.h"
@@@ -168,7 -170,8 +168,8 @@@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp
   * that is pinned in the pagecache.
   */
  
- static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+                                 struct bio_vec *bvec,
                                  blk_status_t error)
  {
        struct buffer_head *bh, *next;
  /**
   * gfs2_end_log_write - end of i/o to the log
   * @bio: The bio
 + * @error: Status of i/o request
   *
   * Each bio_vec contains either data from the pagecache or data
   * relating to the log itself. Here we iterate over the bio_vec
@@@ -207,6 -209,7 +208,7 @@@ static void gfs2_end_log_write(struct b
        struct bio_vec *bvec;
        struct page *page;
        int i;
+       struct bvec_iter_all iter_all;
  
        if (bio->bi_status) {
                fs_err(sdp, "Error %d writing to journal, jid=%u\n",
                wake_up(&sdp->sd_logd_waitq);
        }
  
-       bio_for_each_segment_all(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i, iter_all) {
                page = bvec->bv_page;
                if (page_has_buffers(page))
                        gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
  /**
   * gfs2_log_submit_bio - Submit any pending log bio
   * @biop: Address of the bio pointer
 - * @opf: REQ_OP | op_flags
 + * @op: REQ_OP
 + * @op_flags: req_flag_bits
   *
   * Submit any pending part-built or full bio to the block device. If
   * there is no pending bio, then this is a no-op.
   */
  
 -void gfs2_log_submit_bio(struct bio **biop, int opf)
 +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags)
  {
        struct bio *bio = *biop;
        if (bio) {
                struct gfs2_sbd *sdp = bio->bi_private;
                atomic_inc(&sdp->sd_log_in_flight);
 -              bio->bi_opf = opf;
 +              bio_set_op_attrs(bio, op, op_flags);
                submit_bio(bio);
                *biop = NULL;
        }
@@@ -304,7 -306,7 +306,7 @@@ static struct bio *gfs2_log_get_bio(str
                nblk >>= sdp->sd_fsb2bb_shift;
                if (blkno == nblk && !flush)
                        return bio;
 -              gfs2_log_submit_bio(biop, op);
 +              gfs2_log_submit_bio(biop, op, 0);
        }
  
        *biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
@@@ -375,6 -377,185 +377,6 @@@ void gfs2_log_write_page(struct gfs2_sb
                       gfs2_log_bmap(sdp));
  }
  
 -/**
 - * gfs2_end_log_read - end I/O callback for reads from the log
 - * @bio: The bio
 - *
 - * Simply unlock the pages in the bio. The main thread will wait on them and
 - * process them in order as necessary.
 - */
 -
 -static void gfs2_end_log_read(struct bio *bio)
 -{
 -      struct page *page;
 -      struct bio_vec *bvec;
 -      int i;
 -      struct bvec_iter_all iter_all;
 -
 -      bio_for_each_segment_all(bvec, bio, i, iter_all) {
 -              page = bvec->bv_page;
 -              if (bio->bi_status) {
 -                      int err = blk_status_to_errno(bio->bi_status);
 -
 -                      SetPageError(page);
 -                      mapping_set_error(page->mapping, err);
 -              }
 -              unlock_page(page);
 -      }
 -
 -      bio_put(bio);
 -}
 -
 -/**
 - * gfs2_jhead_pg_srch - Look for the journal head in a given page.
 - * @jd: The journal descriptor
 - * @page: The page to look in
 - *
 - * Returns: 1 if found, 0 otherwise.
 - */
 -
 -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
 -                            struct gfs2_log_header_host *head,
 -                            struct page *page)
 -{
 -      struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 -      struct gfs2_log_header_host uninitialized_var(lh);
 -      void *kaddr = kmap_atomic(page);
 -      unsigned int offset;
 -      bool ret = false;
 -
 -      for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
 -              if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
 -                      if (lh.lh_sequence > head->lh_sequence)
 -                              *head = lh;
 -                      else {
 -                              ret = true;
 -                              break;
 -                      }
 -              }
 -      }
 -      kunmap_atomic(kaddr);
 -      return ret;
 -}
 -
 -/**
 - * gfs2_jhead_process_page - Search/cleanup a page
 - * @jd: The journal descriptor
 - * @index: Index of the page to look into
 - * @done: If set, perform only cleanup, else search and set if found.
 - *
 - * Find the page with 'index' in the journal's mapping. Search the page for
 - * the journal head if requested (cleanup == false). Release refs on the
 - * page so the page cache can reclaim it (put_page() twice). We grabbed a
 - * reference on this page two times, first when we did a find_or_create_page()
 - * to obtain the page to add it to the bio and second when we do a
 - * find_get_page() here to get the page to wait on while I/O on it is being
 - * completed.
 - * This function is also used to free up a page we might've grabbed but not
 - * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
 - * submitted the I/O, but we already found the jhead so we only need to drop
 - * our references to the page.
 - */
 -
 -static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
 -                                  struct gfs2_log_header_host *head,
 -                                  bool *done)
 -{
 -      struct page *page;
 -
 -      page = find_get_page(jd->jd_inode->i_mapping, index);
 -      wait_on_page_locked(page);
 -
 -      if (PageError(page))
 -              *done = true;
 -
 -      if (!*done)
 -              *done = gfs2_jhead_pg_srch(jd, head, page);
 -
 -      put_page(page); /* Once for find_get_page */
 -      put_page(page); /* Once more for find_or_create_page */
 -}
 -
 -/**
 - * gfs2_find_jhead - find the head of a log
 - * @jd: The journal descriptor
 - * @head: The log descriptor for the head of the log is returned here
 - *
 - * Do a search of a journal by reading it in large chunks using bios and find
 - * the valid log entry with the highest sequence number.  (i.e. the log head)
 - *
 - * Returns: 0 on success, errno otherwise
 - */
 -
 -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
 -{
 -      struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 -      struct address_space *mapping = jd->jd_inode->i_mapping;
 -      struct gfs2_journal_extent *je;
 -      u32 block, read_idx = 0, submit_idx = 0, index = 0;
 -      int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
 -      int blocks_per_page = 1 << shift, sz, ret = 0;
 -      struct bio *bio = NULL;
 -      struct page *page;
 -      bool done = false;
 -      errseq_t since;
 -
 -      memset(head, 0, sizeof(*head));
 -      if (list_empty(&jd->extent_list))
 -              gfs2_map_journal_extents(sdp, jd);
 -
 -      since = filemap_sample_wb_err(mapping);
 -      list_for_each_entry(je, &jd->extent_list, list) {
 -              for (block = 0; block < je->blocks; block += blocks_per_page) {
 -                      index = (je->lblock + block) >> shift;
 -
 -                      page = find_or_create_page(mapping, index, GFP_NOFS);
 -                      if (!page) {
 -                              ret = -ENOMEM;
 -                              done = true;
 -                              goto out;
 -                      }
 -
 -                      if (bio) {
 -                              sz = bio_add_page(bio, page, PAGE_SIZE, 0);
 -                              if (sz == PAGE_SIZE)
 -                                      goto page_added;
 -                              submit_idx = index;
 -                              submit_bio(bio);
 -                              bio = NULL;
 -                      }
 -
 -                      bio = gfs2_log_alloc_bio(sdp,
 -                                               je->dblock + (index << shift),
 -                                               gfs2_end_log_read);
 -                      bio->bi_opf = REQ_OP_READ;
 -                      sz = bio_add_page(bio, page, PAGE_SIZE, 0);
 -                      gfs2_assert_warn(sdp, sz == PAGE_SIZE);
 -
 -page_added:
 -                      if (submit_idx <= read_idx + BIO_MAX_PAGES) {
 -                              /* Keep at least one bio in flight */
 -                              continue;
 -                      }
 -
 -                      gfs2_jhead_process_page(jd, read_idx++, head, &done);
 -                      if (done)
 -                              goto out;  /* found */
 -              }
 -      }
 -
 -out:
 -      if (bio)
 -              submit_bio(bio);
 -      while (read_idx <= index)
 -              gfs2_jhead_process_page(jd, read_idx++, head, &done);
 -
 -      if (!ret)
 -              ret = filemap_check_wb_err(mapping, since);
 -
 -      return ret;
 -}
 -
  static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
                                      u32 ld_length, u32 ld_data1)
  {
diff --combined fs/xfs/xfs_aops.c
index 7b8bb6bde981028ad692fa07c87b0e4911bf3436,55f3e194a8522b748196128a8aee06e1e0686be6..3619e9e8d359e839b8ac88b633a52eedff360301
@@@ -28,8 -28,7 +28,8 @@@
   */
  struct xfs_writepage_ctx {
        struct xfs_bmbt_irec    imap;
 -      unsigned int            io_type;
 +      int                     fork;
 +      unsigned int            data_seq;
        unsigned int            cow_seq;
        struct xfs_ioend        *ioend;
  };
@@@ -63,7 -62,7 +63,7 @@@ xfs_find_daxdev_for_inode
  static void
  xfs_finish_page_writeback(
        struct inode            *inode,
-       struct bio_vec          *bvec,
+       struct bio_vec  *bvec,
        int                     error)
  {
        struct iomap_page       *iop = to_iomap_page(bvec->bv_page);
@@@ -99,6 -98,7 +99,7 @@@ xfs_destroy_ioend
        for (bio = &ioend->io_inline_bio; bio; bio = next) {
                struct bio_vec  *bvec;
                int             i;
+               struct bvec_iter_all iter_all;
  
                /*
                 * For the last bio, bi_private points to the ioend, so we
                        next = bio->bi_private;
  
                /* walk each page on bio, ending page IO on them */
-               bio_for_each_segment_all(bvec, bio, i)
+               bio_for_each_segment_all(bvec, bio, i, iter_all)
                        xfs_finish_page_writeback(inode, bvec, error);
                bio_put(bio);
        }
@@@ -256,20 -256,30 +257,20 @@@ xfs_end_io
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
 -              switch (ioend->io_type) {
 -              case XFS_IO_COW:
 +              if (ioend->io_fork == XFS_COW_FORK)
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
 -                      break;
 -              }
 -
                goto done;
        }
  
        /*
 -       * Success:  commit the COW or unwritten blocks if needed.
 +       * Success: commit the COW or unwritten blocks if needed.
         */
 -      switch (ioend->io_type) {
 -      case XFS_IO_COW:
 +      if (ioend->io_fork == XFS_COW_FORK)
                error = xfs_reflink_end_cow(ip, offset, size);
 -              break;
 -      case XFS_IO_UNWRITTEN:
 -              /* writeback should never update isize */
 +      else if (ioend->io_state == XFS_EXT_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
 -              break;
 -      default:
 +      else
                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
 -              break;
 -      }
  
  done:
        if (ioend->io_append_trans)
@@@ -284,8 -294,7 +285,8 @@@ xfs_end_bio
        struct xfs_ioend        *ioend = bio->bi_private;
        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
  
 -      if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 +      if (ioend->io_fork == XFS_COW_FORK ||
 +          ioend->io_state == XFS_EXT_UNWRITTEN)
                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
        else if (ioend->io_append_trans)
                queue_work(mp->m_data_workqueue, &ioend->io_work);
                xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
  }
  
 +/*
 + * Fast revalidation of the cached writeback mapping. Return true if the current
 + * mapping is valid, false otherwise.
 + */
 +static bool
 +xfs_imap_valid(
 +      struct xfs_writepage_ctx        *wpc,
 +      struct xfs_inode                *ip,
 +      xfs_fileoff_t                   offset_fsb)
 +{
 +      if (offset_fsb < wpc->imap.br_startoff ||
 +          offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
 +              return false;
 +      /*
 +       * If this is a COW mapping, it is sufficient to check that the mapping
 +       * covers the offset. Be careful to check this first because the caller
 +       * can revalidate a COW mapping without updating the data seqno.
 +       */
 +      if (wpc->fork == XFS_COW_FORK)
 +              return true;
 +
 +      /*
 +       * This is not a COW mapping. Check the sequence number of the data fork
 +       * because concurrent changes could have invalidated the extent. Check
 +       * the COW fork because concurrent changes since the last time we
 +       * checked (and found nothing at this offset) could have added
 +       * overlapping blocks.
 +       */
 +      if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
 +              return false;
 +      if (xfs_inode_has_cow_data(ip) &&
 +          wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
 +              return false;
 +      return true;
 +}
 +
 +/*
 + * Pass in a delalloc extent and convert it to real extents, return the real
 + * extent that maps offset_fsb in wpc->imap.
 + *
 + * The current page is held locked so nothing could have removed the block
 + * backing offset_fsb, although it could have moved from the COW to the data
 + * fork by another thread.
 + */
 +static int
 +xfs_convert_blocks(
 +      struct xfs_writepage_ctx *wpc,
 +      struct xfs_inode        *ip,
 +      xfs_fileoff_t           offset_fsb)
 +{
 +      int                     error;
 +
 +      /*
 +       * Attempt to allocate whatever delalloc extent currently backs
 +       * offset_fsb and put the result into wpc->imap.  Allocate in a loop
 +       * because it may take several attempts to allocate real blocks for a
 +       * contiguous delalloc extent if free space is sufficiently fragmented.
 +       */
 +      do {
 +              error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
 +                              &wpc->imap, wpc->fork == XFS_COW_FORK ?
 +                                      &wpc->cow_seq : &wpc->data_seq);
 +              if (error)
 +                      return error;
 +      } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
 +
 +      return 0;
 +}
 +
  STATIC int
  xfs_map_blocks(
        struct xfs_writepage_ctx *wpc,
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
 -      xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
 +      xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
 +      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb = NULLFILEOFF;
        struct xfs_bmbt_irec    imap;
 -      int                     whichfork = XFS_DATA_FORK;
        struct xfs_iext_cursor  icur;
 -      bool                    imap_valid;
 +      int                     retries = 0;
        int                     error = 0;
  
 -      /*
 -       * We have to make sure the cached mapping is within EOF to protect
 -       * against eofblocks trimming on file release leaving us with a stale
 -       * mapping. Otherwise, a page for a subsequent file extending buffered
 -       * write could get picked up by this writeback cycle and written to the
 -       * wrong blocks.
 -       *
 -       * Note that what we really want here is a generic mapping invalidation
 -       * mechanism to protect us from arbitrary extent modifying contexts, not
 -       * just eofblocks.
 -       */
 -      xfs_trim_extent_eof(&wpc->imap, ip);
 +      if (XFS_FORCED_SHUTDOWN(mp))
 +              return -EIO;
  
        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * against concurrent updates and provides a memory barrier on the way
         * out that ensures that we always see the current value.
         */
 -      imap_valid = offset_fsb >= wpc->imap.br_startoff &&
 -                   offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
 -      if (imap_valid &&
 -          (!xfs_inode_has_cow_data(ip) ||
 -           wpc->io_type == XFS_IO_COW ||
 -           wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
 +      if (xfs_imap_valid(wpc, ip, offset_fsb))
                return 0;
  
 -      if (XFS_FORCED_SHUTDOWN(mp))
 -              return -EIO;
 -
        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
 +retry:
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));
 -      ASSERT(offset <= mp->m_super->s_maxbytes);
 -
 -      if (offset > mp->m_super->s_maxbytes - count)
 -              count = mp->m_super->s_maxbytes - offset;
 -      end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
  
        /*
         * Check if this offset is covered by a COW extent, and if yes use
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
 -              /*
 -               * Truncate can race with writeback since writeback doesn't
 -               * take the iolock and truncate decreases the file size before
 -               * it starts truncating the pages between new_size and old_size.
 -               * Therefore, we can end up in the situation where writeback
 -               * gets a CoW fork mapping but the truncate makes the mapping
 -               * invalid and we end up in here trying to get a new mapping.
 -               * bail out here so that we simply never get a valid mapping
 -               * and so we drop the write altogether.  The page truncation
 -               * will kill the contents anyway.
 -               */
 -              if (offset > i_size_read(inode)) {
 -                      wpc->io_type = XFS_IO_HOLE;
 -                      return 0;
 -              }
 -              whichfork = XFS_COW_FORK;
 -              wpc->io_type = XFS_IO_COW;
 +
 +              wpc->fork = XFS_COW_FORK;
                goto allocate_blocks;
        }
  
        /*
 -       * Map valid and no COW extent in the way?  We're done.
 +       * No COW extent overlap. Revalidate now that we may have updated
 +       * ->cow_seq. If the data mapping is still valid, we're done.
         */
 -      if (imap_valid) {
 +      if (xfs_imap_valid(wpc, ip, offset_fsb)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
 +      wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
 +      wpc->fork = XFS_DATA_FORK;
 +
 +      /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
 -              /* landed in a hole or beyond EOF */
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
 -              wpc->io_type = XFS_IO_HOLE;
 -      } else {
 -              /*
 -               * Truncate to the next COW extent if there is one.  This is the
 -               * only opportunity to do this because we can skip COW fork
 -               * lookups for the subsequent blocks in the mapping; however,
 -               * the requirement to treat the COW range separately remains.
 -               */
 -              if (cow_fsb != NULLFILEOFF &&
 -                  cow_fsb < imap.br_startoff + imap.br_blockcount)
 -                      imap.br_blockcount = cow_fsb - imap.br_startoff;
 -
 -              if (isnullstartblock(imap.br_startblock)) {
 -                      /* got a delalloc extent */
 -                      wpc->io_type = XFS_IO_DELALLOC;
 -                      goto allocate_blocks;
 -              }
 -
 -              if (imap.br_state == XFS_EXT_UNWRITTEN)
 -                      wpc->io_type = XFS_IO_UNWRITTEN;
 -              else
 -                      wpc->io_type = XFS_IO_OVERWRITE;
 +              imap.br_state = XFS_EXT_NORM;
        }
  
 +      /*
 +       * Truncate to the next COW extent if there is one.  This is the only
 +       * opportunity to do this because we can skip COW fork lookups for the
 +       * subsequent blocks in the mapping; however, the requirement to treat
 +       * the COW range separately remains.
 +       */
 +      if (cow_fsb != NULLFILEOFF &&
 +          cow_fsb < imap.br_startoff + imap.br_blockcount)
 +              imap.br_blockcount = cow_fsb - imap.br_startoff;
 +
 +      /* got a delalloc extent? */
 +      if (imap.br_startblock != HOLESTARTBLOCK &&
 +          isnullstartblock(imap.br_startblock))
 +              goto allocate_blocks;
 +
        wpc->imap = imap;
 -      xfs_trim_extent_eof(&wpc->imap, ip);
 -      trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
 +      trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
        return 0;
  allocate_blocks:
 -      error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
 -                      &wpc->cow_seq);
 -      if (error)
 +      error = xfs_convert_blocks(wpc, ip, offset_fsb);
 +      if (error) {
 +              /*
 +               * If we failed to find the extent in the COW fork we might have
 +               * raced with a COW to data fork conversion or truncate.
 +               * Restart the lookup to catch the extent in the data fork for
 +               * the former case, but prevent additional retries to avoid
 +               * looping forever for the latter case.
 +               */
 +              if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
 +                      goto retry;
 +              ASSERT(error != -EAGAIN);
                return error;
 -      ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
 -             imap.br_startoff + imap.br_blockcount <= cow_fsb);
 -      wpc->imap = imap;
 -      xfs_trim_extent_eof(&wpc->imap, ip);
 -      trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
 +      }
 +
 +      /*
 +       * Due to merging the return real extent might be larger than the
 +       * original delalloc one.  Trim the return extent to the next COW
 +       * boundary again to force a re-lookup.
 +       */
 +      if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
 +          cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
 +              wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
 +
 +      ASSERT(wpc->imap.br_startoff <= offset_fsb);
 +      ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
 +      trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
        return 0;
  }
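
The rewritten xfs_map_blocks() above no longer trims the cached mapping to EOF; xfs_imap_valid() instead compares the per-fork sequence counters (data_seq, cow_seq) recorded when the mapping was cached against the live values, and discards the cache as soon as either fork has been modified. The same pattern can be sketched outside the kernel: remember the sequence number a mapping was derived from, and trust the cache only while the counter has not moved. Everything below is a hypothetical plain-C11 illustration, not XFS or block-layer API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Writer side: bump the sequence number on every extent-tree change. */
struct extent_tree {
	atomic_uint	seq;
	/* ... extent records ... */
};

/* Reader side: the cached mapping remembers the sequence it was built from. */
struct cached_map {
	uint64_t	start;		/* first block covered by the mapping */
	uint64_t	len;		/* length in blocks */
	unsigned int	seen_seq;	/* tree seq when the mapping was cached */
};

static void cache_map(struct cached_map *c, struct extent_tree *t,
		      uint64_t start, uint64_t len)
{
	c->start = start;
	c->len = len;
	c->seen_seq = atomic_load(&t->seq);
}

/* Usable only if it covers @blk and nothing changed the tree since caching. */
static bool cached_map_valid(struct cached_map *c, struct extent_tree *t,
			     uint64_t blk)
{
	if (blk < c->start || blk >= c->start + c->len)
		return false;
	return c->seen_seq == atomic_load(&t->seq);
}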
  
@@@ -525,7 -487,7 +526,7 @@@ xfs_submit_ioend
        int                     status)
  {
        /* Convert CoW extents to regular */
 -      if (!status && ioend->io_type == XFS_IO_COW) {
 +      if (!status && ioend->io_fork == XFS_COW_FORK) {
                /*
                 * Yuk. This can do memory allocation, but is not a
                 * transactional operation so everything is done in GFP_KERNEL
  
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
 -          ioend->io_type != XFS_IO_UNWRITTEN &&
 +          (ioend->io_fork == XFS_COW_FORK ||
 +           ioend->io_state != XFS_EXT_UNWRITTEN) &&
            xfs_ioend_is_append(ioend) &&
            !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
  static struct xfs_ioend *
  xfs_alloc_ioend(
        struct inode            *inode,
 -      unsigned int            type,
 +      int                     fork,
 +      xfs_exntst_t            state,
        xfs_off_t               offset,
        struct block_device     *bdev,
        sector_t                sector)
  
        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
        INIT_LIST_HEAD(&ioend->io_list);
 -      ioend->io_type = type;
 +      ioend->io_fork = fork;
 +      ioend->io_state = state;
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
@@@ -650,23 -609,21 +651,23 @@@ xfs_add_to_ioend
        sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
                ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
  
 -      if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 +      if (!wpc->ioend ||
 +          wpc->fork != wpc->ioend->io_fork ||
 +          wpc->imap.br_state != wpc->ioend->io_state ||
            sector != bio_end_sector(wpc->ioend->io_bio) ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
 -              wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
 -                              bdev, sector);
 +              wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
 +                              wpc->imap.br_state, offset, bdev, sector);
        }
  
-       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
                if (iop)
                        atomic_inc(&iop->write_count);
                if (bio_full(wpc->ioend->io_bio))
                        xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-               __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+               bio_add_page(wpc->ioend->io_bio, page, len, poff);
        }
  
        wpc->ioend->io_size += len;
@@@ -767,7 -724,7 +768,7 @@@ xfs_writepage_map
                error = xfs_map_blocks(wpc, inode, file_offset);
                if (error)
                        break;
 -              if (wpc->io_type == XFS_IO_HOLE)
 +              if (wpc->imap.br_startblock == HOLESTARTBLOCK)
                        continue;
                xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
                                 &submit_list);
@@@ -962,7 -919,9 +963,7 @@@ xfs_vm_writepage
        struct page             *page,
        struct writeback_control *wbc)
  {
 -      struct xfs_writepage_ctx wpc = {
 -              .io_type = XFS_IO_HOLE,
 -      };
 +      struct xfs_writepage_ctx wpc = { };
        int                     ret;
  
        ret = xfs_do_writepage(page, wbc, &wpc);
@@@ -976,7 -935,9 +977,7 @@@ xfs_vm_writepages
        struct address_space    *mapping,
        struct writeback_control *wbc)
  {
 -      struct xfs_writepage_ctx wpc = {
 -              .io_type = XFS_IO_HOLE,
 -      };
 +      struct xfs_writepage_ctx wpc = { };
        int                     ret;
  
        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@@ -1023,7 -984,7 +1024,7 @@@ xfs_vm_bmap
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
 -      if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 +      if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_iomap_ops);
  }
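
The xfs_add_to_ioend() hunk above keeps appending pages to the open ioend only while the new block matches it in fork, extent state, on-disk sector contiguity and file-offset contiguity; any mismatch queues the current ioend and allocates a fresh one keyed on the (fork, state) pair instead of the old io_type. The shape of that check, reduced to a stand-alone helper with hypothetical names (not the XFS code itself):

#include <stdbool.h>
#include <stdint.h>

/* One in-flight batch of contiguous writeback blocks (illustrative only). */
struct wb_batch {
	int		fork;		/* data vs COW fork the blocks came from */
	int		state;		/* written vs unwritten extent state */
	uint64_t	next_sector;	/* disk sector just past the batch */
	uint64_t	next_offset;	/* file offset just past the batch */
};

/* Extend the open batch only if everything lines up; otherwise start a new one. */
static bool wb_batch_can_extend(const struct wb_batch *b, int fork, int state,
				uint64_t sector, uint64_t offset)
{
	return b && b->fork == fork && b->state == state &&
	       sector == b->next_sector && offset == b->next_offset;
}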
diff --combined fs/xfs/xfs_file.c
index 770cc2edf777f4bb3ef6089986d5d49f75788ee4,60c2da41f0fc2890006d6bc9b9eb6dcb955b85b0..1f2e2845eb76c2c78a932c913057e1028cec2f05
@@@ -507,7 -507,7 +507,7 @@@ xfs_file_dio_aio_write
                 * We can't properly handle unaligned direct I/O to reflink
                 * files yet, as we can't unshare a partial block.
                 */
 -              if (xfs_is_reflink_inode(ip)) {
 +              if (xfs_is_cow_inode(ip)) {
                        trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
                        return -EREMCHG;
                }
@@@ -872,27 -872,14 +872,27 @@@ xfs_file_fallocate
                                goto out_unlock;
                }
  
 -              if (mode & FALLOC_FL_ZERO_RANGE)
 +              if (mode & FALLOC_FL_ZERO_RANGE) {
                        error = xfs_zero_file_space(ip, offset, len);
 -              else {
 -                      if (mode & FALLOC_FL_UNSHARE_RANGE) {
 -                              error = xfs_reflink_unshare(ip, offset, len);
 -                              if (error)
 -                                      goto out_unlock;
 +              } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
 +                      error = xfs_reflink_unshare(ip, offset, len);
 +                      if (error)
 +                              goto out_unlock;
 +
 +                      if (!xfs_is_always_cow_inode(ip)) {
 +                              error = xfs_alloc_file_space(ip, offset, len,
 +                                              XFS_BMAPI_PREALLOC);
                        }
 +              } else {
 +                      /*
 +                       * If always_cow mode we can't use preallocations and
 +                       * thus should not create them.
 +                       */
 +                      if (xfs_is_always_cow_inode(ip)) {
 +                              error = -EOPNOTSUPP;
 +                              goto out_unlock;
 +                      }
 +
                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
                }
@@@ -1081,10 -1068,10 +1081,10 @@@ xfs_file_llseek
        default:
                return generic_file_llseek(file, offset, whence);
        case SEEK_HOLE:
 -              offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
 +              offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
                break;
        case SEEK_DATA:
 -              offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
 +              offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
                break;
        }
  
@@@ -1216,6 -1203,7 +1216,7 @@@ const struct file_operations xfs_file_o
        .write_iter     = xfs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
+       .iopoll         = iomap_dio_iopoll,
        .unlocked_ioctl = xfs_file_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
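
xfs_file_operations now sets .iopoll = iomap_dio_iopoll, which pairs with the new ->iopoll hook and the ki_cookie field added to struct kiocb in include/linux/fs.h below: the submitter records the queue cookie at submission time, and completion can then be driven by polling the request queue rather than sleeping on an interrupt. A block-device style ->iopoll could look roughly like this; treat it as a sketch that assumes ki_cookie was filled in by the direct I/O submission path, not as code quoted from this merge:

#include <linux/blkdev.h>
#include <linux/fs.h>

static int example_bdev_iopoll(struct kiocb *kiocb, bool spin)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	/* ki_cookie holds the cookie returned when the bio was submitted. */
	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}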
diff --combined include/linux/fs.h
index 2cc540805a02e6d99ca50dd0a6607a0240737b28,dedcc2e9265cb44282425a3440d75df67d84256f..7442329a0011d62197e7b761c0fe3945847bc2cd
@@@ -37,9 -37,6 +37,9 @@@
  #include <linux/uuid.h>
  #include <linux/errseq.h>
  #include <linux/ioprio.h>
 +#include <linux/fs_types.h>
 +#include <linux/build_bug.h>
 +#include <linux/stddef.h>
  
  #include <asm/byteorder.h>
  #include <uapi/linux/fs.h>
@@@ -307,19 -304,14 +307,20 @@@ enum rw_hint 
  
  struct kiocb {
        struct file             *ki_filp;
 +
 +      /* The 'ki_filp' pointer is shared in a union for aio */
 +      randomized_struct_fields_start
 +
        loff_t                  ki_pos;
        void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
        void                    *private;
        int                     ki_flags;
        u16                     ki_hint;
        u16                     ki_ioprio; /* See linux/ioprio.h */
 -} __randomize_layout;
+       unsigned int            ki_cookie; /* for ->iopoll */
 +
 +      randomized_struct_fields_end
 +};
  
  static inline bool is_sync_kiocb(struct kiocb *kiocb)
  {
@@@ -1709,6 -1701,22 +1710,6 @@@ int fiemap_fill_next_extent(struct fiem
                            u64 phys, u64 len, u32 flags);
  int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
  
 -/*
 - * File types
 - *
 - * NOTE! These match bits 12..15 of stat.st_mode
 - * (ie "(i_mode >> 12) & 15").
 - */
 -#define DT_UNKNOWN    0
 -#define DT_FIFO               1
 -#define DT_CHR                2
 -#define DT_DIR                4
 -#define DT_BLK                6
 -#define DT_REG                8
 -#define DT_LNK                10
 -#define DT_SOCK               12
 -#define DT_WHT                14
 -
  /*
   * This is the "filldir" function type, used by readdir() to let
   * the kernel specify what kind of dirent layout it wants to have.
@@@ -1780,6 -1788,7 +1781,7 @@@ struct file_operations 
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+       int (*iopoll)(struct kiocb *kiocb, bool spin);
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
@@@ -2078,7 -2087,7 +2080,7 @@@ static inline void init_sync_kiocb(stru
   * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
   *                    synchronize competing switching instances and to tell
   *                    wb stat updates to grab the i_pages lock.  See
 - *                    inode_switch_wb_work_fn() for details.
 + *                    inode_switch_wbs_work_fn() for details.
   *
   * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
   *                    and work dirs among overlayfs mounts.
@@@ -2480,7 -2489,6 +2482,7 @@@ struct filename 
        struct audit_names      *aname;
        const char              iname[];
  };
 +static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
  
  extern long vfs_truncate(const struct path *, loff_t);
  extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
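
The static_assert() added to struct filename above documents, at compile time, that the inline iname[] flexible array starts on a long-aligned boundary; reordering the members would now break the build instead of silently misaligning the name. The idiom in isolation, using C11 _Static_assert in place of the kernel wrapper and a hypothetical struct:

#include <stddef.h>

struct name_buf {
	long		refcnt;
	const char	*uptr;
	const char	name[];		/* inline flexible array */
};

/* Build fails if the flexible array ever loses its long alignment. */
_Static_assert(offsetof(struct name_buf, name) % sizeof(long) == 0,
	       "name[] must start long-aligned");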