Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
author    Linus Torvalds <[email protected]>
Tue, 14 Nov 2017 23:32:19 +0000 (15:32 -0800)
committer Linus Torvalds <[email protected]>
Tue, 14 Nov 2017 23:32:19 +0000 (15:32 -0800)
Pull core block layer updates from Jens Axboe:
 "This is the main pull request for block storage for 4.15-rc1.

  Nothing out of the ordinary in here, and no API changes or anything
  like that. Just various new features for drivers, core changes, etc.
  In particular, this pull request contains:

   - A patch series from Bart, closing the hole on blk/scsi-mq queue
     quiescing.

   - A series from Christoph, building towards hidden gendisks (for
     multipath) and ability to move bio chains around.

   - NVMe
        - Support for native multipath for NVMe (Christoph).
        - Userspace notifications for AENs (Keith).
        - Command side-effects support (Keith).
        - SGL support (Chaitanya Kulkarni)
        - FC fixes and improvements (James Smart)
        - Lots of fixes and tweaks (Various)

   - bcache
        - New maintainer (Michael Lyle)
        - Writeback control improvements (Michael)
        - Various fixes (Coly, Elena, Eric, Liang, et al)

   - lightnvm updates, mostly centered around the pblk interface
     (Javier, Hans, and Rakesh).

   - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)

   - Writeback series that fix the much discussed hundreds of millions
     of sync-all units. This goes all the way, as discussed previously
     (me).

   - Fix for missing wakeup on writeback timer adjustments (Yafang
     Shao).

   - Fix laptop mode on blk-mq (me).

   - {mq,name} tuple lookup for IO schedulers, allowing us to have
     alias names. This means you can use 'deadline' on both !mq and mq
     (where it's called mq-deadline); see the lookup sketch after this
     message (me).

   - blktrace race fix, oopsing on sg load (me).

   - blk-mq optimizations (me).

   - Obscure waitqueue race fix for kyber (Omar).

   - NBD fixes (Josef).

   - Disable writeback throttling by default on bfq, like we do on cfq
     (Luca Miccio).

   - Series from Ming that enable us to treat flush requests on blk-mq
     like any other request. This is a really nice cleanup.

   - Series from Ming that improves merging on blk-mq with schedulers,
     getting us closer to flipping the switch on scsi-mq again.

   - BFQ updates (Paolo).

   - blk-mq atomic flags memory ordering fixes (Peter Z).

   - Loop cgroup support (Shaohua).

   - Lots of minor fixes from lots of different folks, both for core and
     driver code"

* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
  nvme: fix visibility of "uuid" ns attribute
  blk-mq: fixup some comment typos and lengths
  ide: ide-atapi: fix compile error with defining macro DEBUG
  blk-mq: improve tag waiting setup for non-shared tags
  brd: remove unused brd_mutex
  blk-mq: only run the hardware queue if IO is pending
  block: avoid null pointer dereference on null disk
  fs: guard_bio_eod() needs to consider partitions
  xtensa/simdisk: fix compile error
  nvme: expose subsys attribute to sysfs
  nvme: create 'slaves' and 'holders' entries for hidden controllers
  block: create 'slaves' and 'holders' entries for hidden gendisks
  nvme: also expose the namespace identification sysfs files for mpath nodes
  nvme: implement multipath access to nvme subsystems
  nvme: track shared namespaces
  nvme: introduce a nvme_ns_ids structure
  nvme: track subsystems
  block, nvme: Introduce blk_mq_req_flags_t
  block, scsi: Make SCSI quiesce and resume work reliably
  block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
  ...

60 files changed:
MAINTAINERS
block/bio.c
block/blk-lib.c
block/blk-mq-debugfs.c
block/blk-mq-tag.h
block/blk-mq.h
block/blk-throttle.c
block/blk-wbt.c
block/blk.h
block/genhd.c
drivers/block/Kconfig
drivers/block/nbd.c
drivers/block/null_blk.c
drivers/block/paride/Kconfig
drivers/block/skd_main.c
drivers/cdrom/Makefile
drivers/ide/ide-pm.c
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/closure.h
drivers/md/bcache/request.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c
drivers/md/bcache/writeback.h
drivers/md/dm.c
drivers/nvme/host/Makefile
drivers/nvme/host/core.c
drivers/nvme/host/fc.c
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/target/core.c
drivers/nvme/target/nvmet.h
drivers/scsi/scsi_lib.c
drivers/scsi/sg.c
fs/block_dev.c
fs/buffer.c
fs/direct-io.c
fs/iomap.c
fs/sync.c
include/linux/backing-dev-defs.h
include/linux/backing-dev.h
include/linux/blk-cgroup.h
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/buffer_head.h
include/linux/elevator.h
include/linux/genhd.h
include/linux/kthread.h
include/linux/lightnvm.h
include/linux/writeback.h
include/scsi/scsi_device.h
include/trace/events/writeback.h
kernel/kthread.c
kernel/sysctl.c
mm/page_io.c
mm/vmscan.c

diff --combined MAINTAINERS
index e372994747b7ea2340344ea46ca013cc7f3e5562,6e9343af6bbfa5088d555c74e69fa9dd84846133..ba3d8c197d92b8c4973d9a7952a419f03a488558
@@@ -873,7 -873,7 +873,7 @@@ F: drivers/android
  F:    drivers/staging/android/
  
  ANDROID GOLDFISH RTC DRIVER
 -M:    Miodrag Dinic <miodrag.dinic@imgtec.com>
 +M:    Miodrag Dinic <miodrag.dinic@mips.com>
  S:    Supported
  F:    Documentation/devicetree/bindings/rtc/google,goldfish-rtc.txt
  F:    drivers/rtc/rtc-goldfish.c
@@@ -2562,10 -2562,12 +2562,12 @@@ S:   Maintaine
  F:    drivers/net/hamradio/baycom*
  
  BCACHE (BLOCK LAYER CACHE)
+ M:    Michael Lyle <[email protected]>
  M:    Kent Overstreet <[email protected]>
  L:    [email protected]
  W:    http://bcache.evilpiepirate.org
- S:    Orphan
+ C:    irc://irc.oftc.net/bcache
+ S:    Maintained
  F:    drivers/md/bcache/
  
  BDISP ST MEDIA DRIVER
@@@ -2896,13 -2898,6 +2898,13 @@@ S:    Supporte
  F:    drivers/gpio/gpio-brcmstb.c
  F:    Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
  
 +BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER
 +M:    Al Cooper <[email protected]>
 +L:    [email protected]
 +L:    [email protected]
 +S:    Maintained
 +F:    drivers/phy/broadcom/phy-brcm-usb*
 +
  BROADCOM GENET ETHERNET DRIVER
  M:    Florian Fainelli <[email protected]>
  L:    [email protected]
@@@ -3451,8 -3446,7 +3453,8 @@@ M:      Thomas Gleixner <[email protected]
  L:    [email protected]
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
  S:    Supported
 -F:    drivers/clocksource
 +F:    drivers/clocksource/
 +F:    Documentation/devicetree/bindings/timer/
  
  CMPC ACPI DRIVER
  M:    Thadeu Lima de Souza Cascardo <[email protected]>
@@@ -3644,8 -3638,6 +3646,8 @@@ F:      drivers/cpufreq/arm_big_little_dt.
  
  CPU POWER MONITORING SUBSYSTEM
  M:    Thomas Renninger <[email protected]>
 +M:    Shuah Khan <[email protected]>
 +M:    Shuah Khan <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    tools/power/cpupower/
@@@ -4244,7 -4236,7 +4246,7 @@@ S:      Maintaine
  F:    drivers/dma/
  F:    include/linux/dmaengine.h
  F:    Documentation/devicetree/bindings/dma/
 -F:    Documentation/dmaengine/
 +F:    Documentation/driver-api/dmaengine/
  T:    git git://git.infradead.org/users/vkoul/slave-dma.git
  
  DMA MAPPING HELPERS
  S:    Maintained
  F:    drivers/edac/highbank*
  
 -EDAC-CAVIUM
 +EDAC-CAVIUM OCTEON
  M:    Ralf Baechle <[email protected]>
  M:    David Daney <[email protected]>
  L:    [email protected]
  L:    [email protected]
  S:    Supported
  F:    drivers/edac/octeon_edac*
 +
 +EDAC-CAVIUM THUNDERX
 +M:    David Daney <[email protected]>
 +M:    Jan Glauber <[email protected]>
 +L:    [email protected]
 +S:    Supported
  F:    drivers/edac/thunderx_edac*
  
  EDAC-CORE
@@@ -5229,7 -5215,8 +5231,7 @@@ F:      fs/ext4
  
  Extended Verification Module (EVM)
  M:    Mimi Zohar <[email protected]>
 -L:    [email protected]
 -L:    [email protected]
 +L:    [email protected]
  S:    Supported
  F:    security/integrity/evm/
  
@@@ -5274,8 -5261,7 +5276,8 @@@ S:      Maintaine
  F:    drivers/iommu/exynos-iommu.c
  
  EZchip NPS platform support
 -M:    Noam Camus <[email protected]>
 +M:    Elad Kanfi <[email protected]>
 +M:    Vineet Gupta <[email protected]>
  S:    Supported
  F:    arch/arc/plat-eznps
  F:    arch/arc/boot/dts/eznps.dts
@@@ -5361,7 -5347,9 +5363,7 @@@ M:      "J. Bruce Fields" <bfields@fieldses.
  L:    [email protected]
  S:    Maintained
  F:    include/linux/fcntl.h
 -F:    include/linux/fs.h
  F:    include/uapi/linux/fcntl.h
 -F:    include/uapi/linux/fs.h
  F:    fs/fcntl.c
  F:    fs/locks.c
  
@@@ -5370,8 -5358,6 +5372,8 @@@ M:      Alexander Viro <[email protected]
  L:    [email protected]
  S:    Maintained
  F:    fs/*
 +F:    include/linux/fs.h
 +F:    include/uapi/linux/fs.h
  
  FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
  M:    Riku Voipio <[email protected]>
@@@ -5484,7 -5470,7 +5486,7 @@@ F:      include/uapi/linux/fb.
  
  FREESCALE CAAM (Cryptographic Acceleration and Assurance Module) DRIVER
  M:    Horia Geantă <[email protected]>
 -M:    Dan Douglass <dan.douglass@nxp.com>
 +M:    Aymen Sghaier <aymen.sghaier@nxp.com>
  L:    [email protected]
  S:    Maintained
  F:    drivers/crypto/caam/
@@@ -5664,7 -5650,6 +5666,7 @@@ T:      git git://git.kernel.org/pub/scm/lin
  S:    Supported
  F:    fs/crypto/
  F:    include/linux/fscrypt*.h
 +F:    Documentation/filesystems/fscrypt.rst
  
  FUJITSU FR-V (FRV) PORT
  S:    Orphan
@@@ -6687,7 -6672,7 +6689,7 @@@ F:      include/net/ieee802154_netdev.
  F:    Documentation/networking/ieee802154.txt
  
  IFE PROTOCOL
 -M:    Yotam Gigi <yotamg@mellanox.com>
 +M:    Yotam Gigi <yotam.gi@gmail.com>
  M:    Jamal Hadi Salim <[email protected]>
  F:    net/ife
  F:    include/net/ife.h
@@@ -6749,13 -6734,13 +6751,13 @@@ S:   Maintaine
  F:    drivers/usb/atm/ueagle-atm.c
  
  IMGTEC ASCII LCD DRIVER
 -M:    Paul Burton <paul.burton@imgtec.com>
 +M:    Paul Burton <paul.burton@mips.com>
  S:    Maintained
  F:    Documentation/devicetree/bindings/auxdisplay/img-ascii-lcd.txt
  F:    drivers/auxdisplay/img-ascii-lcd.c
  
  IMGTEC IR DECODER DRIVER
 -M:    James Hogan <j[email protected]>
 +M:    James Hogan <j[email protected]>
  S:    Maintained
  F:    drivers/media/rc/img-ir/
  
@@@ -6857,7 -6842,9 +6859,7 @@@ L:      [email protected]
  INTEGRITY MEASUREMENT ARCHITECTURE (IMA)
  M:    Mimi Zohar <[email protected]>
  M:    Dmitry Kasatkin <[email protected]>
 -L:    [email protected]
 -L:    [email protected]
 -L:    [email protected]
 +L:    [email protected]
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git
  S:    Supported
  F:    security/integrity/ima/
@@@ -7577,7 -7564,7 +7579,7 @@@ F:      arch/arm64/include/asm/kvm
  F:    arch/arm64/kvm/
  
  KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
 -M:    James Hogan <j[email protected]>
 +M:    James Hogan <j[email protected]>
  L:    [email protected]
  S:    Supported
  F:    arch/mips/include/uapi/asm/kvm*
@@@ -7585,7 -7572,7 +7587,7 @@@ F:      arch/mips/include/asm/kvm
  F:    arch/mips/kvm/
  
  KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc)
 -M:    Alexander Graf <[email protected]>
 +M:    Paul Mackerras <[email protected]>
  L:    [email protected]
  W:    http://www.linux-kvm.org/
  T:    git git://github.com/agraf/linux-2.6.git
@@@ -7640,7 -7627,8 +7642,7 @@@ F:      kernel/kexec
  
  KEYS-ENCRYPTED
  M:    Mimi Zohar <[email protected]>
 -M:    David Safford <[email protected]>
 -L:    [email protected]
 +L:    [email protected]
  L:    [email protected]
  S:    Supported
  F:    Documentation/security/keys/trusted-encrypted.rst
@@@ -7648,8 -7636,9 +7650,8 @@@ F:      include/keys/encrypted-type.
  F:    security/keys/encrypted-keys/
  
  KEYS-TRUSTED
 -M:    David Safford <[email protected]>
  M:    Mimi Zohar <[email protected]>
 -L:    linux-security-module@vger.kernel.org
 +L:    linux-integrity@vger.kernel.org
  L:    [email protected]
  S:    Supported
  F:    Documentation/security/keys/trusted-encrypted.rst
@@@ -7757,11 -7746,6 +7759,11 @@@ S:    Maintaine
  F:    Documentation/scsi/53c700.txt
  F:    drivers/scsi/53c700*
  
 +LEAKING_ADDRESSES
 +M:    Tobin C. Harding <[email protected]>
 +S:    Maintained
 +F:    scripts/leaking_addresses.pl
 +
  LED SUBSYSTEM
  M:    Richard Purdie <[email protected]>
  M:    Jacek Anaszewski <[email protected]>
  S:    Orphan
  F:    drivers/net/wireless/marvell/libertas/
  
 +MARVELL MACCHIATOBIN SUPPORT
 +M:    Russell King <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    arch/arm64/boot/dts/marvell/armada-8040-mcbin.dts
 +
  MARVELL MV643XX ETHERNET DRIVER
  M:    Sebastian Hesselbarth <[email protected]>
  L:    [email protected]
@@@ -8621,12 -8599,6 +8623,12 @@@ M:    Sean Wang <[email protected]
  S:    Maintained
  F:    drivers/media/rc/mtk-cir.c
  
 +MEDIATEK PMIC LED DRIVER
 +M:    Sean Wang <[email protected]>
 +S:    Maintained
 +F:    drivers/leds/leds-mt6323.c
 +F:    Documentation/devicetree/bindings/leds/leds-mt6323.txt
 +
  MEDIATEK ETHERNET DRIVER
  M:    Felix Fietkau <[email protected]>
  M:    John Crispin <[email protected]>
@@@ -8760,7 -8732,7 +8762,7 @@@ Q:      http://patchwork.ozlabs.org/project/
  F:    drivers/net/ethernet/mellanox/mlxsw/
  
  MELLANOX FIRMWARE FLASH LIBRARY (mlxfw)
 -M:    Yotam Gigi <[email protected]>
 +M:    [email protected]
  L:    [email protected]
  S:    Supported
  W:    http://www.mellanox.com
@@@ -8909,7 -8881,7 +8911,7 @@@ F:      Documentation/devicetree/bindings/me
  T:    git git://linuxtv.org/media_tree.git
  
  METAG ARCHITECTURE
 -M:    James Hogan <j[email protected]>
 +M:    James Hogan <j[email protected]>
  L:    [email protected]
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jhogan/metag.git
  S:    Odd Fixes
@@@ -9010,7 -8982,7 +9012,7 @@@ F:      Documentation/mips
  F:    arch/mips/
  
  MIPS BOSTON DEVELOPMENT BOARD
 -M:    Paul Burton <paul.burton@imgtec.com>
 +M:    Paul Burton <paul.burton@mips.com>
  L:    [email protected]
  S:    Maintained
  F:    Documentation/devicetree/bindings/clock/img,boston-clock.txt
@@@ -9020,7 -8992,7 +9022,7 @@@ F:      drivers/clk/imgtec/clk-boston.
  F:    include/dt-bindings/clock/boston-clock.h
  
  MIPS GENERIC PLATFORM
 -M:    Paul Burton <paul.burton@imgtec.com>
 +M:    Paul Burton <paul.burton@mips.com>
  L:    [email protected]
  S:    Supported
  F:    arch/mips/generic/
@@@ -9036,7 -9008,7 +9038,7 @@@ F:      drivers/*/*loongson1
  F:    drivers/*/*/*loongson1*
  
  MIPS RINT INSTRUCTION EMULATION
 -M:    Aleksandar Markovic <aleksandar.markovic@imgtec.com>
 +M:    Aleksandar Markovic <aleksandar.markovic@mips.com>
  L:    [email protected]
  S:    Supported
  F:    arch/mips/math-emu/sp_rint.c
@@@ -9230,6 -9202,7 +9232,6 @@@ F:      include/linux/isicom.
  MUSB MULTIPOINT HIGH SPEED DUAL-ROLE CONTROLLER
  M:    Bin Liu <[email protected]>
  L:    [email protected]
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
  S:    Maintained
  F:    drivers/usb/musb/
  
@@@ -9377,7 -9350,7 +9379,7 @@@ NETWORK BLOCK DEVICE (NBD
  M:    Josef Bacik <[email protected]>
  S:    Maintained
  L:    [email protected]
 -L:    nbd[email protected]
 +L:    nbd@other.debian.org
  F:    Documentation/blockdev/nbd.txt
  F:    drivers/block/nbd.c
  F:    include/uapi/linux/nbd.h
@@@ -10048,11 -10021,7 +10050,11 @@@ T: git git://github.com/openrisc/linux.
  L:    [email protected]
  W:    http://openrisc.io
  S:    Maintained
 +F:    Documentation/devicetree/bindings/openrisc/
 +F:    Documentation/openrisc/
  F:    arch/openrisc/
 +F:    drivers/irqchip/irq-ompic.c
 +F:    drivers/irqchip/irq-or1k-*
  
  OPENVSWITCH
  M:    Pravin Shelar <[email protected]>
@@@ -10070,7 -10039,7 +10072,7 @@@ M:   Stephen Boyd <[email protected]
  L:    [email protected]
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
 -F:    drivers/base/power/opp/
 +F:    drivers/opp/
  F:    include/linux/pm_opp.h
  F:    Documentation/power/opp.txt
  F:    Documentation/devicetree/bindings/opp/
@@@ -10200,6 -10169,7 +10202,6 @@@ F:   Documentation/parport*.tx
  
  PARAVIRT_OPS INTERFACE
  M:    Juergen Gross <[email protected]>
 -M:    Chris Wright <[email protected]>
  M:    Alok Kataria <[email protected]>
  M:    Rusty Russell <[email protected]>
  L:    [email protected]
@@@ -10357,6 -10327,7 +10359,6 @@@ F:   drivers/pci/host/vmd.
  
  PCI DRIVER FOR MICROSEMI SWITCHTEC
  M:    Kurt Schwemmer <[email protected]>
 -M:    Stephen Bates <[email protected]>
  M:    Logan Gunthorpe <[email protected]>
  L:    [email protected]
  S:    Maintained
@@@ -10421,7 -10392,6 +10423,7 @@@ F:   drivers/pci/dwc/*keystone
  
  PCI ENDPOINT SUBSYSTEM
  M:    Kishon Vijay Abraham I <[email protected]>
 +M:    Lorenzo Pieralisi <[email protected]>
  L:    [email protected]
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kishon/pci-endpoint.git
  S:    Supported
@@@ -10473,15 -10443,6 +10475,15 @@@ F: include/linux/pci
  F:    arch/x86/pci/
  F:    arch/x86/kernel/quirks.c
  
 +PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS
 +M:    Lorenzo Pieralisi <[email protected]>
 +L:    [email protected]
 +Q:    http://patchwork.ozlabs.org/project/linux-pci/list/
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/
 +S:    Supported
 +F:    drivers/pci/host/
 +F:    drivers/pci/dwc/
 +
  PCIE DRIVER FOR AXIS ARTPEC
  M:    Niklas Cassel <[email protected]>
  M:    Jesper Nilsson <[email protected]>
@@@ -10501,6 -10462,7 +10503,6 @@@ F:   drivers/pci/host/pci-thunder-
  
  PCIE DRIVER FOR HISILICON
  M:    Zhou Wang <[email protected]>
 -M:    Gabriele Paoloni <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
@@@ -10587,8 -10549,6 +10589,8 @@@ M:   Peter Zijlstra <[email protected]
  M:    Ingo Molnar <[email protected]>
  M:    Arnaldo Carvalho de Melo <[email protected]>
  R:    Alexander Shishkin <[email protected]>
 +R:    Jiri Olsa <[email protected]>
 +R:    Namhyung Kim <[email protected]>
  L:    [email protected]
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
  S:    Supported
@@@ -10712,9 -10672,10 +10714,9 @@@ S:  Maintaine
  F:    drivers/pinctrl/spear/
  
  PISTACHIO SOC SUPPORT
 -M:    James Hartley <[email protected]>
 -M:    Ionela Voinescu <[email protected]>
 +M:    James Hartley <[email protected]>
  L:    [email protected]
 -S:    Maintained
 +S:    Odd Fixes
  F:    arch/mips/pistachio/
  F:    arch/mips/include/asm/mach-pistachio/
  F:    arch/mips/boot/dts/img/pistachio*
@@@ -10918,7 -10879,7 +10920,7 @@@ S:   Maintaine
  F:    drivers/block/ps3vram.c
  
  PSAMPLE PACKET SAMPLING SUPPORT:
 -M:    Yotam Gigi <yotamg@mellanox.com>
 +M:    Yotam Gigi <yotam.gi@gmail.com>
  S:    Maintained
  F:    net/psample
  F:    include/net/psample.h
@@@ -11061,6 -11022,7 +11063,6 @@@ F:   drivers/mtd/nand/pxa3xx_nand.
  
  QAT DRIVER
  M:    Giovanni Cabiddu <[email protected]>
 -M:    Salvatore Benedetto <[email protected]>
  L:    [email protected]
  S:    Supported
  F:    drivers/crypto/qat/
  L:    [email protected]
  S:    Maintained
  F:    drivers/crypto/exynos-rng.c
 -F:    Documentation/devicetree/bindings/rng/samsung,exynos-rng4.txt
 +F:    Documentation/devicetree/bindings/crypto/samsung,exynos-rng4.txt
  
  SAMSUNG FRAMEBUFFER DRIVER
  M:    Jingoo Han <[email protected]>
  S:    Maintained
  F:    drivers/mmc/host/sdhci-spear.c
  
 +SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) TI OMAP DRIVER
 +M:    Kishon Vijay Abraham I <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    drivers/mmc/host/sdhci-omap.c
 +
  SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
  M:    Scott Bauer <[email protected]>
  M:    Jonathan Derrick <[email protected]>
- M:    Rafael Antognolli <[email protected]>
  L:    [email protected]
  S:    Supported
  F:    block/sed*
@@@ -12971,9 -12926,9 +12972,9 @@@ F:   drivers/mmc/host/dw_mmc
  SYNOPSYS HSDK RESET CONTROLLER DRIVER
  M:    Eugeniy Paltsev <[email protected]>
  S:    Supported
 -F:    drivers/reset/reset-hsdk-v1.c
 -F:    include/dt-bindings/reset/snps,hsdk-v1-reset.h
 -F:    Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
 +F:    drivers/reset/reset-hsdk.c
 +F:    include/dt-bindings/reset/snps,hsdk-reset.h
 +F:    Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt
  
  SYSTEM CONFIGURATION (SYSCON)
  M:    Lee Jones <[email protected]>
@@@ -13632,14 -13587,23 +13633,14 @@@ F:        drivers/platform/x86/toshiba-wmi.
  
  TPM DEVICE DRIVER
  M:    Peter Huewe <[email protected]>
 -M:    Marcel Selhorst <[email protected]>
  M:    Jarkko Sakkinen <[email protected]>
  R:    Jason Gunthorpe <[email protected]>
 -W:    http://tpmdd.sourceforge.net
 -L:    [email protected] (moderated for non-subscribers)
 -Q:    https://patchwork.kernel.org/project/tpmdd-devel/list/
 +L:    [email protected]
 +Q:    https://patchwork.kernel.org/project/linux-integrity/list/
  T:    git git://git.infradead.org/users/jjs/linux-tpmdd.git
  S:    Maintained
  F:    drivers/char/tpm/
  
 -TPM IBM_VTPM DEVICE DRIVER
 -M:    Ashley Lai <[email protected]>
 -W:    http://tpmdd.sourceforge.net
 -L:    [email protected] (moderated for non-subscribers)
 -S:    Maintained
 -F:    drivers/char/tpm/tpm_ibmvtpm*
 -
  TRACING
  M:    Steven Rostedt <[email protected]>
  M:    Ingo Molnar <[email protected]>
  L:    [email protected]
  S:    Supported
  F:    drivers/s390/virtio/
 +F:    arch/s390/include/uapi/asm/virtio-ccw.h
  
  VIRTIO GPU DRIVER
  M:    David Airlie <[email protected]>
diff --combined block/bio.c
index cc60213e56d8695d121cec7e7402341175e24f46,ae9ad34e6a71f206cdcc717f0ac0993c8034cc99..b94a802f8ba341894d6c4eafb0c04f87eacbe1c6
@@@ -400,7 -400,7 +400,7 @@@ static void punt_bios_to_rescuer(struc
  
  /**
   * bio_alloc_bioset - allocate a bio for I/O
-  * @gfp_mask:   the GFP_ mask given to the slab allocator
+  * @gfp_mask:   the GFP_* mask given to the slab allocator
   * @nr_iovecs:        number of iovecs to pre-allocate
   * @bs:               the bio_set to allocate from.
   *
@@@ -917,9 -917,17 +917,9 @@@ int bio_iov_iter_get_pages(struct bio *
  }
  EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
  
 -struct submit_bio_ret {
 -      struct completion event;
 -      int error;
 -};
 -
  static void submit_bio_wait_endio(struct bio *bio)
  {
 -      struct submit_bio_ret *ret = bio->bi_private;
 -
 -      ret->error = blk_status_to_errno(bio->bi_status);
 -      complete(&ret->event);
 +      complete(bio->bi_private);
  }
  
  /**
   */
  int submit_bio_wait(struct bio *bio)
  {
 -      struct submit_bio_ret ret;
 +      DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
  
 -      init_completion(&ret.event);
 -      bio->bi_private = &ret;
 +      bio->bi_private = &done;
        bio->bi_end_io = submit_bio_wait_endio;
        bio->bi_opf |= REQ_SYNC;
        submit_bio(bio);
 -      wait_for_completion_io(&ret.event);
 +      wait_for_completion_io(&done);
  
 -      return ret.error;
 +      return blk_status_to_errno(bio->bi_status);
  }
  EXPORT_SYMBOL(submit_bio_wait);
  
@@@ -1230,8 -1239,8 +1230,8 @@@ struct bio *bio_copy_user_iov(struct re
         */
        bmd->is_our_pages = map_data ? 0 : 1;
        memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs);
 -      iov_iter_init(&bmd->iter, iter->type, bmd->iov,
 -                      iter->nr_segs, iter->count);
 +      bmd->iter = *iter;
 +      bmd->iter.iov = bmd->iov;
  
        ret = -ENOMEM;
        bio = bio_kmalloc(gfp_mask, nr_pages);
@@@ -1322,7 -1331,6 +1322,7 @@@ struct bio *bio_map_user_iov(struct req
        int ret, offset;
        struct iov_iter i;
        struct iovec iov;
 +      struct bio_vec *bvec;
  
        iov_for_each(iov, i, *iter) {
                unsigned long uaddr = (unsigned long) iov.iov_base;
                ret = get_user_pages_fast(uaddr, local_nr_pages,
                                (iter->type & WRITE) != WRITE,
                                &pages[cur_page]);
 -              if (ret < local_nr_pages) {
 +              if (unlikely(ret < local_nr_pages)) {
 +                      for (j = cur_page; j < page_limit; j++) {
 +                              if (!pages[j])
 +                                      break;
 +                              put_page(pages[j]);
 +                      }
                        ret = -EFAULT;
                        goto out_unmap;
                }
                offset = offset_in_page(uaddr);
                for (j = cur_page; j < page_limit; j++) {
                        unsigned int bytes = PAGE_SIZE - offset;
 +                      unsigned short prev_bi_vcnt = bio->bi_vcnt;
  
                        if (len <= 0)
                                break;
                                            bytes)
                                break;
  
 +                      /*
 +                       * check if vector was merged with previous
 +                       * drop page reference if needed
 +                       */
 +                      if (bio->bi_vcnt == prev_bi_vcnt)
 +                              put_page(pages[j]);
 +
                        len -= bytes;
                        offset = 0;
                }
        return bio;
  
   out_unmap:
 -      for (j = 0; j < nr_pages; j++) {
 -              if (!pages[j])
 -                      break;
 -              put_page(pages[j]);
 +      bio_for_each_segment_all(bvec, bio, j) {
 +              put_page(bvec->bv_page);
        }
   out:
        kfree(pages);
@@@ -1931,11 -1928,8 +1931,8 @@@ void bioset_free(struct bio_set *bs
        if (bs->rescue_workqueue)
                destroy_workqueue(bs->rescue_workqueue);
  
-       if (bs->bio_pool)
-               mempool_destroy(bs->bio_pool);
-       if (bs->bvec_pool)
-               mempool_destroy(bs->bvec_pool);
+       mempool_destroy(bs->bio_pool);
+       mempool_destroy(bs->bvec_pool);
  
        bioset_integrity_free(bs);
        bio_put_slab(bs);
@@@ -2035,37 -2029,6 +2032,6 @@@ int bio_associate_blkcg(struct bio *bio
  }
  EXPORT_SYMBOL_GPL(bio_associate_blkcg);
  
- /**
-  * bio_associate_current - associate a bio with %current
-  * @bio: target bio
-  *
-  * Associate @bio with %current if it hasn't been associated yet.  Block
-  * layer will treat @bio as if it were issued by %current no matter which
-  * task actually issues it.
-  *
-  * This function takes an extra reference of @task's io_context and blkcg
-  * which will be put when @bio is released.  The caller must own @bio,
-  * ensure %current->io_context exists, and is responsible for synchronizing
-  * calls to this function.
-  */
- int bio_associate_current(struct bio *bio)
- {
-       struct io_context *ioc;
-       if (bio->bi_css)
-               return -EBUSY;
-       ioc = current->io_context;
-       if (!ioc)
-               return -ENOENT;
-       get_io_context_active(ioc);
-       bio->bi_ioc = ioc;
-       bio->bi_css = task_get_css(current, io_cgrp_id);
-       return 0;
- }
- EXPORT_SYMBOL_GPL(bio_associate_current);
  /**
   * bio_disassociate_task - undo bio_associate_current()
   * @bio: target bio
diff --combined block/blk-lib.c
index 63fb971d65745ac0621c69b6bc22ad5b0b76dd84,f625fda5f0955a42ae6aa6f5379f15e417ddc66f..2bc544ce3d2e5a89c2c216c85d72e00f9172bb09
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * Functions related to generic helpers functions
   */
@@@ -275,6 -274,40 +275,40 @@@ static unsigned int __blkdev_sectors_to
        return min(pages, (sector_t)BIO_MAX_PAGES);
  }
  
+ static int __blkdev_issue_zero_pages(struct block_device *bdev,
+               sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+               struct bio **biop)
+ {
+       struct request_queue *q = bdev_get_queue(bdev);
+       struct bio *bio = *biop;
+       int bi_size = 0;
+       unsigned int sz;
+       if (!q)
+               return -ENXIO;
+       while (nr_sects != 0) {
+               bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
+                              gfp_mask);
+               bio->bi_iter.bi_sector = sector;
+               bio_set_dev(bio, bdev);
+               bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+               while (nr_sects != 0) {
+                       sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
+                       bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
+                       nr_sects -= bi_size >> 9;
+                       sector += bi_size >> 9;
+                       if (bi_size < sz)
+                               break;
+               }
+               cond_resched();
+       }
+       *biop = bio;
+       return 0;
+ }
  /**
   * __blkdev_issue_zeroout - generate number of zero filed write bios
   * @bdev:     blockdev to issue
   *  Zero-fill a block range, either using hardware offload or by explicitly
   *  writing zeroes to the device.
   *
-  *  Note that this function may fail with -EOPNOTSUPP if the driver signals
-  *  zeroing offload support, but the device fails to process the command (for
-  *  some devices there is no non-destructive way to verify whether this
-  *  operation is actually supported).  In this case the caller should call
-  *  retry the call to blkdev_issue_zeroout() and the fallback path will be used.
-  *
   *  If a device is using logical block provisioning, the underlying space will
   *  not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
   *
@@@ -305,9 -332,6 +333,6 @@@ int __blkdev_issue_zeroout(struct block
                unsigned flags)
  {
        int ret;
-       int bi_size = 0;
-       struct bio *bio = *biop;
-       unsigned int sz;
        sector_t bs_mask;
  
        bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
        ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
                        biop, flags);
        if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
-               goto out;
-       ret = 0;
-       while (nr_sects != 0) {
-               bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
-                              gfp_mask);
-               bio->bi_iter.bi_sector = sector;
-               bio_set_dev(bio, bdev);
-               bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-               while (nr_sects != 0) {
-                       sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
-                       bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
-                       nr_sects -= bi_size >> 9;
-                       sector += bi_size >> 9;
-                       if (bi_size < sz)
-                               break;
-               }
-               cond_resched();
-       }
+               return ret;
  
-       *biop = bio;
- out:
-       return ret;
+       return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
+                                        biop);
  }
  EXPORT_SYMBOL(__blkdev_issue_zeroout);
  
  int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
  {
-       int ret;
-       struct bio *bio = NULL;
+       int ret = 0;
+       sector_t bs_mask;
+       struct bio *bio;
        struct blk_plug plug;
+       bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
  
+       bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+       if ((sector | nr_sects) & bs_mask)
+               return -EINVAL;
+ retry:
+       bio = NULL;
        blk_start_plug(&plug);
-       ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
-                       &bio, flags);
+       if (try_write_zeroes) {
+               ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
+                                                 gfp_mask, &bio, flags);
+       } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
+               ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
+                                               gfp_mask, &bio);
+       } else {
+               /* No zeroing offload support */
+               ret = -EOPNOTSUPP;
+       }
        if (ret == 0 && bio) {
                ret = submit_bio_wait(bio);
                bio_put(bio);
        }
        blk_finish_plug(&plug);
+       if (ret && try_write_zeroes) {
+               if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
+                       try_write_zeroes = false;
+                       goto retry;
+               }
+               if (!bdev_write_zeroes_sectors(bdev)) {
+                       /*
+                        * Zeroing offload support was indicated, but the
+                        * device reported ILLEGAL REQUEST (for some devices
+                        * there is no non-destructive way to verify whether
+                        * WRITE ZEROES is actually supported).
+                        */
+                       ret = -EOPNOTSUPP;
+               }
+       }
  
        return ret;
  }
diff --combined block/blk-mq-debugfs.c
index de294d775acfa413854c109eeaa08b9b6bdfd354,e4f2bb936e6630c6f529d9be97ccd27cc828b682..b56a4f35720d8a46e8a5daf3f2c63a51475308a3
@@@ -54,7 -54,6 +54,6 @@@ static const char *const blk_queue_flag
        QUEUE_FLAG_NAME(NOMERGES),
        QUEUE_FLAG_NAME(SAME_COMP),
        QUEUE_FLAG_NAME(FAIL_IO),
-       QUEUE_FLAG_NAME(STACKABLE),
        QUEUE_FLAG_NAME(NONROT),
        QUEUE_FLAG_NAME(IO_STAT),
        QUEUE_FLAG_NAME(DISCARD),
@@@ -75,6 -74,7 +74,7 @@@
        QUEUE_FLAG_NAME(REGISTERED),
        QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
        QUEUE_FLAG_NAME(QUIESCED),
+       QUEUE_FLAG_NAME(PREEMPT_ONLY),
  };
  #undef QUEUE_FLAG_NAME
  
@@@ -180,7 -180,6 +180,6 @@@ static const char *const hctx_state_nam
        HCTX_STATE_NAME(STOPPED),
        HCTX_STATE_NAME(TAG_ACTIVE),
        HCTX_STATE_NAME(SCHED_RESTART),
-       HCTX_STATE_NAME(TAG_WAITING),
        HCTX_STATE_NAME(START_ON_RUN),
  };
  #undef HCTX_STATE_NAME
@@@ -815,14 -814,10 +814,14 @@@ int blk_mq_debugfs_register(struct requ
                goto err;
  
        /*
 -       * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir
 +       * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
         * didn't exist yet (because we don't know what to name the directory
         * until the queue is registered to a gendisk).
         */
 +      if (q->elevator && !q->sched_debugfs_dir)
 +              blk_mq_debugfs_register_sched(q);
 +
 +      /* Similarly, blk_mq_init_hctx() couldn't do this previously. */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx))
                        goto err;
diff --combined block/blk-mq-tag.h
index c190165d92ea3fda4a24efca107be007f9787051,5932a7ac7fc4e9f98d417d303e80de18ae8c17e5..61deab0b5a5a565c1214ad305cac52f0dde7fb3d
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef INT_BLK_MQ_TAG_H
  #define INT_BLK_MQ_TAG_H
  
@@@ -44,14 -43,9 +44,9 @@@ static inline struct sbq_wait_state *bt
        return sbq_wait_ptr(bt, &hctx->wait_index);
  }
  
- enum {
-       BLK_MQ_TAG_CACHE_MIN    = 1,
-       BLK_MQ_TAG_CACHE_MAX    = 64,
- };
  enum {
        BLK_MQ_TAG_FAIL         = -1U,
-       BLK_MQ_TAG_MIN          = BLK_MQ_TAG_CACHE_MIN,
+       BLK_MQ_TAG_MIN          = 1,
        BLK_MQ_TAG_MAX          = BLK_MQ_TAG_FAIL - 1,
  };
  
diff --combined block/blk-mq.h
index 4933af9d61f736ed1b99630231b6988071b1e6d2,dcf379a892dda3c349884c4041a213a4bd1d8bd4..6c7c3ff5bf627d3e36a8e1bf1feca66ff00ac74d
@@@ -1,8 -1,8 +1,9 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef INT_BLK_MQ_H
  #define INT_BLK_MQ_H
  
  #include "blk-stat.h"
+ #include "blk-mq-tag.h"
  
  struct blk_mq_tag_set;
  
@@@ -26,16 -26,16 +27,16 @@@ struct blk_mq_ctx 
        struct kobject          kobj;
  } ____cacheline_aligned_in_smp;
  
- void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
  void blk_mq_freeze_queue(struct request_queue *q);
  void blk_mq_free_queue(struct request_queue *q);
  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
  void blk_mq_wake_waiters(struct request_queue *q);
- bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
+ bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
  void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
- bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
  bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                                bool wait);
+ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+                                       struct blk_mq_ctx *start);
  
  /*
   * Internal helpers for allocating/freeing the request map
@@@ -55,7 -55,7 +56,7 @@@ int blk_mq_alloc_rqs(struct blk_mq_tag_
   */
  void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                                bool at_head);
- void blk_mq_request_bypass_insert(struct request *rq);
+ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
  void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                                struct list_head *list);
  
@@@ -109,7 -109,7 +110,7 @@@ static inline void blk_mq_put_ctx(struc
  struct blk_mq_alloc_data {
        /* input parameter */
        struct request_queue *q;
-       unsigned int flags;
+       blk_mq_req_flags_t flags;
        unsigned int shallow_depth;
  
        /* input & output parameter */
@@@ -138,4 -138,53 +139,53 @@@ static inline bool blk_mq_hw_queue_mapp
  void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
                        unsigned int inflight[2]);
  
+ static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+ {
+       struct request_queue *q = hctx->queue;
+       if (q->mq_ops->put_budget)
+               q->mq_ops->put_budget(hctx);
+ }
+ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+ {
+       struct request_queue *q = hctx->queue;
+       if (q->mq_ops->get_budget)
+               return q->mq_ops->get_budget(hctx);
+       return true;
+ }
+ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+                                          struct request *rq)
+ {
+       blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+       rq->tag = -1;
+       if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+               rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+               atomic_dec(&hctx->nr_active);
+       }
+ }
+ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
+                                      struct request *rq)
+ {
+       if (rq->tag == -1 || rq->internal_tag == -1)
+               return;
+       __blk_mq_put_driver_tag(hctx, rq);
+ }
+ static inline void blk_mq_put_driver_tag(struct request *rq)
+ {
+       struct blk_mq_hw_ctx *hctx;
+       if (rq->tag == -1 || rq->internal_tag == -1)
+               return;
+       hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+       __blk_mq_put_driver_tag(hctx, rq);
+ }
  #endif
diff --combined block/blk-throttle.c
index 8631763866c6d973225cd643f3d422c9c6a21845,fe49c465ec8604baf0da4ec9ecf078fb8fbd26f3..96ad32623427d4794ad7563369bc9f89bb85fd26
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * Interface for controlling IO bandwidth on a request queue
   *
@@@ -1912,11 -1911,11 +1912,11 @@@ static void throtl_upgrade_state(struc
  
                tg->disptime = jiffies - 1;
                throtl_select_dispatch(sq);
 -              throtl_schedule_next_dispatch(sq, false);
 +              throtl_schedule_next_dispatch(sq, true);
        }
        rcu_read_unlock();
        throtl_select_dispatch(&td->service_queue);
 -      throtl_schedule_next_dispatch(&td->service_queue, false);
 +      throtl_schedule_next_dispatch(&td->service_queue, true);
        queue_work(kthrotld_workqueue, &td->dispatch_work);
  }
  
@@@ -2113,8 -2112,12 +2113,12 @@@ static inline void throtl_update_latenc
  static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
  {
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       if (bio->bi_css)
+       if (bio->bi_css) {
+               if (bio->bi_cg_private)
+                       blkg_put(tg_to_blkg(bio->bi_cg_private));
                bio->bi_cg_private = tg;
+               blkg_get(tg_to_blkg(tg));
+       }
        blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
  #endif
  }
@@@ -2284,8 -2287,10 +2288,10 @@@ void blk_throtl_bio_endio(struct bio *b
  
        start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
        finish_time = __blk_stat_time(finish_time_ns) >> 10;
-       if (!start_time || finish_time <= start_time)
+       if (!start_time || finish_time <= start_time) {
+               blkg_put(tg_to_blkg(tg));
                return;
+       }
  
        lat = finish_time - start_time;
        /* this is only for bio based driver */
                tg->bio_cnt /= 2;
                tg->bad_bio_cnt /= 2;
        }
+       blkg_put(tg_to_blkg(tg));
  }
  #endif
  
diff --combined block/blk-wbt.c
index d822530e6aeade81a7c9b2b3d9c6a6cccc0bb351,e59d59c11ebbb308fe2cc41562fedfd90eecfe6c..b252da0e4c11051f7c78be797122448e231a0bde
@@@ -261,7 -261,7 +261,7 @@@ static inline bool stat_sample_valid(st
  
  static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
  {
 -      u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
 +      u64 now, issue = READ_ONCE(rwb->sync_issue);
  
        if (!issue || !rwb->sync_cookie)
                return 0;
@@@ -654,7 -654,7 +654,7 @@@ void wbt_set_write_cache(struct rq_wb *
  }
  
  /*
-  * Disable wbt, if enabled by default. Only called from CFQ.
+  * Disable wbt, if enabled by default.
   */
  void wbt_disable_default(struct request_queue *q)
  {
diff --combined block/blk.h
index 85be8b232b373b3f69ac829f374e550f06725460,6ac43dfd68a7d489b295b5416fa1723b6e1cd7e8..3f1446937aece26f38ceb66cf4e3d159a23df871
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef BLK_INTERNAL_H
  #define BLK_INTERNAL_H
  
@@@ -123,8 -122,15 +123,15 @@@ void blk_account_io_done(struct reques
   * Internal atomic flags for request handling
   */
  enum rq_atomic_flags {
+       /*
+        * Keep these two bits first - not because we depend on the
+        * value of them, but we do depend on them being in the same
+        * byte of storage to ensure ordering on writes. Keeping them
+        * first will achieve that nicely.
+        */
        REQ_ATOM_COMPLETE = 0,
        REQ_ATOM_STARTED,
        REQ_ATOM_POLL_SLEPT,
  };
  
@@@ -149,45 -155,6 +156,6 @@@ static inline void blk_clear_rq_complet
  
  void blk_insert_flush(struct request *rq);
  
- static inline struct request *__elv_next_request(struct request_queue *q)
- {
-       struct request *rq;
-       struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
-       WARN_ON_ONCE(q->mq_ops);
-       while (1) {
-               if (!list_empty(&q->queue_head)) {
-                       rq = list_entry_rq(q->queue_head.next);
-                       return rq;
-               }
-               /*
-                * Flush request is running and flush request isn't queueable
-                * in the drive, we can hold the queue till flush request is
-                * finished. Even we don't do this, driver can't dispatch next
-                * requests and will requeue them. And this can improve
-                * throughput too. For example, we have request flush1, write1,
-                * flush 2. flush1 is dispatched, then queue is hold, write1
-                * isn't inserted to queue. After flush1 is finished, flush2
-                * will be dispatched. Since disk cache is already clean,
-                * flush2 will be finished very soon, so looks like flush2 is
-                * folded to flush1.
-                * Since the queue is hold, a flag is set to indicate the queue
-                * should be restarted later. Please see flush_end_io() for
-                * details.
-                */
-               if (fq->flush_pending_idx != fq->flush_running_idx &&
-                               !queue_flush_queueable(q)) {
-                       fq->flush_queue_delayed = 1;
-                       return NULL;
-               }
-               if (unlikely(blk_queue_bypass(q)) ||
-                   !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
-                       return NULL;
-       }
- }
  static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
  {
        struct elevator_queue *e = q->elevator;
diff --combined block/genhd.c
index 630c0da6cfcf2633bf8340616b6ba1d013f5f408,997e598f3b86e92864fbb1694a01c6bda66e80e5..c2223f12a8051411d4e89a0bc2850e03d7d0427e
@@@ -588,6 -588,11 +588,11 @@@ static void register_disk(struct devic
        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
  
+       if (disk->flags & GENHD_FL_HIDDEN) {
+               dev_set_uevent_suppress(ddev, 0);
+               return;
+       }
        /* No minors to use for partitions */
        if (!disk_part_scan_enabled(disk))
                goto exit;
@@@ -616,6 -621,11 +621,11 @@@ exit
        while ((part = disk_part_iter_next(&piter)))
                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
        disk_part_iter_exit(&piter);
+       err = sysfs_create_link(&ddev->kobj,
+                               &disk->queue->backing_dev_info->dev->kobj,
+                               "bdi");
+       WARN_ON(err);
  }
  
  /**
   */
  void device_add_disk(struct device *parent, struct gendisk *disk)
  {
-       struct backing_dev_info *bdi;
        dev_t devt;
        int retval;
  
         * parameters make sense.
         */
        WARN_ON(disk->minors && !(disk->major || disk->first_minor));
-       WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
+       WARN_ON(!disk->minors &&
+               !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
  
        disk->flags |= GENHD_FL_UP;
  
                WARN_ON(1);
                return;
        }
-       disk_to_dev(disk)->devt = devt;
-       /* ->major and ->first_minor aren't supposed to be
-        * dereferenced from here on, but set them just in case.
-        */
        disk->major = MAJOR(devt);
        disk->first_minor = MINOR(devt);
  
        disk_alloc_events(disk);
  
-       /* Register BDI before referencing it from bdev */
-       bdi = disk->queue->backing_dev_info;
-       bdi_register_owner(bdi, disk_to_dev(disk));
-       blk_register_region(disk_devt(disk), disk->minors, NULL,
-                           exact_match, exact_lock, disk);
+       if (disk->flags & GENHD_FL_HIDDEN) {
+               /*
+                * Don't let hidden disks show up in /proc/partitions,
+                * and don't bother scanning for partitions either.
+                */
+               disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+               disk->flags |= GENHD_FL_NO_PART_SCAN;
+       } else {
+               /* Register BDI before referencing it from bdev */
+               disk_to_dev(disk)->devt = devt;
+               bdi_register_owner(disk->queue->backing_dev_info,
+                               disk_to_dev(disk));
+               blk_register_region(disk_devt(disk), disk->minors, NULL,
+                                   exact_match, exact_lock, disk);
+       }
        register_disk(parent, disk);
        blk_register_queue(disk);
  
         */
        WARN_ON_ONCE(!blk_get_queue(disk->queue));
  
-       retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
-                                  "bdi");
-       WARN_ON(retval);
        disk_add_events(disk);
        blk_integrity_add(disk);
  }
@@@ -705,7 -715,8 +715,8 @@@ void del_gendisk(struct gendisk *disk
        set_capacity(disk, 0);
        disk->flags &= ~GENHD_FL_UP;
  
-       sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
+       if (!(disk->flags & GENHD_FL_HIDDEN))
+               sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
        if (disk->queue) {
                /*
                 * Unregister bdi before releasing device numbers (as they can
        } else {
                WARN_ON(1);
        }
-       blk_unregister_region(disk_devt(disk), disk->minors);
  
-       part_stat_set_all(&disk->part0, 0);
-       disk->part0.stamp = 0;
+       if (!(disk->flags & GENHD_FL_HIDDEN))
+               blk_unregister_region(disk_devt(disk), disk->minors);
  
        kobject_put(disk->part0.holder_dir);
        kobject_put(disk->slave_dir);
+       part_stat_set_all(&disk->part0, 0);
+       disk->part0.stamp = 0;
        if (!sysfs_deprecated)
                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
        pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@@ -785,6 -798,10 +798,10 @@@ struct gendisk *get_gendisk(dev_t devt
                spin_unlock_bh(&ext_devt_lock);
        }
  
+       if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) {
+               put_disk(disk);
+               disk = NULL;
+       }
        return disk;
  }
  EXPORT_SYMBOL(get_gendisk);
@@@ -1028,6 -1045,15 +1045,15 @@@ static ssize_t disk_removable_show(stru
                       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
  }
  
+ static ssize_t disk_hidden_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+ {
+       struct gendisk *disk = dev_to_disk(dev);
+       return sprintf(buf, "%d\n",
+                      (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
+ }
  static ssize_t disk_ro_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
  {
@@@ -1065,6 -1091,7 +1091,7 @@@ static ssize_t disk_discard_alignment_s
  static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
  static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
  static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
+ static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL);
  static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
  static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
  static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
@@@ -1089,6 -1116,7 +1116,7 @@@ static struct attribute *disk_attrs[] 
        &dev_attr_range.attr,
        &dev_attr_ext_range.attr,
        &dev_attr_removable.attr,
+       &dev_attr_hidden.attr,
        &dev_attr_ro.attr,
        &dev_attr_size.attr,
        &dev_attr_alignment_offset.attr,
@@@ -1354,7 -1382,13 +1382,7 @@@ dev_t blk_lookup_devt(const char *name
  }
  EXPORT_SYMBOL(blk_lookup_devt);
  
 -struct gendisk *alloc_disk(int minors)
 -{
 -      return alloc_disk_node(minors, NUMA_NO_NODE);
 -}
 -EXPORT_SYMBOL(alloc_disk);
 -
 -struct gendisk *alloc_disk_node(int minors, int node_id)
 +struct gendisk *__alloc_disk_node(int minors, int node_id)
  {
        struct gendisk *disk;
        struct disk_part_tbl *ptbl;
        }
        return disk;
  }
 -EXPORT_SYMBOL(alloc_disk_node);
 +EXPORT_SYMBOL(__alloc_disk_node);
  
  struct kobject *get_disk(struct gendisk *disk)
  {
diff --combined drivers/block/Kconfig
index 7b2df7a54d8759f45e5467bb31a31a94500f7651,95e678716b5e9e80b04fd7eedc409a1fe7f5cb0e..923b417eaf4c9939aec7cc48681bcc797ec10128
@@@ -1,4 -1,3 +1,4 @@@
 +# SPDX-License-Identifier: GPL-2.0
  #
  # Block device driver configuration
  #
@@@ -18,7 -17,7 +18,7 @@@ if BLK_DE
  
  config BLK_DEV_NULL_BLK
        tristate "Null test block driver"
 -      depends on CONFIGFS_FS
 +      select CONFIGFS_FS
  
  config BLK_DEV_FD
        tristate "Normal floppy disk support"
@@@ -68,9 -67,13 +68,13 @@@ config AMIGA_Z2RA
          To compile this driver as a module, choose M here: the
          module will be called z2ram.
  
+ config CDROM
+       tristate
  config GDROM
        tristate "SEGA Dreamcast GD-ROM drive"
        depends on SH_DREAMCAST
+       select CDROM
        select BLK_SCSI_REQUEST # only for the generic cdrom code
        help
          A standard SEGA Dreamcast comes with a modified CD ROM drive called a
@@@ -348,6 -351,7 +352,7 @@@ config BLK_DEV_RAM_DA
  config CDROM_PKTCDVD
        tristate "Packet writing on CD/DVD media (DEPRECATED)"
        depends on !UML
+       select CDROM
        select BLK_SCSI_REQUEST
        help
          Note: This driver is deprecated and will be removed from the
diff --combined drivers/block/nbd.c
index 9adfb5445f8dca5a88a4ffe59d3573ed5b854e02,95cab69d9c8be14a27206a96dc451f41add31cda..5f2a4240a204d54fc6fe87e569dc6165d5190530
@@@ -243,6 -243,7 +243,6 @@@ static void nbd_size_set(struct nbd_dev
        struct nbd_config *config = nbd->config;
        config->blksize = blocksize;
        config->bytesize = blocksize * nr_blocks;
 -      nbd_size_update(nbd);
  }
  
  static void nbd_complete_rq(struct request *req)
@@@ -288,15 -289,6 +288,6 @@@ static enum blk_eh_timer_return nbd_xmi
                cmd->status = BLK_STS_TIMEOUT;
                return BLK_EH_HANDLED;
        }
-       /* If we are waiting on our dead timer then we could get timeout
-        * callbacks for our request.  For this we just want to reset the timer
-        * and let the queue side take care of everything.
-        */
-       if (!completion_done(&cmd->send_complete)) {
-               nbd_config_put(nbd);
-               return BLK_EH_RESET_TIMER;
-       }
        config = nbd->config;
  
        if (config->num_connections > 1) {
@@@ -386,15 -378,6 +377,15 @@@ static int sock_xmit(struct nbd_device 
        return result;
  }
  
 +/*
 + * Different settings for sk->sk_sndtimeo can result in different return values
 + * if there is a signal pending when we enter sendmsg, because reasons?
 + */
 +static inline int was_interrupted(int result)
 +{
 +      return result == -ERESTARTSYS || result == -EINTR;
 +}
 +
  /* always call with the tx_lock held */
  static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
  {
        result = sock_xmit(nbd, index, 1, &from,
                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
        if (result <= 0) {
 -              if (result == -ERESTARTSYS) {
 +              if (was_interrupted(result)) {
                        /* If we havne't sent anything we can just return BUSY,
                         * however if we have sent something we need to make
                         * sure we only allow this req to be sent until we are
@@@ -511,7 -494,7 +502,7 @@@ send_pages
                        }
                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
                        if (result <= 0) {
 -                              if (result == -ERESTARTSYS) {
 +                              if (was_interrupted(result)) {
                                        /* We've already sent the header, we
                                         * have no choice but to set pending and
                                         * return BUSY.
@@@ -723,9 -706,9 +714,9 @@@ static int wait_for_reconnect(struct nb
                return 0;
        if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
                return 0;
-       wait_event_interruptible_timeout(config->conn_wait,
-                                        atomic_read(&config->live_connections),
-                                        config->dead_conn_timeout);
+       wait_event_timeout(config->conn_wait,
+                          atomic_read(&config->live_connections),
+                          config->dead_conn_timeout);
        return atomic_read(&config->live_connections);
  }
  
@@@ -740,6 -723,7 +731,7 @@@ static int nbd_handle_cmd(struct nbd_cm
        if (!refcount_inc_not_zero(&nbd->config_refs)) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Socks array is empty\n");
+               blk_mq_start_request(req);
                return -EINVAL;
        }
        config = nbd->config;
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Attempted send on invalid socket\n");
                nbd_config_put(nbd);
+               blk_mq_start_request(req);
                return -EINVAL;
        }
        cmd->status = BLK_STS_OK;
@@@ -771,6 -756,7 +764,7 @@@ again
                         */
                        sock_shutdown(nbd);
                        nbd_config_put(nbd);
+                       blk_mq_start_request(req);
                        return -EIO;
                }
                goto again;
         * here so that it gets put _after_ the request that is already on the
         * dispatch list.
         */
+       blk_mq_start_request(req);
        if (unlikely(nsock->pending && nsock->pending != req)) {
                blk_mq_requeue_request(req, true);
                ret = 0;
        ret = nbd_send_cmd(nbd, cmd, index);
        if (ret == -EAGAIN) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
-                                   "Request send failed trying another connection\n");
+                                   "Request send failed, requeueing\n");
                nbd_mark_nsock_dead(nbd, nsock, 1);
-               mutex_unlock(&nsock->tx_lock);
-               goto again;
+               blk_mq_requeue_request(req, true);
+               ret = 0;
        }
  out:
        mutex_unlock(&nsock->tx_lock);
@@@ -820,7 -807,6 +815,6 @@@ static blk_status_t nbd_queue_rq(struc
         * done sending everything over the wire.
         */
        init_completion(&cmd->send_complete);
-       blk_mq_start_request(bd->rq);
  
        /* We can be called directly from the user space process, which means we
         * could possibly have signals pending so our sendmsg will fail.  In
         * appropriate.
         */
        ret = nbd_handle_cmd(cmd, hctx->queue_num);
 +      if (ret < 0)
 +              ret = BLK_STS_IOERR;
 +      else if (!ret)
 +              ret = BLK_STS_OK;
        complete(&cmd->send_complete);
  
 -      return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK;
 +      return ret;
  }
  
  static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
@@@ -1102,7 -1084,6 +1096,7 @@@ static int nbd_start_device(struct nbd_
                args->index = i;
                queue_work(recv_workqueue, &args->work);
        }
 +      nbd_size_update(nbd);
        return error;
  }
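The nbd hunks above move blk_mq_start_request() out of nbd_queue_rq() and into nbd_handle_cmd(), so a request is marked started right before the driver can complete, requeue, or fail it, and every early-error return starts it first. A minimal sketch of that ordering against the stock blk-mq API follows; the demo_* names are hypothetical stand-ins, not nbd code.

#include <linux/blk-mq.h>

/* Hypothetical stand-ins for driver-specific state and transport. */
static bool demo_resource_unavailable(struct blk_mq_hw_ctx *hctx)
{
        return false;
}

static int demo_send(struct request *req)
{
        return 0;
}

static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
                                  const struct blk_mq_queue_data *bd)
{
        struct request *req = bd->rq;

        /* Start the request before any path that completes, requeues, or
         * errors it; otherwise the timeout machinery never sees it. */
        blk_mq_start_request(req);

        if (demo_resource_unavailable(hctx)) {
                blk_mq_requeue_request(req, true);      /* try again later */
                return BLK_STS_OK;
        }

        if (demo_send(req) < 0)
                return BLK_STS_IOERR;

        return BLK_STS_OK;
}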
  
diff --combined drivers/block/null_blk.c
index cda69dbefe3ba52b78b61538773191cae4a1e8b5,50c83c4b2ea02ea57b11f931dc2088b771805250..c61960deb74aac4277d7994f8020fc8f7a65d3bd
@@@ -154,6 -154,10 +154,10 @@@ enum 
        NULL_Q_MQ               = 2,
  };
  
+ static int g_no_sched;
+ module_param_named(no_sched, g_no_sched, int, S_IRUGO);
+ MODULE_PARM_DESC(no_sched, "No io scheduler");
  static int g_submit_queues = 1;
  module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
  MODULE_PARM_DESC(submit_queues, "Number of submission queues");
@@@ -476,7 -480,7 +480,7 @@@ static struct configfs_item_operations 
        .release        = nullb_device_release,
  };
  
 -static struct config_item_type nullb_device_type = {
 +static const struct config_item_type nullb_device_type = {
        .ct_item_ops    = &nullb_device_ops,
        .ct_attrs       = nullb_device_attrs,
        .ct_owner       = THIS_MODULE,
@@@ -528,7 -532,7 +532,7 @@@ static struct configfs_group_operation
        .drop_item      = nullb_group_drop_item,
  };
  
 -static struct config_item_type nullb_group_type = {
 +static const struct config_item_type nullb_group_type = {
        .ct_group_ops   = &nullb_group_ops,
        .ct_attrs       = nullb_group_attrs,
        .ct_owner       = THIS_MODULE,
@@@ -1754,6 -1758,8 +1758,8 @@@ static int null_init_tag_set(struct nul
        set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
        set->cmd_size   = sizeof(struct nullb_cmd);
        set->flags = BLK_MQ_F_SHOULD_MERGE;
+       if (g_no_sched)
+               set->flags |= BLK_MQ_F_NO_SCHED;
        set->driver_data = NULL;
  
        if ((nullb && nullb->dev->blocking) || g_blocking)
@@@ -1985,8 -1991,10 +1991,10 @@@ static int __init null_init(void
  
        for (i = 0; i < nr_devices; i++) {
                dev = null_alloc_dev();
-               if (!dev)
+               if (!dev) {
+                       ret = -ENOMEM;
                        goto err_dev;
+               }
                ret = null_add_dev(dev);
                if (ret) {
                        null_free_dev(dev);
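The no_sched parameter added above maps straight onto BLK_MQ_F_NO_SCHED on the tag set, so the nullb devices come up without an I/O scheduler attached. A minimal sketch of a tag set configured the same way; demo_mq_ops, the queue depth, and demo_init_tag_set() are placeholders rather than null_blk's actual values.

#include <linux/blk-mq.h>

static struct blk_mq_ops demo_mq_ops;   /* a real driver must supply .queue_rq */

static int demo_init_tag_set(struct blk_mq_tag_set *set, bool no_sched)
{
        set->ops = &demo_mq_ops;
        set->nr_hw_queues = 1;
        set->queue_depth = 64;
        set->flags = BLK_MQ_F_SHOULD_MERGE;
        if (no_sched)
                set->flags |= BLK_MQ_F_NO_SCHED;  /* no elevator for these queues */
        return blk_mq_alloc_tag_set(set);
}

With the module parameter above, loading the driver as "modprobe null_blk no_sched=1" should leave /sys/block/nullb0/queue/scheduler reporting none.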
index b226835a909a3106f1adf2f6a5e6614596caae51,1d5057f5080ba124bac9f04330abdfde573b45a3..f8bd6ef3605a5be238ab486318b11df3c8b41d7c
@@@ -1,4 -1,3 +1,4 @@@
 +# SPDX-License-Identifier: GPL-2.0
  #
  # PARIDE configuration
  #
@@@ -26,6 -25,7 +26,7 @@@ config PARIDE_P
  config PARIDE_PCD
        tristate "Parallel port ATAPI CD-ROMs"
        depends on PARIDE
+       select CDROM
        select BLK_SCSI_REQUEST # only for the generic cdrom code
        ---help---
          This option enables the high-level driver for ATAPI CD-ROM devices
diff --combined drivers/block/skd_main.c
index 64d0fc17c1742ab74aa232da503d08e344b594b2,802ab9f7a8c11f5a50a9d192ffb5e17838475286..2819f23e8bf2fe8f18cac9ccae63a8e298adc73a
@@@ -1967,7 -1967,8 +1967,8 @@@ static void skd_isr_msg_from_dev(struc
                break;
  
        case FIT_MTD_CMD_LOG_HOST_ID:
-               skdev->connect_time_stamp = get_seconds();
+               /* hardware interface overflows in y2106 */
+               skdev->connect_time_stamp = (u32)ktime_get_real_seconds();
                data = skdev->connect_time_stamp & 0xFFFF;
                mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
                SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
@@@ -2604,7 -2605,7 +2605,7 @@@ static void *skd_alloc_dma(struct skd_d
                return NULL;
        *dma_handle = dma_map_single(dev, buf, s->size, dir);
        if (dma_mapping_error(dev, *dma_handle)) {
 -              kfree(buf);
 +              kmem_cache_free(s, buf);
                buf = NULL;
        }
        return buf;
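The skd_alloc_dma() fix above pairs the free with the allocator that was actually used: the buffer comes from a kmem_cache, and handing it to kfree() is not guaranteed to be valid. The pairing in isolation, with hypothetical names:

#include <linux/slab.h>

/* demo_cache would be created elsewhere with kmem_cache_create(). */
static struct kmem_cache *demo_cache;

static void *demo_alloc(void)
{
        return kmem_cache_alloc(demo_cache, GFP_KERNEL);
}

static void demo_free(void *buf)
{
        kmem_cache_free(demo_cache, buf);       /* not kfree(buf) */
}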
diff --combined drivers/cdrom/Makefile
index a95566ff47d30043994642901b6341cc8bb78ec1,7f3f43cc22574f516f6b962c8fa281fdb9c6d46e..0f3664b45f485821bdf60dfe34d27d15d4fa4e5c
@@@ -1,14 -1,2 +1,3 @@@
- # Makefile for the kernel cdrom device drivers.
- #
- # 30 Jan 1998, Michael Elizabeth Chastain, <mailto:[email protected]>
- # Rewritten to use lists instead of if-statements.
- # Each configuration option enables a list of files.
- obj-$(CONFIG_BLK_DEV_IDECD)   +=              cdrom.o
- obj-$(CONFIG_BLK_DEV_SR)      +=              cdrom.o
- obj-$(CONFIG_PARIDE_PCD)      +=              cdrom.o
- obj-$(CONFIG_CDROM_PKTCDVD)   +=              cdrom.o
- obj-$(CONFIG_GDROM)           += gdrom.o      cdrom.o
 +# SPDX-License-Identifier: GPL-2.0
+ obj-$(CONFIG_CDROM)   += cdrom.o
+ obj-$(CONFIG_GDROM)   += gdrom.o
diff --combined drivers/ide/ide-pm.c
index dccdca9eda38692fcbd0764908d9e532a26a7ff0,f56d742908df65a45816a9bb9dfde0f380742ec9..ad8a125defdd51133bada4a30d16f890e5223e31
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  #include <linux/kernel.h>
  #include <linux/gfp.h>
  #include <linux/ide.h>
@@@ -90,9 -89,9 +90,9 @@@ int generic_ide_resume(struct device *d
        }
  
        memset(&rqpm, 0, sizeof(rqpm));
-       rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+       rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN,
+                                  BLK_MQ_REQ_PREEMPT);
        ide_req(rq)->type = ATA_PRIV_PM_RESUME;
-       rq->rq_flags |= RQF_PREEMPT;
        rq->special = &rqpm;
        rqpm.pm_step = IDE_PM_START_RESUME;
        rqpm.pm_state = PM_EVENT_ON;
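The generic_ide_resume() change above passes the preempt intent at allocation time instead of flagging the request afterwards; as far as this series goes, that is what allows the allocation to succeed while the queue only admits preempt requests around suspend/resume. A sketch of the call pattern (demo_alloc_pm_request() is hypothetical):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static struct request *demo_alloc_pm_request(struct request_queue *q)
{
        /* BLK_MQ_REQ_PREEMPT at allocation time replaces setting
         * RQF_PREEMPT after blk_get_request() has already returned. */
        return blk_get_request_flags(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
}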
index 08035634795c12d26730375abc4ac8b7c399c5ad,8c5a626343d4da98f26ae503fcf4af9bd7777d61..a27d85232ce1343ce802576ab3584ed67be6f35d
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * Primary bucket allocation code
   *
@@@ -407,7 -406,8 +407,8 @@@ long bch_bucket_alloc(struct cache *ca
  
        finish_wait(&ca->set->bucket_wait, &w);
  out:
-       wake_up_process(ca->alloc_thread);
+       if (ca->alloc_thread)
+               wake_up_process(ca->alloc_thread);
  
        trace_bcache_alloc(ca, reserve);
  
                b->prio = INITIAL_PRIO;
        }
  
+       if (ca->set->avail_nbuckets > 0) {
+               ca->set->avail_nbuckets--;
+               bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+       }
        return r;
  }
  
@@@ -449,6 -454,11 +455,11 @@@ void __bch_bucket_free(struct cache *ca
  {
        SET_GC_MARK(b, 0);
        SET_GC_SECTORS_USED(b, 0);
+       if (ca->set->avail_nbuckets < ca->set->nbuckets) {
+               ca->set->avail_nbuckets++;
+               bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+       }
  }
  
  void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@@ -601,7 -611,7 +612,7 @@@ bool bch_alloc_sectors(struct cache_se
  
        /*
         * If we had to allocate, we might race and not need to allocate the
-        * second time we call find_data_bucket(). If we allocated a bucket but
+        * second time we call pick_data_bucket(). If we allocated a bucket but
         * didn't use it, drop the refcount bch_bucket_alloc_set() took:
         */
        if (KEY_PTRS(&alloc.key))
index abd31e847f967203e9f702cb4ca6e1772f6a7d46,e274082330dcd8b1e564576c2bedc581c830d650..843877e017e1afa51138ede3601ec0c55daf9e8e
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _BCACHE_H
  #define _BCACHE_H
  
  #include <linux/mutex.h>
  #include <linux/rbtree.h>
  #include <linux/rwsem.h>
+ #include <linux/refcount.h>
  #include <linux/types.h>
  #include <linux/workqueue.h>
  
@@@ -266,9 -266,6 +267,6 @@@ struct bcache_device 
        atomic_t                *stripe_sectors_dirty;
        unsigned long           *full_dirty_stripes;
  
-       unsigned long           sectors_dirty_last;
-       long                    sectors_dirty_derivative;
        struct bio_set          *bio_split;
  
        unsigned                data_csum:1;
@@@ -300,7 -297,7 +298,7 @@@ struct cached_dev 
        struct semaphore        sb_write_mutex;
  
        /* Refcount on the cache set. Always nonzero when we're caching. */
-       atomic_t                count;
+       refcount_t              count;
        struct work_struct      detach;
  
        /*
  
        uint64_t                writeback_rate_target;
        int64_t                 writeback_rate_proportional;
-       int64_t                 writeback_rate_derivative;
-       int64_t                 writeback_rate_change;
+       int64_t                 writeback_rate_integral;
+       int64_t                 writeback_rate_integral_scaled;
+       int32_t                 writeback_rate_change;
  
        unsigned                writeback_rate_update_seconds;
-       unsigned                writeback_rate_d_term;
+       unsigned                writeback_rate_i_term_inverse;
        unsigned                writeback_rate_p_term_inverse;
+       unsigned                writeback_rate_minimum;
  };
  
  enum alloc_reserve {
@@@ -582,6 -581,7 +582,7 @@@ struct cache_set 
        uint8_t                 need_gc;
        struct gc_stat          gc_stats;
        size_t                  nbuckets;
+       size_t                  avail_nbuckets;
  
        struct task_struct      *gc_thread;
        /* Where in the btree gc currently is */
@@@ -807,13 -807,13 +808,13 @@@ do {                                                                    
  
  static inline void cached_dev_put(struct cached_dev *dc)
  {
-       if (atomic_dec_and_test(&dc->count))
+       if (refcount_dec_and_test(&dc->count))
                schedule_work(&dc->detach);
  }
  
  static inline bool cached_dev_get(struct cached_dev *dc)
  {
-       if (!atomic_inc_not_zero(&dc->count))
+       if (!refcount_inc_not_zero(&dc->count))
                return false;
  
        /* Paired with the mb in cached_dev_attach */
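The cached_dev reference count switches from atomic_t to refcount_t above; the gain is that refcount_t saturates and warns on overflow or underflow instead of silently wrapping into a use-after-free. The same get/put shape on a hypothetical object:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
        refcount_t ref;
        /* ... payload ... */
};

static bool demo_get(struct demo_obj *o)
{
        /* Fails once the count has already dropped to zero. */
        return refcount_inc_not_zero(&o->ref);
}

static void demo_put(struct demo_obj *o)
{
        /* The last put frees; an extra put warns instead of wrapping. */
        if (refcount_dec_and_test(&o->ref))
                kfree(o);
}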
index 658c54b3b07a96e8f53b73b1e30eba117e2a2274,d8865e6ead37046b6d9a0b9c56baa1306a9ee089..11c5503d31dc3029df2cde14f8f9e9fc48514bc2
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * Copyright (C) 2010 Kent Overstreet <[email protected]>
   *
@@@ -1241,6 -1240,11 +1241,11 @@@ void bch_initial_mark_key(struct cache_
        __bch_btree_mark_key(c, level, k);
  }
  
+ void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
+ {
+       stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
+ }
  static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
  {
        uint8_t stale = 0;
@@@ -1652,9 -1656,8 +1657,8 @@@ static void btree_gc_start(struct cache
        mutex_unlock(&c->bucket_lock);
  }
  
- static size_t bch_btree_gc_finish(struct cache_set *c)
+ static void bch_btree_gc_finish(struct cache_set *c)
  {
-       size_t available = 0;
        struct bucket *b;
        struct cache *ca;
        unsigned i;
        }
        rcu_read_unlock();
  
+       c->avail_nbuckets = 0;
        for_each_cache(ca, c, i) {
                uint64_t *i;
  
                        BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
  
                        if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
-                               available++;
+                               c->avail_nbuckets++;
                }
        }
  
        mutex_unlock(&c->bucket_lock);
-       return available;
  }
  
  static void bch_btree_gc(struct cache_set *c)
  {
        int ret;
-       unsigned long available;
        struct gc_stat stats;
        struct closure writes;
        struct btree_op op;
                        pr_warn("gc failed!");
        } while (ret);
  
-       available = bch_btree_gc_finish(c);
+       bch_btree_gc_finish(c);
        wake_up_allocators(c);
  
        bch_time_stats_update(&c->btree_gc_time, start_time);
  
        stats.key_bytes *= sizeof(uint64_t);
        stats.data      <<= 9;
-       stats.in_use    = (c->nbuckets - available) * 100 / c->nbuckets;
+       bch_update_bucket_in_use(c, &stats);
        memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
  
        trace_bcache_gc_end(c);
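With avail_nbuckets maintained at alloc/free time, bch_update_bucket_in_use() can derive occupancy directly instead of recounting buckets after every GC pass. A quick worked example of the formula above, with invented numbers: for nbuckets = 1000 and avail_nbuckets = 250, in_use = (1000 - 250) * 100 / 1000 = 75 percent.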
index 42204d61bc9544d29f1d9b53f8eb0b6211c909f7,4073aca09a4982af8647dc5bae9cabf24f82e540..d211e2c25b6bce30591e131663b77123c099b3df
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _BCACHE_BTREE_H
  #define _BCACHE_BTREE_H
  
@@@ -306,5 -305,5 +306,5 @@@ void bch_keybuf_del(struct keybuf *, st
  struct keybuf_key *bch_keybuf_next(struct keybuf *);
  struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
                                          struct bkey *, keybuf_pred_fn *);
+ void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
  #endif
index 965907ce1e2097672d260f43fb8f7ddc5343b414,00fb314cce57c4e910601eb87085f3d58bce321b..ccfbea6f9f6ba93fc0e5dec6103bf5bef960f64b
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _LINUX_CLOSURE_H
  #define _LINUX_CLOSURE_H
  
@@@ -252,6 -251,12 +252,12 @@@ static inline void set_closure_fn(struc
  static inline void closure_queue(struct closure *cl)
  {
        struct workqueue_struct *wq = cl->wq;
+       /**
+        * Changes made to closure, work_struct, or a couple of other structs
+        * may leave work.func pointing at the wrong location.
+        */
+       BUILD_BUG_ON(offsetof(struct closure, fn)
+                    != offsetof(struct work_struct, func));
        if (wq) {
                INIT_WORK(&cl->work, cl->work.func);
                BUG_ON(!queue_work(wq, &cl->work));
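The closure_queue() hunk above turns a layout assumption into a compile-time check: reusing cl->work.func as the closure's fn is only safe if the two fields overlay each other. The same BUILD_BUG_ON()/offsetof() idiom in isolation, with throwaway struct names:

#include <linux/bug.h>
#include <linux/stddef.h>

struct demo_a {
        void *first;
        void (*fn)(struct demo_a *);
};

struct demo_b {
        void *first;
        void (*func)(struct demo_b *);
};

static inline void demo_check_layout(void)
{
        /* The build fails here if either struct is ever reordered. */
        BUILD_BUG_ON(offsetof(struct demo_a, fn) !=
                     offsetof(struct demo_b, func));
}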
index 3475d6628e219f6eecda80e1bf98c56ef67c3bc4,597dd1e87beab2504c40475b996058efdafdbe58..3a7aed7282b2a0227e9f04cec6d82404bad55f23
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * Main bcache entry point - handle a read or a write request and decide what to
   * do with it; the make_request functions are called by the block layer.
@@@ -27,12 -26,12 +27,12 @@@ struct kmem_cache *bch_search_cache
  
  static void bch_data_insert_start(struct closure *);
  
- static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+ static unsigned cache_mode(struct cached_dev *dc)
  {
        return BDEV_CACHE_MODE(&dc->sb);
  }
  
- static bool verify(struct cached_dev *dc, struct bio *bio)
+ static bool verify(struct cached_dev *dc)
  {
        return dc->verify;
  }
@@@ -370,7 -369,7 +370,7 @@@ static struct hlist_head *iohash(struc
  static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
  {
        struct cache_set *c = dc->disk.c;
-       unsigned mode = cache_mode(dc, bio);
+       unsigned mode = cache_mode(dc);
        unsigned sectors, congested = bch_get_congested(c);
        struct task_struct *task = current;
        struct io *i;
             op_is_write(bio_op(bio))))
                goto skip;
  
+       /*
+        * Flag for bypass if the IO is for read-ahead or background,
+        * unless the read-ahead request is for metadata (e.g., for gfs2).
+        */
+       if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+           !(bio->bi_opf & REQ_META))
+               goto skip;
        if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
            bio_sectors(bio) & (c->sb.block_size - 1)) {
                pr_debug("skipping unaligned io");
@@@ -463,6 -470,7 +471,7 @@@ struct search 
        unsigned                recoverable:1;
        unsigned                write:1;
        unsigned                read_dirty_data:1;
+       unsigned                cache_missed:1;
  
        unsigned long           start_time;
  
@@@ -649,6 -657,7 +658,7 @@@ static inline struct search *search_all
  
        s->orig_bio             = bio;
        s->cache_miss           = NULL;
+       s->cache_missed         = 0;
        s->d                    = d;
        s->recoverable          = 1;
        s->write                = op_is_write(bio_op(bio));
@@@ -698,8 -707,16 +708,16 @@@ static void cached_dev_read_error(struc
  {
        struct search *s = container_of(cl, struct search, cl);
        struct bio *bio = &s->bio.bio;
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
  
-       if (s->recoverable) {
+       /*
+       * If the cache device is dirty (dc->has_dirty is non-zero), then
+       * recovering a failed read request from the cached device may return
+       * stale data. So read failure recovery is only permitted when the
+       * cache device is clean.
+        */
+       if (s->recoverable &&
+           (dc && !atomic_read(&dc->has_dirty))) {
                /* Retry from the backing device: */
                trace_bcache_read_retry(s->orig_bio);
  
@@@ -740,7 -757,7 +758,7 @@@ static void cached_dev_read_done(struc
                s->cache_miss = NULL;
        }
  
-       if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
+       if (verify(dc) && s->recoverable && !s->read_dirty_data)
                bch_data_verify(dc, s->orig_bio);
  
        bio_complete(s);
@@@ -760,12 -777,12 +778,12 @@@ static void cached_dev_read_done_bh(str
        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
  
        bch_mark_cache_accounting(s->iop.c, s->d,
-                                 !s->cache_miss, s->iop.bypass);
+                                 !s->cache_missed, s->iop.bypass);
        trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
  
        if (s->iop.status)
                continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
-       else if (s->iop.bio || verify(dc, &s->bio.bio))
+       else if (s->iop.bio || verify(dc))
                continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
        else
                continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@@ -779,6 -796,8 +797,8 @@@ static int cached_dev_cache_miss(struc
        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
        struct bio *miss, *cache_bio;
  
+       s->cache_missed = 1;
        if (s->cache_miss || s->iop.bypass) {
                miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
                ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
@@@ -892,7 -911,7 +912,7 @@@ static void cached_dev_write(struct cac
                s->iop.bypass = true;
  
        if (should_writeback(dc, s->orig_bio,
-                            cache_mode(dc, bio),
+                            cache_mode(dc),
                             s->iop.bypass)) {
                s->iop.bypass = false;
                s->iop.writeback = true;
index 234b2f5b286df209984ed1223ae51cedfbb9e314,2290bffd49228e8ac85f7bd999caf9524e7d9036..b4184092c7279fa2fe1246f6a3e70650c850d6a1
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * bcache sysfs interfaces
   *
@@@ -82,8 -81,9 +82,9 @@@ rw_attribute(writeback_delay)
  rw_attribute(writeback_rate);
  
  rw_attribute(writeback_rate_update_seconds);
- rw_attribute(writeback_rate_d_term);
+ rw_attribute(writeback_rate_i_term_inverse);
  rw_attribute(writeback_rate_p_term_inverse);
+ rw_attribute(writeback_rate_minimum);
  read_attribute(writeback_rate_debug);
  
  read_attribute(stripe_size);
@@@ -131,15 -131,16 +132,16 @@@ SHOW(__bch_cached_dev
        sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
  
        var_print(writeback_rate_update_seconds);
-       var_print(writeback_rate_d_term);
+       var_print(writeback_rate_i_term_inverse);
        var_print(writeback_rate_p_term_inverse);
+       var_print(writeback_rate_minimum);
  
        if (attr == &sysfs_writeback_rate_debug) {
                char rate[20];
                char dirty[20];
                char target[20];
                char proportional[20];
-               char derivative[20];
+               char integral[20];
                char change[20];
                s64 next_io;
  
                bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 9);
                bch_hprint(target,      dc->writeback_rate_target << 9);
                bch_hprint(proportional,dc->writeback_rate_proportional << 9);
-               bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
+               bch_hprint(integral,    dc->writeback_rate_integral_scaled << 9);
                bch_hprint(change,      dc->writeback_rate_change << 9);
  
                next_io = div64_s64(dc->writeback_rate.next - local_clock(),
                               "dirty:\t\t%s\n"
                               "target:\t\t%s\n"
                               "proportional:\t%s\n"
-                              "derivative:\t%s\n"
+                              "integral:\t%s\n"
                               "change:\t\t%s/sec\n"
                               "next io:\t%llims\n",
                               rate, dirty, target, proportional,
-                              derivative, change, next_io);
+                              integral, change, next_io);
        }
  
        sysfs_hprint(dirty_data,
@@@ -214,7 -215,7 +216,7 @@@ STORE(__cached_dev
                            dc->writeback_rate.rate, 1, INT_MAX);
  
        d_strtoul_nonzero(writeback_rate_update_seconds);
-       d_strtoul(writeback_rate_d_term);
+       d_strtoul(writeback_rate_i_term_inverse);
        d_strtoul_nonzero(writeback_rate_p_term_inverse);
  
        d_strtoi_h(sequential_cutoff);
@@@ -320,7 -321,7 +322,7 @@@ static struct attribute *bch_cached_dev
        &sysfs_writeback_percent,
        &sysfs_writeback_rate,
        &sysfs_writeback_rate_update_seconds,
-       &sysfs_writeback_rate_d_term,
+       &sysfs_writeback_rate_i_term_inverse,
        &sysfs_writeback_rate_p_term_inverse,
        &sysfs_writeback_rate_debug,
        &sysfs_dirty_data,
@@@ -746,6 -747,11 +748,11 @@@ static struct attribute *bch_cache_set_
  };
  KTYPE(bch_cache_set_internal);
  
+ static int __bch_cache_cmp(const void *l, const void *r)
+ {
+       return *((uint16_t *)r) - *((uint16_t *)l);
+ }
  SHOW(__bch_cache)
  {
        struct cache *ca = container_of(kobj, struct cache, kobj);
                                               CACHE_REPLACEMENT(&ca->sb));
  
        if (attr == &sysfs_priority_stats) {
-               int cmp(const void *l, const void *r)
-               {       return *((uint16_t *) r) - *((uint16_t *) l); }
                struct bucket *b;
                size_t n = ca->sb.nbuckets, i;
                size_t unused = 0, available = 0, dirty = 0, meta = 0;
                        p[i] = ca->buckets[i].prio;
                mutex_unlock(&ca->set->bucket_lock);
  
-               sort(p, n, sizeof(uint16_t), cmp, NULL);
+               sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL);
  
                while (n &&
                       !cached[n - 1])
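Hoisting the comparator out of the SHOW() body above replaces a GCC nested function with the ordinary file-scope function that sort() expects. The same descending u16 sort, standalone (demo_* names are placeholders):

#include <linux/sort.h>
#include <linux/types.h>

/* Descending order: larger priorities first, as in the sysfs dump above. */
static int demo_u16_cmp_desc(const void *l, const void *r)
{
        return *(const uint16_t *)r - *(const uint16_t *)l;
}

static void demo_sort_prios(uint16_t *p, size_t n)
{
        sort(p, n, sizeof(uint16_t), demo_u16_cmp_desc, NULL);
}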
diff --combined drivers/md/bcache/util.h
index f54b58282f7756acd13600206714a1e4064728b4,8f509290bb02ec1b77c9306e983adada04aa3351..ed5e8a412eb8e3582251495e137741fa6a665cf6
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  
  #ifndef _BCACHE_UTIL_H
  #define _BCACHE_UTIL_H
@@@ -442,10 -441,10 +442,10 @@@ struct bch_ratelimit 
        uint64_t                next;
  
        /*
-        * Rate at which we want to do work, in units per nanosecond
+        * Rate at which we want to do work, in units per second
         * The units here correspond to the units passed to bch_next_delay()
         */
-       unsigned                rate;
+       uint32_t                rate;
  };
  
  static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
index 70454f2ad2faacde608f40638dd55955eed4e586,9b770b13bdf62058f372cb799575c0f9b399d065..56a37884ca8b44725794c59fbe9197beb679314c
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * background writeback - scan btree for dirty data and write it to the backing
   * device
@@@ -26,48 -25,63 +26,63 @@@ static void __update_writeback_rate(str
                                bcache_flash_devs_sectors_dirty(c);
        uint64_t cache_dirty_target =
                div_u64(cache_sectors * dc->writeback_percent, 100);
        int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
                                   c->cached_dev_sectors);
  
-       /* PD controller */
+       /*
+        * PI controller:
+        * Figures out the amount that should be written per second.
+        *
+        * First, the error (number of sectors that are dirty beyond our
+        * target) is calculated.  The error is accumulated (numerically
+        * integrated).
+        *
+        * Then, the proportional value and integral value are scaled
+        * based on configured values.  These are stored as inverses to
+        * avoid fixed point math and to make configuration easy-- e.g.
+        * the default value of 40 for writeback_rate_p_term_inverse
+        * attempts to write at a rate that would retire all the dirty
+        * blocks in 40 seconds.
+        *
+        * The writeback_rate_i_term_inverse value of 10000 means that 1/10000th
+        * of the error is accumulated in the integral term per second.
+        * This acts as a slow, long-term average that is not subject to
+        * variations in usage like the p term.
+        */
        int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
-       int64_t derivative = dirty - dc->disk.sectors_dirty_last;
-       int64_t proportional = dirty - target;
-       int64_t change;
-       dc->disk.sectors_dirty_last = dirty;
-       /* Scale to sectors per second */
-       proportional *= dc->writeback_rate_update_seconds;
-       proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
-       derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
-       derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
-                             (dc->writeback_rate_d_term /
-                              dc->writeback_rate_update_seconds) ?: 1, 0);
-       derivative *= dc->writeback_rate_d_term;
-       derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
-       change = proportional + derivative;
+       int64_t error = dirty - target;
+       int64_t proportional_scaled =
+               div_s64(error, dc->writeback_rate_p_term_inverse);
+       int64_t integral_scaled;
+       uint32_t new_rate;
+       if ((error < 0 && dc->writeback_rate_integral > 0) ||
+           (error > 0 && time_before64(local_clock(),
+                        dc->writeback_rate.next + NSEC_PER_MSEC))) {
+               /*
+                * Only decrease the integral term if it's more than
+                * zero.  Only increase the integral term if the device
+                * is keeping up.  (Don't wind up the integral
+                * ineffectively in either case).
+                *
+                * It's necessary to scale this by
+                * writeback_rate_update_seconds to keep the integral
+                * term dimensioned properly.
+                */
+               dc->writeback_rate_integral += error *
+                       dc->writeback_rate_update_seconds;
+       }
  
-       /* Don't increase writeback rate if the device isn't keeping up */
-       if (change > 0 &&
-           time_after64(local_clock(),
-                        dc->writeback_rate.next + NSEC_PER_MSEC))
-               change = 0;
+       integral_scaled = div_s64(dc->writeback_rate_integral,
+                       dc->writeback_rate_i_term_inverse);
  
-       dc->writeback_rate.rate =
-               clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
-                       1, NSEC_PER_MSEC);
+       new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
+                       dc->writeback_rate_minimum, NSEC_PER_SEC);
  
-       dc->writeback_rate_proportional = proportional;
-       dc->writeback_rate_derivative = derivative;
-       dc->writeback_rate_change = change;
+       dc->writeback_rate_proportional = proportional_scaled;
+       dc->writeback_rate_integral_scaled = integral_scaled;
+       dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
+       dc->writeback_rate.rate = new_rate;
        dc->writeback_rate_target = target;
  }
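To make the controller above concrete, here is one update step using the defaults set later in this file (writeback_rate_p_term_inverse = 40, writeback_rate_i_term_inverse = 10000, writeback_rate_update_seconds = 5); the error value is invented. If the device is 80,000 sectors over target, the proportional term asks for 80,000 / 40 = 2,000 sectors/s. As long as the device is keeping up, each 5-second update also adds 80,000 * 5 = 400,000 to the integral, so after the first update the integral term contributes 400,000 / 10,000 = 40 sectors/s and keeps accumulating until the error unwinds. The sum is then clamped between writeback_rate_minimum and NSEC_PER_SEC before it becomes the new rate.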
  
@@@ -180,13 -194,21 +195,21 @@@ static void write_dirty(struct closure 
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
        struct keybuf_key *w = io->bio.bi_private;
  
-       dirty_init(w);
-       bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
-       io->bio.bi_iter.bi_sector = KEY_START(&w->key);
-       bio_set_dev(&io->bio, io->dc->bdev);
-       io->bio.bi_end_io       = dirty_endio;
+       /*
+        * IO errors are signalled using the dirty bit on the key.
+        * If we failed to read, we should not attempt to write to the
+        * backing device.  Instead, immediately go to write_dirty_finish
+        * to clean up.
+        */
+       if (KEY_DIRTY(&w->key)) {
+               dirty_init(w);
+               bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+               io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+               bio_set_dev(&io->bio, io->dc->bdev);
+               io->bio.bi_end_io       = dirty_endio;
  
-       closure_bio_submit(&io->bio, cl);
+               closure_bio_submit(&io->bio, cl);
+       }
  
        continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
  }
@@@ -418,6 -440,8 +441,8 @@@ static int bch_writeback_thread(void *a
        struct cached_dev *dc = arg;
        bool searched_full_index;
  
+       bch_ratelimit_reset(&dc->writeback_rate);
        while (!kthread_should_stop()) {
                down_write(&dc->writeback_lock);
                if (!atomic_read(&dc->has_dirty) ||
  
                up_write(&dc->writeback_lock);
  
-               bch_ratelimit_reset(&dc->writeback_rate);
                read_dirty(dc);
  
                if (searched_full_index) {
                               !kthread_should_stop() &&
                               !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
                                delay = schedule_timeout_interruptible(delay);
+                       bch_ratelimit_reset(&dc->writeback_rate);
                }
        }
  
@@@ -492,8 -517,6 +518,6 @@@ void bch_sectors_dirty_init(struct bcac
  
        bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
                           sectors_dirty_init_fn, 0);
-       d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
  }
  
  void bch_cached_dev_writeback_init(struct cached_dev *dc)
        dc->writeback_percent           = 10;
        dc->writeback_delay             = 30;
        dc->writeback_rate.rate         = 1024;
+       dc->writeback_rate_minimum      = 8;
  
        dc->writeback_rate_update_seconds = 5;
-       dc->writeback_rate_d_term       = 30;
-       dc->writeback_rate_p_term_inverse = 6000;
+       dc->writeback_rate_p_term_inverse = 40;
+       dc->writeback_rate_i_term_inverse = 10000;
  
        INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
  }
index 151544740148618ef8b22407618e42af5855ea28,7d25bff37a9bf95ada7bfdeeb3b8e3fb17aaa469..a9e3ffb4b03c72bcc713b0147583b72072383a94
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _BCACHE_WRITEBACK_H
  #define _BCACHE_WRITEBACK_H
  
@@@ -77,7 -76,9 +77,9 @@@ static inline bool should_writeback(str
        if (would_skip)
                return false;
  
-       return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+       return (op_is_sync(bio->bi_opf) ||
+               bio->bi_opf & (REQ_META|REQ_PRIO) ||
+               in_use <= CUTOFF_WRITEBACK);
  }
  
  static inline void bch_writeback_queue(struct cached_dev *dc)
@@@ -90,7 -91,7 +92,7 @@@ static inline void bch_writeback_add(st
  {
        if (!atomic_read(&dc->has_dirty) &&
            !atomic_xchg(&dc->has_dirty, 1)) {
-               atomic_inc(&dc->count);
+               refcount_inc(&dc->count);
  
                if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
                        SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
diff --combined drivers/md/dm.c
index 8aaffa19b29af44301542a87e7e660589cc4ec30,8d07ad61221c64ee7352e3f02f7ae8b40a700967..a3f8cbb98dd5b7e77d62e93160eb4469743ec063
@@@ -52,12 -52,6 +52,12 @@@ static struct workqueue_struct *deferre
  atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  
 +void dm_issue_global_event(void)
 +{
 +      atomic_inc(&dm_global_event_nr);
 +      wake_up(&dm_global_eventq);
 +}
 +
  /*
   * One of these is allocated per bio.
   */
@@@ -114,7 -108,7 +114,7 @@@ static unsigned reserved_bio_based_ios 
  
  static int __dm_get_module_param_int(int *module_param, int min, int max)
  {
 -      int param = ACCESS_ONCE(*module_param);
 +      int param = READ_ONCE(*module_param);
        int modified_param = 0;
        bool modified = true;
  
  unsigned __dm_get_module_param(unsigned *module_param,
                               unsigned def, unsigned max)
  {
 -      unsigned param = ACCESS_ONCE(*module_param);
 +      unsigned param = READ_ONCE(*module_param);
        unsigned modified_param = 0;
  
        if (!param)
@@@ -1618,17 -1612,6 +1618,6 @@@ static void dm_wq_work(struct work_stru
  
  void dm_init_md_queue(struct mapped_device *md)
  {
-       /*
-        * Request-based dm devices cannot be stacked on top of bio-based dm
-        * devices.  The type of this dm device may not have been decided yet.
-        * The type is decided at the first table loading time.
-        * To prevent problematic device stacking, clear the queue flag
-        * for request stacking support until then.
-        *
-        * This queue is new, so no concurrency on the queue_flags.
-        */
-       queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
        /*
         * Initialize data that will only be used by a non-blk-mq DM queue
         * - must do so here (in alloc_dev callchain) before queue is used
@@@ -1871,8 -1854,9 +1860,8 @@@ static void event_callback(void *contex
        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
  
        atomic_inc(&md->event_nr);
 -      atomic_inc(&dm_global_event_nr);
        wake_up(&md->eventq);
 -      wake_up(&dm_global_eventq);
 +      dm_issue_global_event();
  }
  
  /*
@@@ -2288,7 -2272,6 +2277,7 @@@ struct dm_table *dm_swap_table(struct m
        }
  
        map = __bind(md, table, &limits);
 +      dm_issue_global_event();
  
  out:
        mutex_unlock(&md->suspend_lock);
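dm_issue_global_event() above factors out the event-counter-plus-waitqueue pattern now shared by the uevent path and the table swap. The bare idiom, with hypothetical names (this is not the device-mapper ioctl interface, just the kernel-side shape):

#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t demo_event_nr = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(demo_eventq);

static void demo_issue_event(void)
{
        atomic_inc(&demo_event_nr);
        wake_up(&demo_eventq);
}

/* A waiter sleeps until the counter moves past the value it last saw. */
static int demo_wait_for_event(int last_seen)
{
        return wait_event_interruptible(demo_eventq,
                        atomic_read(&demo_event_nr) != last_seen);
}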
index 7b96e4588a128bd71953be919bd3df7585220805,b856f2f549cdaf28c8efdd8b59c94d168dfb499f..a25fd43650ad6b7df8ffbe3bcd2eff859594b6bc
@@@ -1,4 -1,3 +1,4 @@@
 +# SPDX-License-Identifier: GPL-2.0
  obj-$(CONFIG_NVME_CORE)                       += nvme-core.o
  obj-$(CONFIG_BLK_DEV_NVME)            += nvme.o
  obj-$(CONFIG_NVME_FABRICS)            += nvme-fabrics.o
@@@ -6,6 -5,7 +6,7 @@@ obj-$(CONFIG_NVME_RDMA)                  += nvme-rdma.
  obj-$(CONFIG_NVME_FC)                 += nvme-fc.o
  
  nvme-core-y                           := core.o
+ nvme-core-$(CONFIG_NVME_MULTIPATH)    += multipath.o
  nvme-core-$(CONFIG_NVM)                       += lightnvm.o
  
  nvme-y                                        += pci.o
diff --combined drivers/nvme/host/core.c
index 37f9039bb9cab29892f783ddb45a459809c3805c,993813ccdc0b679f9357a85cb1389611fcd1d21a..25da74d310d1bbd5e7c62f9a35de94f6279fbc25
  
  #define NVME_MINORS           (1U << MINORBITS)
  
- unsigned char admin_timeout = 60;
- module_param(admin_timeout, byte, 0644);
+ unsigned int admin_timeout = 60;
+ module_param(admin_timeout, uint, 0644);
  MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
  EXPORT_SYMBOL_GPL(admin_timeout);
  
- unsigned char nvme_io_timeout = 30;
- module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
+ unsigned int nvme_io_timeout = 30;
+ module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
  MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
  EXPORT_SYMBOL_GPL(nvme_io_timeout);
  
@@@ -52,9 -52,6 +52,6 @@@ static u8 nvme_max_retries = 5
  module_param_named(max_retries, nvme_max_retries, byte, 0644);
  MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
  
- static int nvme_char_major;
- module_param(nvme_char_major, int, 0);
  static unsigned long default_ps_max_latency_us = 100000;
  module_param(default_ps_max_latency_us, ulong, 0644);
  MODULE_PARM_DESC(default_ps_max_latency_us,
@@@ -71,10 -68,17 +68,17 @@@ MODULE_PARM_DESC(streams, "turn on supp
  struct workqueue_struct *nvme_wq;
  EXPORT_SYMBOL_GPL(nvme_wq);
  
- static LIST_HEAD(nvme_ctrl_list);
- static DEFINE_SPINLOCK(dev_list_lock);
+ static DEFINE_IDA(nvme_subsystems_ida);
+ static LIST_HEAD(nvme_subsystems);
+ static DEFINE_MUTEX(nvme_subsystems_lock);
  
+ static DEFINE_IDA(nvme_instance_ida);
+ static dev_t nvme_chr_devt;
  static struct class *nvme_class;
+ static struct class *nvme_subsys_class;
+ static void nvme_ns_remove(struct nvme_ns *ns);
+ static int nvme_revalidate_disk(struct gendisk *disk);
  
  static __le32 nvme_get_log_dw10(u8 lid, size_t size)
  {
@@@ -101,6 -105,51 +105,51 @@@ static int nvme_reset_ctrl_sync(struct 
        return ret;
  }
  
+ static void nvme_delete_ctrl_work(struct work_struct *work)
+ {
+       struct nvme_ctrl *ctrl =
+               container_of(work, struct nvme_ctrl, delete_work);
+       flush_work(&ctrl->reset_work);
+       nvme_stop_ctrl(ctrl);
+       nvme_remove_namespaces(ctrl);
+       ctrl->ops->delete_ctrl(ctrl);
+       nvme_uninit_ctrl(ctrl);
+       nvme_put_ctrl(ctrl);
+ }
+ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+ {
+       if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+               return -EBUSY;
+       if (!queue_work(nvme_wq, &ctrl->delete_work))
+               return -EBUSY;
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
+ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+ {
+       int ret = 0;
+       /*
+        * Keep a reference until the work is flushed since ->delete_ctrl
+        * can free the controller.
+        */
+       nvme_get_ctrl(ctrl);
+       ret = nvme_delete_ctrl(ctrl);
+       if (!ret)
+               flush_work(&ctrl->delete_work);
+       nvme_put_ctrl(ctrl);
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
+ static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+ {
+       return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
+ }
  static blk_status_t nvme_error_status(struct request *req)
  {
        switch (nvme_req(req)->status & 0x7ff) {
@@@ -142,9 -191,16 +191,16 @@@ static inline bool nvme_req_needs_retry
  void nvme_complete_rq(struct request *req)
  {
        if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
-               nvme_req(req)->retries++;
-               blk_mq_requeue_request(req, true);
-               return;
+               if (nvme_req_needs_failover(req)) {
+                       nvme_failover_req(req);
+                       return;
+               }
+               if (!blk_queue_dying(req->q)) {
+                       nvme_req(req)->retries++;
+                       blk_mq_requeue_request(req, true);
+                       return;
+               }
        }
  
        blk_mq_end_request(req, nvme_error_status(req));
@@@ -153,18 -209,13 +209,13 @@@ EXPORT_SYMBOL_GPL(nvme_complete_rq)
  
  void nvme_cancel_request(struct request *req, void *data, bool reserved)
  {
-       int status;
        if (!blk_mq_request_started(req))
                return;
  
        dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
                                "Cancelling I/O %d", req->tag);
  
-       status = NVME_SC_ABORT_REQ;
-       if (blk_queue_dying(req->q))
-               status |= NVME_SC_DNR;
-       nvme_req(req)->status = status;
+       nvme_req(req)->status = NVME_SC_ABORT_REQ;
        blk_mq_complete_request(req);
  
  }
@@@ -205,6 -256,7 +256,7 @@@ bool nvme_change_ctrl_state(struct nvme
        case NVME_CTRL_RECONNECTING:
                switch (old_state) {
                case NVME_CTRL_LIVE:
+               case NVME_CTRL_RESETTING:
                        changed = true;
                        /* FALLTHRU */
                default:
                ctrl->state = new_state;
  
        spin_unlock_irqrestore(&ctrl->lock, flags);
+       if (changed && ctrl->state == NVME_CTRL_LIVE)
+               nvme_kick_requeue_lists(ctrl);
        return changed;
  }
  EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
  
+ static void nvme_free_ns_head(struct kref *ref)
+ {
+       struct nvme_ns_head *head =
+               container_of(ref, struct nvme_ns_head, ref);
+       nvme_mpath_remove_disk(head);
+       ida_simple_remove(&head->subsys->ns_ida, head->instance);
+       list_del_init(&head->entry);
+       cleanup_srcu_struct(&head->srcu);
+       kfree(head);
+ }
+ static void nvme_put_ns_head(struct nvme_ns_head *head)
+ {
+       kref_put(&head->ref, nvme_free_ns_head);
+ }
  static void nvme_free_ns(struct kref *kref)
  {
        struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
        if (ns->ndev)
                nvme_nvm_unregister(ns);
  
-       if (ns->disk) {
-               spin_lock(&dev_list_lock);
-               ns->disk->private_data = NULL;
-               spin_unlock(&dev_list_lock);
-       }
        put_disk(ns->disk);
-       ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+       nvme_put_ns_head(ns->head);
        nvme_put_ctrl(ns->ctrl);
        kfree(ns);
  }
@@@ -268,31 -332,8 +332,8 @@@ static void nvme_put_ns(struct nvme_ns 
        kref_put(&ns->kref, nvme_free_ns);
  }
  
- static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
- {
-       struct nvme_ns *ns;
-       spin_lock(&dev_list_lock);
-       ns = disk->private_data;
-       if (ns) {
-               if (!kref_get_unless_zero(&ns->kref))
-                       goto fail;
-               if (!try_module_get(ns->ctrl->ops->module))
-                       goto fail_put_ns;
-       }
-       spin_unlock(&dev_list_lock);
-       return ns;
- fail_put_ns:
-       kref_put(&ns->kref, nvme_free_ns);
- fail:
-       spin_unlock(&dev_list_lock);
-       return NULL;
- }
  struct request *nvme_alloc_request(struct request_queue *q,
-               struct nvme_command *cmd, unsigned int flags, int qid)
+               struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
  {
        unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
        struct request *req;
@@@ -417,7 -458,7 +458,7 @@@ static inline void nvme_setup_flush(str
  {
        memset(cmnd, 0, sizeof(*cmnd));
        cmnd->common.opcode = nvme_cmd_flush;
-       cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
  }
  
  static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
  
        memset(cmnd, 0, sizeof(*cmnd));
        cmnd->dsm.opcode = nvme_cmd_dsm;
-       cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
        cmnd->dsm.nr = cpu_to_le32(segments - 1);
        cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
  
@@@ -467,16 -508,6 +508,6 @@@ static inline blk_status_t nvme_setup_r
        u16 control = 0;
        u32 dsmgmt = 0;
  
-       /*
-        * If formated with metadata, require the block layer provide a buffer
-        * unless this namespace is formated such that the metadata can be
-        * stripped/generated by the controller with PRACT=1.
-        */
-       if (ns && ns->ms &&
-           (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
-           !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
-               return BLK_STS_NOTSUPP;
        if (req->cmd_flags & REQ_FUA)
                control |= NVME_RW_FUA;
        if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
  
        memset(cmnd, 0, sizeof(*cmnd));
        cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
-       cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
        cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
        cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
  
                nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
  
        if (ns->ms) {
+               /*
+                * If formatted with metadata, the block layer always provides a
+                * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
+                * we enable the PRACT bit for protection information or set the
+                * namespace capacity to zero to prevent any I/O.
+                */
+               if (!blk_integrity_rq(req)) {
+                       if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+                               return BLK_STS_NOTSUPP;
+                       control |= NVME_RW_PRINFO_PRACT;
+               }
                switch (ns->pi_type) {
                case NVME_NS_DPS_PI_TYPE3:
                        control |= NVME_RW_PRINFO_PRCHK_GUARD;
                                        nvme_block_nr(ns, blk_rq_pos(req)));
                        break;
                }
-               if (!blk_integrity_rq(req))
-                       control |= NVME_RW_PRINFO_PRACT;
        }
  
        cmnd->rw.control = cpu_to_le16(control);
@@@ -560,7 -601,8 +601,8 @@@ EXPORT_SYMBOL_GPL(nvme_setup_cmd)
   */
  int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                union nvme_result *result, void *buffer, unsigned bufflen,
-               unsigned timeout, int qid, int at_head, int flags)
+               unsigned timeout, int qid, int at_head,
+               blk_mq_req_flags_t flags)
  {
        struct request *req;
        int ret;
@@@ -778,7 -820,7 +820,7 @@@ static int nvme_identify_ctrl(struct nv
  }
  
  static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
-               u8 *eui64, u8 *nguid, uuid_t *uuid)
+               struct nvme_ns_ids *ids)
  {
        struct nvme_command c = { };
        int status;
                                goto free_data;
                        }
                        len = NVME_NIDT_EUI64_LEN;
-                       memcpy(eui64, data + pos + sizeof(*cur), len);
+                       memcpy(ids->eui64, data + pos + sizeof(*cur), len);
                        break;
                case NVME_NIDT_NGUID:
                        if (cur->nidl != NVME_NIDT_NGUID_LEN) {
                                goto free_data;
                        }
                        len = NVME_NIDT_NGUID_LEN;
-                       memcpy(nguid, data + pos + sizeof(*cur), len);
+                       memcpy(ids->nguid, data + pos + sizeof(*cur), len);
                        break;
                case NVME_NIDT_UUID:
                        if (cur->nidl != NVME_NIDT_UUID_LEN) {
                                goto free_data;
                        }
                        len = NVME_NIDT_UUID_LEN;
-                       uuid_copy(uuid, data + pos + sizeof(*cur));
+                       uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
                        break;
                default:
                        /* Skip unknown types */
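The parser above now fills one struct nvme_ns_ids instead of three separate out-parameters. Judging only from the copies it performs, the structure carries the EUI-64, NGUID, and UUID side by side; the sketch below is inferred from those lengths rather than quoted from nvme.h, hence the demo_ name.

#include <linux/types.h>
#include <linux/uuid.h>

struct demo_ns_ids {
        u8      eui64[8];       /* NVME_NIDT_EUI64_LEN */
        u8      nguid[16];      /* NVME_NIDT_NGUID_LEN */
        uuid_t  uuid;           /* NVME_NIDT_UUID_LEN  */
};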
@@@ -968,7 -1010,7 +1010,7 @@@ static int nvme_submit_io(struct nvme_n
        memset(&c, 0, sizeof(c));
        c.rw.opcode = io.opcode;
        c.rw.flags = io.flags;
-       c.rw.nsid = cpu_to_le32(ns->ns_id);
+       c.rw.nsid = cpu_to_le32(ns->head->ns_id);
        c.rw.slba = cpu_to_le64(io.slba);
        c.rw.length = cpu_to_le16(io.nblocks);
        c.rw.control = cpu_to_le16(io.control);
                        metadata, meta_len, io.slba, NULL, 0);
  }
  
+ static u32 nvme_known_admin_effects(u8 opcode)
+ {
+       switch (opcode) {
+       case nvme_admin_format_nvm:
+               return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+                                       NVME_CMD_EFFECTS_CSE_MASK;
+       case nvme_admin_sanitize_nvm:
+               return NVME_CMD_EFFECTS_CSE_MASK;
+       default:
+               break;
+       }
+       return 0;
+ }
+ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+                                                               u8 opcode)
+ {
+       u32 effects = 0;
+       if (ns) {
+               if (ctrl->effects)
+                       effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+               if (effects & ~NVME_CMD_EFFECTS_CSUPP)
+                       dev_warn(ctrl->device,
+                                "IO command:%02x has unhandled effects:%08x\n",
+                                opcode, effects);
+               return 0;
+       }
+       if (ctrl->effects)
+               effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+       else
+               effects = nvme_known_admin_effects(opcode);
+       /*
+        * For simplicity, IO to all namespaces is quiesced even if the command
+        * effects say only one namespace is affected.
+        */
+       if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+               nvme_start_freeze(ctrl);
+               nvme_wait_freeze(ctrl);
+       }
+       return effects;
+ }
+ static void nvme_update_formats(struct nvme_ctrl *ctrl)
+ {
+       struct nvme_ns *ns;
+       mutex_lock(&ctrl->namespaces_mutex);
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               if (ns->disk && nvme_revalidate_disk(ns->disk))
+                       nvme_ns_remove(ns);
+       }
+       mutex_unlock(&ctrl->namespaces_mutex);
+ }
+ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+ {
+       /*
+        * Revalidate LBA changes prior to unfreezing. This is necessary to
+        * prevent memory corruption if a logical block size was changed by
+        * this command.
+        */
+       if (effects & NVME_CMD_EFFECTS_LBCC)
+               nvme_update_formats(ctrl);
+       if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
+               nvme_unfreeze(ctrl);
+       if (effects & NVME_CMD_EFFECTS_CCC)
+               nvme_init_identify(ctrl);
+       if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
+               nvme_queue_scan(ctrl);
+ }
  static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
                        struct nvme_passthru_cmd __user *ucmd)
  {
        struct nvme_passthru_cmd cmd;
        struct nvme_command c;
        unsigned timeout = 0;
+       u32 effects;
        int status;
  
        if (!capable(CAP_SYS_ADMIN))
        if (cmd.timeout_ms)
                timeout = msecs_to_jiffies(cmd.timeout_ms);
  
+       effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
        status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
                        (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
                        (void __user *)(uintptr_t)cmd.metadata, cmd.metadata,
                        0, &cmd.result, timeout);
+       nvme_passthru_end(ctrl, effects);
        if (status >= 0) {
                if (put_user(cmd.result, &ucmd->result))
                        return -EFAULT;
        return status;
  }
  
- static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
-               unsigned int cmd, unsigned long arg)
+ /*
+  * Issue ioctl requests on the first available path.  Note that unlike normal
+  * block layer requests we will not retry a failed request on another controller.
+  */
+ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+               struct nvme_ns_head **head, int *srcu_idx)
  {
-       struct nvme_ns *ns = bdev->bd_disk->private_data;
+ #ifdef CONFIG_NVME_MULTIPATH
+       if (disk->fops == &nvme_ns_head_ops) {
+               *head = disk->private_data;
+               *srcu_idx = srcu_read_lock(&(*head)->srcu);
+               return nvme_find_path(*head);
+       }
+ #endif
+       *head = NULL;
+       *srcu_idx = -1;
+       return disk->private_data;
+ }
  
+ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+ {
+       if (head)
+               srcu_read_unlock(&head->srcu, idx);
+ }
+ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
+ {
        switch (cmd) {
        case NVME_IOCTL_ID:
                force_successful_syscall_return();
-               return ns->ns_id;
+               return ns->head->ns_id;
        case NVME_IOCTL_ADMIN_CMD:
                return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
        case NVME_IOCTL_IO_CMD:
        }
  }
  
- #ifdef CONFIG_COMPAT
- static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
-                       unsigned int cmd, unsigned long arg)
+ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+               unsigned int cmd, unsigned long arg)
  {
-       return nvme_ioctl(bdev, mode, cmd, arg);
+       struct nvme_ns_head *head = NULL;
+       struct nvme_ns *ns;
+       int srcu_idx, ret;
+       ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+       if (unlikely(!ns))
+               ret = -EWOULDBLOCK;
+       else
+               ret = nvme_ns_ioctl(ns, cmd, arg);
+       nvme_put_ns_from_disk(head, srcu_idx);
+       return ret;
  }
- #else
- #define nvme_compat_ioctl     NULL
- #endif
  
  static int nvme_open(struct block_device *bdev, fmode_t mode)
  {
-       return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
+       struct nvme_ns *ns = bdev->bd_disk->private_data;
+ #ifdef CONFIG_NVME_MULTIPATH
+       /* should never be called due to GENHD_FL_HIDDEN */
+       if (WARN_ON_ONCE(ns->head->disk))
+               return -ENXIO;
+ #endif
+       if (!kref_get_unless_zero(&ns->kref))
+               return -ENXIO;
+       return 0;
  }
  
  static void nvme_release(struct gendisk *disk, fmode_t mode)
  {
-       struct nvme_ns *ns = disk->private_data;
-       module_put(ns->ctrl->ops->module);
-       nvme_put_ns(ns);
+       nvme_put_ns(disk->private_data);
  }
  
  static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
  }
  
  #ifdef CONFIG_BLK_DEV_INTEGRITY
- static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
-               u16 bs)
- {
-       struct nvme_ns *ns = disk->private_data;
-       u16 old_ms = ns->ms;
-       u8 pi_type = 0;
-       ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
-       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
-       /* PI implementation requires metadata equal t10 pi tuple size */
-       if (ns->ms == sizeof(struct t10_pi_tuple))
-               pi_type = id->dps & NVME_NS_DPS_PI_MASK;
-       if (blk_get_integrity(disk) &&
-           (ns->pi_type != pi_type || ns->ms != old_ms ||
-            bs != queue_logical_block_size(disk->queue) ||
-            (ns->ms && ns->ext)))
-               blk_integrity_unregister(disk);
-       ns->pi_type = pi_type;
- }
- static void nvme_init_integrity(struct nvme_ns *ns)
+ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
  {
        struct blk_integrity integrity;
  
        memset(&integrity, 0, sizeof(integrity));
-       switch (ns->pi_type) {
+       switch (pi_type) {
        case NVME_NS_DPS_PI_TYPE3:
                integrity.profile = &t10_pi_type3_crc;
                integrity.tag_size = sizeof(u16) + sizeof(u32);
                integrity.profile = NULL;
                break;
        }
-       integrity.tuple_size = ns->ms;
-       blk_integrity_register(ns->disk, &integrity);
-       blk_queue_max_integrity_segments(ns->queue, 1);
+       integrity.tuple_size = ms;
+       blk_integrity_register(disk, &integrity);
+       blk_queue_max_integrity_segments(disk->queue, 1);
  }
  #else
- static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
-               u16 bs)
- {
- }
- static void nvme_init_integrity(struct nvme_ns *ns)
+ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
  {
  }
  #endif /* CONFIG_BLK_DEV_INTEGRITY */
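As a usage note on the reworked ioctl path above: NVME_IOCTL_ID still reports the namespace ID directly as the ioctl return value, now sourced from ns->head->ns_id. A minimal userspace sketch; the device path /dev/nvme0n1 is an assumption, and NVME_IOCTL_ID comes from the kernel's uapi header <linux/nvme_ioctl.h>:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/nvme_ioctl.h>

    int main(void)
    {
            int fd, nsid;

            /* the namespace block node to query; the path is an assumption */
            fd = open("/dev/nvme0n1", O_RDONLY);
            if (fd < 0)
                    return 1;
            /* the driver hands back the nsid as the syscall return value */
            nsid = ioctl(fd, NVME_IOCTL_ID);
            if (nsid < 0)
                    perror("NVME_IOCTL_ID");
            else
                    printf("nsid = %d\n", nsid);
            close(fd);
            return 0;
    }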
@@@ -1149,53 -1276,89 +1276,89 @@@ static void nvme_set_chunk_size(struct 
        blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
  }
  
- static void nvme_config_discard(struct nvme_ns *ns)
+ static void nvme_config_discard(struct nvme_ctrl *ctrl,
+               unsigned stream_alignment, struct request_queue *queue)
  {
-       struct nvme_ctrl *ctrl = ns->ctrl;
-       u32 logical_block_size = queue_logical_block_size(ns->queue);
+       u32 size = queue_logical_block_size(queue);
+       if (stream_alignment)
+               size *= stream_alignment;
  
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
  
-       if (ctrl->nr_streams && ns->sws && ns->sgs) {
-               unsigned int sz = logical_block_size * ns->sws * ns->sgs;
+       queue->limits.discard_alignment = size;
+       queue->limits.discard_granularity = size;
  
-               ns->queue->limits.discard_alignment = sz;
-               ns->queue->limits.discard_granularity = sz;
-       } else {
-               ns->queue->limits.discard_alignment = logical_block_size;
-               ns->queue->limits.discard_granularity = logical_block_size;
-       }
-       blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
-       blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
-       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+       blk_queue_max_discard_sectors(queue, UINT_MAX);
+       blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
  
        if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
-               blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
+               blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
  }
  
  static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
-               struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid)
+               struct nvme_id_ns *id, struct nvme_ns_ids *ids)
  {
+       memset(ids, 0, sizeof(*ids));
        if (ctrl->vs >= NVME_VS(1, 1, 0))
-               memcpy(eui64, id->eui64, sizeof(id->eui64));
+               memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
        if (ctrl->vs >= NVME_VS(1, 2, 0))
-               memcpy(nguid, id->nguid, sizeof(id->nguid));
+               memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
        if (ctrl->vs >= NVME_VS(1, 3, 0)) {
                 /* Don't treat error as fatal we potentially
                  * already have a NGUID or EUI-64
                  */
-               if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid))
+               if (nvme_identify_ns_descs(ctrl, nsid, ids))
                        dev_warn(ctrl->device,
                                 "%s: Identify Descriptors failed\n", __func__);
        }
  }
  
+ static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
+ {
+       return !uuid_is_null(&ids->uuid) ||
+               memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
+               memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
+ }
+ 
+ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
+ {
+       return uuid_equal(&a->uuid, &b->uuid) &&
+               memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
+               memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+ }
+ 
+ static void nvme_update_disk_info(struct gendisk *disk,
+               struct nvme_ns *ns, struct nvme_id_ns *id)
+ {
+       sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+       unsigned stream_alignment = 0;
+       if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
+               stream_alignment = ns->sws * ns->sgs;
+       blk_mq_freeze_queue(disk->queue);
+       blk_integrity_unregister(disk);
+       blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
+       if (ns->ms && !ns->ext &&
+           (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+               nvme_init_integrity(disk, ns->ms, ns->pi_type);
+       if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
+               capacity = 0;
+       set_capacity(disk, capacity);
+       if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+               nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
+       blk_mq_unfreeze_queue(disk->queue);
+ }
+ 
  static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
  {
        struct nvme_ns *ns = disk->private_data;
-       struct nvme_ctrl *ctrl = ns->ctrl;
-       u16 bs;
  
        /*
         * If identify namespace failed, use default 512 byte block size so
        ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
        if (ns->lba_shift == 0)
                ns->lba_shift = 9;
-       bs = 1 << ns->lba_shift;
        ns->noiob = le16_to_cpu(id->noiob);
+       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+       ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+       /* the PI implementation requires metadata equal t10 pi tuple size */
+       if (ns->ms == sizeof(struct t10_pi_tuple))
+               ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+       else
+               ns->pi_type = 0;
  
-       blk_mq_freeze_queue(disk->queue);
-       if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
-               nvme_prep_integrity(disk, id, bs);
-       blk_queue_logical_block_size(ns->queue, bs);
        if (ns->noiob)
                nvme_set_chunk_size(ns);
-       if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
-               nvme_init_integrity(ns);
-       if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
-               set_capacity(disk, 0);
-       else
-               set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-       if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
-               nvme_config_discard(ns);
-       blk_mq_unfreeze_queue(disk->queue);
+       nvme_update_disk_info(disk, ns, id);
+ #ifdef CONFIG_NVME_MULTIPATH
+       if (ns->head->disk)
+               nvme_update_disk_info(ns->head->disk, ns, id);
+ #endif
  }
  
  static int nvme_revalidate_disk(struct gendisk *disk)
        struct nvme_ns *ns = disk->private_data;
        struct nvme_ctrl *ctrl = ns->ctrl;
        struct nvme_id_ns *id;
-       u8 eui64[8] = { 0 }, nguid[16] = { 0 };
-       uuid_t uuid = uuid_null;
+       struct nvme_ns_ids ids;
        int ret = 0;
  
        if (test_bit(NVME_NS_DEAD, &ns->flags)) {
                return -ENODEV;
        }
  
-       id = nvme_identify_ns(ctrl, ns->ns_id);
+       id = nvme_identify_ns(ctrl, ns->head->ns_id);
        if (!id)
                return -ENODEV;
  
                goto out;
        }
  
-       nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
-       if (!uuid_equal(&ns->uuid, &uuid) ||
-           memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
-           memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
 +      __nvme_revalidate_disk(disk, id);
+       nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+       if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
                dev_err(ctrl->device,
-                       "identifiers changed for nsid %d\n", ns->ns_id);
+                       "identifiers changed for nsid %d\n", ns->head->ns_id);
                ret = -ENODEV;
        }
  
@@@ -1287,8 -1442,10 +1443,10 @@@ static char nvme_pr_type(enum pr_type t
  static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
                                u64 key, u64 sa_key, u8 op)
  {
-       struct nvme_ns *ns = bdev->bd_disk->private_data;
+       struct nvme_ns_head *head = NULL;
+       struct nvme_ns *ns;
        struct nvme_command c;
+       int srcu_idx, ret;
        u8 data[16] = { 0, };
  
        put_unaligned_le64(key, &data[0]);
  
        memset(&c, 0, sizeof(c));
        c.common.opcode = op;
-       c.common.nsid = cpu_to_le32(ns->ns_id);
+       c.common.nsid = cpu_to_le32(head->ns_id);
        c.common.cdw10[0] = cpu_to_le32(cdw10);
  
-       return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+       ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+       if (unlikely(!ns))
+               ret = -EWOULDBLOCK;
+       else
+               ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+       nvme_put_ns_from_disk(head, srcu_idx);
+       return ret;
  }
  
  static int nvme_pr_register(struct block_device *bdev, u64 old,
@@@ -1381,7 -1544,7 +1545,7 @@@ EXPORT_SYMBOL_GPL(nvme_sec_submit)
  static const struct block_device_operations nvme_fops = {
        .owner          = THIS_MODULE,
        .ioctl          = nvme_ioctl,
-       .compat_ioctl   = nvme_compat_ioctl,
+       .compat_ioctl   = nvme_ioctl,
        .open           = nvme_open,
        .release        = nvme_release,
        .getgeo         = nvme_getgeo,
        .pr_ops         = &nvme_pr_ops,
  };
  
+ #ifdef CONFIG_NVME_MULTIPATH
+ static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
+ {
+       struct nvme_ns_head *head = bdev->bd_disk->private_data;
+       if (!kref_get_unless_zero(&head->ref))
+               return -ENXIO;
+       return 0;
+ }
+ 
+ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
+ {
+       nvme_put_ns_head(disk->private_data);
+ }
+ 
+ const struct block_device_operations nvme_ns_head_ops = {
+       .owner          = THIS_MODULE,
+       .open           = nvme_ns_head_open,
+       .release        = nvme_ns_head_release,
+       .ioctl          = nvme_ioctl,
+       .compat_ioctl   = nvme_ioctl,
+       .getgeo         = nvme_getgeo,
+       .pr_ops         = &nvme_pr_ops,
+ };
+ #endif /* CONFIG_NVME_MULTIPATH */
+ 
  static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
  {
        unsigned long timeout =
@@@ -1737,14 -1926,15 +1927,15 @@@ static bool quirk_matches(const struct 
                string_matches(id->fr, q->fr, sizeof(id->fr));
  }
  
- static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
+               struct nvme_id_ctrl *id)
  {
        size_t nqnlen;
        int off;
  
        nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
        if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
-               strcpy(ctrl->subnqn, id->subnqn);
+               strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
                return;
        }
  
                dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
  
        /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
-       off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE,
+       off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
                        "nqn.2014.08.org.nvmexpress:%4x%4x",
                        le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
-       memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn));
+       memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
        off += sizeof(id->sn);
-       memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn));
+       memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
        off += sizeof(id->mn);
-       memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off);
+       memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
+ }
+ 
+ static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
+ {
+       ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
+       kfree(subsys);
+ }
+ 
+ static void nvme_release_subsystem(struct device *dev)
+ {
+       __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
+ }
+ 
+ static void nvme_destroy_subsystem(struct kref *ref)
+ {
+       struct nvme_subsystem *subsys =
+                       container_of(ref, struct nvme_subsystem, ref);
+       mutex_lock(&nvme_subsystems_lock);
+       list_del(&subsys->entry);
+       mutex_unlock(&nvme_subsystems_lock);
+       ida_destroy(&subsys->ns_ida);
+       device_del(&subsys->dev);
+       put_device(&subsys->dev);
+ }
+ 
+ static void nvme_put_subsystem(struct nvme_subsystem *subsys)
+ {
+       kref_put(&subsys->ref, nvme_destroy_subsystem);
+ }
+ 
+ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
+ {
+       struct nvme_subsystem *subsys;
+       lockdep_assert_held(&nvme_subsystems_lock);
+       list_for_each_entry(subsys, &nvme_subsystems, entry) {
+               if (strcmp(subsys->subnqn, subsysnqn))
+                       continue;
+               if (!kref_get_unless_zero(&subsys->ref))
+                       continue;
+               return subsys;
+       }
+       return NULL;
+ }
+ 
+ #define SUBSYS_ATTR_RO(_name, _mode, _show)                   \
+       struct device_attribute subsys_attr_##_name = \
+               __ATTR(_name, _mode, _show, NULL)
+ 
+ static ssize_t nvme_subsys_show_nqn(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+ {
+       struct nvme_subsystem *subsys =
+               container_of(dev, struct nvme_subsystem, dev);
+       return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
+ }
+ static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
+ 
+ #define nvme_subsys_show_str_function(field)                          \
+ static ssize_t subsys_##field##_show(struct device *dev,              \
+                           struct device_attribute *attr, char *buf)   \
+ {                                                                     \
+       struct nvme_subsystem *subsys =                                 \
+               container_of(dev, struct nvme_subsystem, dev);          \
+       return sprintf(buf, "%.*s\n",                                   \
+                      (int)sizeof(subsys->field), subsys->field);      \
+ }                                                                     \
+ static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
+ 
+ nvme_subsys_show_str_function(model);
+ nvme_subsys_show_str_function(serial);
+ nvme_subsys_show_str_function(firmware_rev);
+ 
+ static struct attribute *nvme_subsys_attrs[] = {
+       &subsys_attr_model.attr,
+       &subsys_attr_serial.attr,
+       &subsys_attr_firmware_rev.attr,
+       &subsys_attr_subsysnqn.attr,
+       NULL,
+ };
+ 
+ static struct attribute_group nvme_subsys_attrs_group = {
+       .attrs = nvme_subsys_attrs,
+ };
+ 
+ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
+       &nvme_subsys_attrs_group,
+       NULL,
+ };
+ 
+ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+ {
+       struct nvme_subsystem *subsys, *found;
+       int ret;
+       subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
+       if (!subsys)
+               return -ENOMEM;
+       ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
+       if (ret < 0) {
+               kfree(subsys);
+               return ret;
+       }
+       subsys->instance = ret;
+       mutex_init(&subsys->lock);
+       kref_init(&subsys->ref);
+       INIT_LIST_HEAD(&subsys->ctrls);
+       INIT_LIST_HEAD(&subsys->nsheads);
+       nvme_init_subnqn(subsys, ctrl, id);
+       memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
+       memcpy(subsys->model, id->mn, sizeof(subsys->model));
+       memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
+       subsys->vendor_id = le16_to_cpu(id->vid);
+       subsys->cmic = id->cmic;
+       subsys->dev.class = nvme_subsys_class;
+       subsys->dev.release = nvme_release_subsystem;
+       subsys->dev.groups = nvme_subsys_attrs_groups;
+       dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
+       device_initialize(&subsys->dev);
+       mutex_lock(&nvme_subsystems_lock);
+       found = __nvme_find_get_subsystem(subsys->subnqn);
+       if (found) {
+               /*
+                * Verify that the subsystem actually supports multiple
+                * controllers, else bail out.
+                */
+               if (!(id->cmic & (1 << 1))) {
+                       dev_err(ctrl->device,
+                               "ignoring ctrl due to duplicate subnqn (%s).\n",
+                               found->subnqn);
+                       nvme_put_subsystem(found);
+                       ret = -EINVAL;
+                       goto out_unlock;
+               }
+               __nvme_release_subsystem(subsys);
+               subsys = found;
+       } else {
+               ret = device_add(&subsys->dev);
+               if (ret) {
+                       dev_err(ctrl->device,
+                               "failed to register subsystem device.\n");
+                       goto out_unlock;
+               }
+               ida_init(&subsys->ns_ida);
+               list_add_tail(&subsys->entry, &nvme_subsystems);
+       }
+       ctrl->subsys = subsys;
+       mutex_unlock(&nvme_subsystems_lock);
+       if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
+                       dev_name(ctrl->device))) {
+               dev_err(ctrl->device,
+                       "failed to create sysfs link from subsystem.\n");
+               /* the transport driver will eventually put the subsystem */
+               return -EINVAL;
+       }
+       mutex_lock(&subsys->lock);
+       list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
+       mutex_unlock(&subsys->lock);
+       return 0;
+ out_unlock:
+       mutex_unlock(&nvme_subsystems_lock);
+       put_device(&subsys->dev);
+       return ret;
+ }
+ 
+ static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
+                       size_t size)
+ {
+       struct nvme_command c = { };
+       c.common.opcode = nvme_admin_get_log_page;
+       c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
+       c.common.cdw10[0] = nvme_get_log_dw10(log_page, size);
+       return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
+ }
+ 
+ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+ {
+       int ret;
+       if (!ctrl->effects)
+               ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+       if (!ctrl->effects)
+               return 0;
+       ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
+                                       sizeof(*ctrl->effects));
+       if (ret) {
+               kfree(ctrl->effects);
+               ctrl->effects = NULL;
+       }
+       return ret;
  }
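The subsystem objects created by nvme_init_subsystem() above end up in the new "nvme-subsystem" device class (registered in nvme_core_init() further down) as nvme-subsys<instance>, carrying the subsysnqn, model, serial and firmware_rev attributes defined above. A minimal userspace sketch that dumps them; the concrete name nvme-subsys0 assumes at least one subsystem has been registered:

    #include <stdio.h>

    static void print_attr(const char *name)
    {
            char path[128], buf[256];
            FILE *f;

            /* directory name follows the "nvme-subsys%d" pattern above */
            snprintf(path, sizeof(path),
                     "/sys/class/nvme-subsystem/nvme-subsys0/%s", name);
            f = fopen(path, "r");
            if (!f)
                    return;
            if (fgets(buf, sizeof(buf), f))
                    printf("%s: %s", name, buf);
            fclose(f);
    }

    int main(void)
    {
            print_attr("subsysnqn");
            print_attr("model");
            print_attr("serial");
            print_attr("firmware_rev");
            return 0;
    }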
  
  /*
@@@ -1797,9 -2195,19 +2196,19 @@@ int nvme_init_identify(struct nvme_ctr
                return -EIO;
        }
  
-       nvme_init_subnqn(ctrl, id);
+       if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
+               ret = nvme_get_effects_log(ctrl);
+               if (ret < 0)
+                       return ret;
+       }
  
        if (!ctrl->identified) {
+               int i;
+               ret = nvme_init_subsystem(ctrl, id);
+               if (ret)
+                       goto out_free;
                /*
                 * Check for quirks.  Quirk can depend on firmware version,
                 * so, in principle, the set of quirks present can change
                 * the device, but we'd have to make sure that the driver
                 * behaves intelligently if the quirks change.
                 */
-               int i;
                for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
                        if (quirk_matches(id, &core_quirks[i]))
                                ctrl->quirks |= core_quirks[i].quirks;
        }
  
        ctrl->oacs = le16_to_cpu(id->oacs);
-       ctrl->vid = le16_to_cpu(id->vid);
        ctrl->oncs = le16_to_cpup(&id->oncs);
        atomic_set(&ctrl->abort_limit, id->acl + 1);
        ctrl->vwc = id->vwc;
        ctrl->cntlid = le16_to_cpup(&id->cntlid);
-       memcpy(ctrl->serial, id->sn, sizeof(id->sn));
-       memcpy(ctrl->model, id->mn, sizeof(id->mn));
-       memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
        if (id->mdts)
                max_hw_sectors = 1 << (id->mdts + page_shift - 9);
        else
@@@ -1931,33 -2332,12 +2333,12 @@@ EXPORT_SYMBOL_GPL(nvme_init_identify)
  
  static int nvme_dev_open(struct inode *inode, struct file *file)
  {
-       struct nvme_ctrl *ctrl;
-       int instance = iminor(inode);
-       int ret = -ENODEV;
-       spin_lock(&dev_list_lock);
-       list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
-               if (ctrl->instance != instance)
-                       continue;
-               if (!ctrl->admin_q) {
-                       ret = -EWOULDBLOCK;
-                       break;
-               }
-               if (!kref_get_unless_zero(&ctrl->kref))
-                       break;
-               file->private_data = ctrl;
-               ret = 0;
-               break;
-       }
-       spin_unlock(&dev_list_lock);
-       return ret;
- }
+       struct nvme_ctrl *ctrl =
+               container_of(inode->i_cdev, struct nvme_ctrl, cdev);
  
- static int nvme_dev_release(struct inode *inode, struct file *file)
- {
-       nvme_put_ctrl(file->private_data);
+       if (ctrl->state != NVME_CTRL_LIVE)
+               return -EWOULDBLOCK;
+       file->private_data = ctrl;
        return 0;
  }
  
@@@ -2021,7 -2401,6 +2402,6 @@@ static long nvme_dev_ioctl(struct file 
  static const struct file_operations nvme_dev_fops = {
        .owner          = THIS_MODULE,
        .open           = nvme_dev_open,
-       .release        = nvme_dev_release,
        .unlocked_ioctl = nvme_dev_ioctl,
        .compat_ioctl   = nvme_dev_ioctl,
  };
@@@ -2051,77 -2430,86 +2431,86 @@@ static ssize_t nvme_sysfs_rescan(struc
  }
  static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
  
+ static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
+ {
+       struct gendisk *disk = dev_to_disk(dev);
+       if (disk->fops == &nvme_fops)
+               return nvme_get_ns_from_dev(dev)->head;
+       else
+               return disk->private_data;
+ }
+ 
  static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
-                                                               char *buf)
+               char *buf)
  {
-       struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
-       struct nvme_ctrl *ctrl = ns->ctrl;
-       int serial_len = sizeof(ctrl->serial);
-       int model_len = sizeof(ctrl->model);
+       struct nvme_ns_head *head = dev_to_ns_head(dev);
+       struct nvme_ns_ids *ids = &head->ids;
+       struct nvme_subsystem *subsys = head->subsys;
+       int serial_len = sizeof(subsys->serial);
+       int model_len = sizeof(subsys->model);
  
-       if (!uuid_is_null(&ns->uuid))
-               return sprintf(buf, "uuid.%pU\n", &ns->uuid);
+       if (!uuid_is_null(&ids->uuid))
+               return sprintf(buf, "uuid.%pU\n", &ids->uuid);
  
-       if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
-               return sprintf(buf, "eui.%16phN\n", ns->nguid);
+       if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+               return sprintf(buf, "eui.%16phN\n", ids->nguid);
  
-       if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
-               return sprintf(buf, "eui.%8phN\n", ns->eui);
+       if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+               return sprintf(buf, "eui.%8phN\n", ids->eui64);
  
-       while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
-                                 ctrl->serial[serial_len - 1] == '\0'))
+       while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
+                                 subsys->serial[serial_len - 1] == '\0'))
                serial_len--;
-       while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
-                                ctrl->model[model_len - 1] == '\0'))
+       while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
+                                subsys->model[model_len - 1] == '\0'))
                model_len--;
  
-       return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
-               serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
+       return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
+               serial_len, subsys->serial, model_len, subsys->model,
+               head->ns_id);
  }
  static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
  
  static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
-                         char *buf)
+               char *buf)
  {
-       struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
-       return sprintf(buf, "%pU\n", ns->nguid);
+       return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
  }
  static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL);
  
  static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
-                                                               char *buf)
+               char *buf)
  {
-       struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+       struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
  
        /* For backward compatibility expose the NGUID to userspace if
         * we have no UUID set
         */
-       if (uuid_is_null(&ns->uuid)) {
+       if (uuid_is_null(&ids->uuid)) {
                printk_ratelimited(KERN_WARNING
                                   "No UUID available providing old NGUID\n");
-               return sprintf(buf, "%pU\n", ns->nguid);
+               return sprintf(buf, "%pU\n", ids->nguid);
        }
-       return sprintf(buf, "%pU\n", &ns->uuid);
+       return sprintf(buf, "%pU\n", &ids->uuid);
  }
  static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
  
  static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
-                                                               char *buf)
+               char *buf)
  {
-       struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
-       return sprintf(buf, "%8phd\n", ns->eui);
+       return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
  }
  static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
  
  static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
-                                                               char *buf)
+               char *buf)
  {
-       struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
-       return sprintf(buf, "%d\n", ns->ns_id);
+       return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
  }
  static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
  
- static struct attribute *nvme_ns_attrs[] = {
+ static struct attribute *nvme_ns_id_attrs[] = {
        &dev_attr_wwid.attr,
        &dev_attr_uuid.attr,
        &dev_attr_nguid.attr,
        NULL,
  };
  
- static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
+ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
                struct attribute *a, int n)
  {
        struct device *dev = container_of(kobj, struct device, kobj);
-       struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+       struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
  
        if (a == &dev_attr_uuid.attr) {
-               if (uuid_is_null(&ns->uuid) &&
-                   !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
+               if (uuid_is_null(&ids->uuid) &&
+                   !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
                        return 0;
        }
        if (a == &dev_attr_nguid.attr) {
-               if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
+               if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
                        return 0;
        }
        if (a == &dev_attr_eui.attr) {
-               if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
+               if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
                        return 0;
        }
        return a->mode;
  }
  
- static const struct attribute_group nvme_ns_attr_group = {
-       .attrs          = nvme_ns_attrs,
-       .is_visible     = nvme_ns_attrs_are_visible,
+ const struct attribute_group nvme_ns_id_attr_group = {
+       .attrs          = nvme_ns_id_attrs,
+       .is_visible     = nvme_ns_id_attrs_are_visible,
  };
  
  #define nvme_show_str_function(field)                                         \
@@@ -2162,10 -2550,15 +2551,15 @@@ static ssize_t  field##_show(struct dev
                            struct device_attribute *attr, char *buf)           \
  {                                                                             \
          struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                                \
-         return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \
+         return sprintf(buf, "%.*s\n",                                         \
+               (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
  }                                                                             \
  static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
  
+ nvme_show_str_function(model);
+ nvme_show_str_function(serial);
+ nvme_show_str_function(firmware_rev);
  #define nvme_show_int_function(field)                                         \
  static ssize_t  field##_show(struct device *dev,                              \
                            struct device_attribute *attr, char *buf)           \
  }                                                                             \
  static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
  
- nvme_show_str_function(model);
- nvme_show_str_function(serial);
- nvme_show_str_function(firmware_rev);
  nvme_show_int_function(cntlid);
  
  static ssize_t nvme_sysfs_delete(struct device *dev,
        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
  
        if (device_remove_file_self(dev, attr))
-               ctrl->ops->delete_ctrl(ctrl);
+               nvme_delete_ctrl_sync(ctrl);
        return count;
  }
  static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
@@@ -2231,7 -2621,7 +2622,7 @@@ static ssize_t nvme_sysfs_show_subsysnq
  {
        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
  
-       return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn);
+       return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
  }
  static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
  
@@@ -2284,12 -2674,128 +2675,128 @@@ static const struct attribute_group *nv
        NULL,
  };
  
+ static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
+               unsigned nsid)
+ {
+       struct nvme_ns_head *h;
+       lockdep_assert_held(&subsys->lock);
+       list_for_each_entry(h, &subsys->nsheads, entry) {
+               if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
+                       return h;
+       }
+       return NULL;
+ }
+ 
+ static int __nvme_check_ids(struct nvme_subsystem *subsys,
+               struct nvme_ns_head *new)
+ {
+       struct nvme_ns_head *h;
+       lockdep_assert_held(&subsys->lock);
+       list_for_each_entry(h, &subsys->nsheads, entry) {
+               if (nvme_ns_ids_valid(&new->ids) &&
+                   nvme_ns_ids_equal(&new->ids, &h->ids))
+                       return -EINVAL;
+       }
+       return 0;
+ }
+ 
+ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
+               unsigned nsid, struct nvme_id_ns *id)
+ {
+       struct nvme_ns_head *head;
+       int ret = -ENOMEM;
+       head = kzalloc(sizeof(*head), GFP_KERNEL);
+       if (!head)
+               goto out;
+       ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
+       if (ret < 0)
+               goto out_free_head;
+       head->instance = ret;
+       INIT_LIST_HEAD(&head->list);
+       init_srcu_struct(&head->srcu);
+       head->subsys = ctrl->subsys;
+       head->ns_id = nsid;
+       kref_init(&head->ref);
+       nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+       ret = __nvme_check_ids(ctrl->subsys, head);
+       if (ret) {
+               dev_err(ctrl->device,
+                       "duplicate IDs for nsid %d\n", nsid);
+               goto out_cleanup_srcu;
+       }
+       ret = nvme_mpath_alloc_disk(ctrl, head);
+       if (ret)
+               goto out_cleanup_srcu;
+       list_add_tail(&head->entry, &ctrl->subsys->nsheads);
+       return head;
+ out_cleanup_srcu:
+       cleanup_srcu_struct(&head->srcu);
+       ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
+ out_free_head:
+       kfree(head);
+ out:
+       return ERR_PTR(ret);
+ }
+ 
+ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
+               struct nvme_id_ns *id, bool *new)
+ {
+       struct nvme_ctrl *ctrl = ns->ctrl;
+       bool is_shared = id->nmic & (1 << 0);
+       struct nvme_ns_head *head = NULL;
+       int ret = 0;
+       mutex_lock(&ctrl->subsys->lock);
+       if (is_shared)
+               head = __nvme_find_ns_head(ctrl->subsys, nsid);
+       if (!head) {
+               head = nvme_alloc_ns_head(ctrl, nsid, id);
+               if (IS_ERR(head)) {
+                       ret = PTR_ERR(head);
+                       goto out_unlock;
+               }
+               *new = true;
+       } else {
+               struct nvme_ns_ids ids;
+               nvme_report_ns_ids(ctrl, nsid, id, &ids);
+               if (!nvme_ns_ids_equal(&head->ids, &ids)) {
+                       dev_err(ctrl->device,
+                               "IDs don't match for shared namespace %d\n",
+                                       nsid);
+                       ret = -EINVAL;
+                       goto out_unlock;
+               }
+               *new = false;
+       }
+       list_add_tail(&ns->siblings, &head->list);
+       ns->head = head;
+ out_unlock:
+       mutex_unlock(&ctrl->subsys->lock);
+       return ret;
+ }
+ 
  static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
  {
        struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
        struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
  
-       return nsa->ns_id - nsb->ns_id;
+       return nsa->head->ns_id - nsb->head->ns_id;
  }
  
  static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
  
        mutex_lock(&ctrl->namespaces_mutex);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
-               if (ns->ns_id == nsid) {
-                       kref_get(&ns->kref);
+               if (ns->head->ns_id == nsid) {
+                       if (!kref_get_unless_zero(&ns->kref))
+                               continue;
                        ret = ns;
                        break;
                }
-               if (ns->ns_id > nsid)
+               if (ns->head->ns_id > nsid)
                        break;
        }
        mutex_unlock(&ctrl->namespaces_mutex);
@@@ -2318,7 -2825,7 +2826,7 @@@ static int nvme_setup_streams_ns(struc
        if (!ctrl->nr_streams)
                return 0;
  
-       ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
+       ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
        if (ret)
                return ret;
  
@@@ -2342,33 -2849,27 +2850,27 @@@ static void nvme_alloc_ns(struct nvme_c
        struct gendisk *disk;
        struct nvme_id_ns *id;
        char disk_name[DISK_NAME_LEN];
-       int node = dev_to_node(ctrl->dev);
+       int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+       bool new = true;
  
        ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
        if (!ns)
                return;
  
-       ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
-       if (ns->instance < 0)
-               goto out_free_ns;
        ns->queue = blk_mq_init_queue(ctrl->tagset);
        if (IS_ERR(ns->queue))
-               goto out_release_instance;
+               goto out_free_ns;
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
        ns->queue->queuedata = ns;
        ns->ctrl = ctrl;
  
        kref_init(&ns->kref);
-       ns->ns_id = nsid;
        ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
  
        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
        nvme_set_queue_limits(ctrl, ns->queue);
        nvme_setup_streams_ns(ctrl, ns);
  
-       sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
        id = nvme_identify_ns(ctrl, nsid);
        if (!id)
                goto out_free_queue;
        if (id->ncap == 0)
                goto out_free_id;
  
-       nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);
+       if (nvme_init_ns_head(ns, nsid, id, &new))
+               goto out_free_id;
+       
+ #ifdef CONFIG_NVME_MULTIPATH
+       /*
+        * If multipathing is enabled we need to always use the subsystem
+        * instance number for numbering our devices to avoid conflicts
+        * between subsystems that have multiple controllers and thus use
+        * the multipath-aware subsystem node and those that have a single
+        * controller and use the controller node directly.
+        */
+       if (ns->head->disk) {
+               sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
+                               ctrl->cntlid, ns->head->instance);
+               flags = GENHD_FL_HIDDEN;
+       } else {
+               sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
+                               ns->head->instance);
+       }
+ #else
+       /*
+        * But without the multipath code enabled, multiple controller per
+        * subsystems are visible as devices and thus we cannot use the
+        * subsystem instance.
+        */
+       sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
+ #endif
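For illustration of the format strings above (the concrete numbers are hypothetical): with CONFIG_NVME_MULTIPATH, a namespace shared through subsystem instance 0 is exposed once as the multipath node nvme0n1, while each controller's private path becomes a hidden gendisk named nvme0c<cntlid>n1; without multipath the traditional nvme<ctrl instance>n<head instance> names are kept.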
  
        if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
                if (nvme_nvm_register(ns, disk_name, node)) {
                        dev_warn(ctrl->device, "LightNVM init failure\n");
-                       goto out_free_id;
+                       goto out_unlink_ns;
                }
        }
  
        disk = alloc_disk_node(0, node);
        if (!disk)
-               goto out_free_id;
+               goto out_unlink_ns;
  
        disk->fops = &nvme_fops;
        disk->private_data = ns;
        disk->queue = ns->queue;
-       disk->flags = GENHD_FL_EXT_DEVT;
+       disk->flags = flags;
        memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
        ns->disk = disk;
  
        list_add_tail(&ns->list, &ctrl->namespaces);
        mutex_unlock(&ctrl->namespaces_mutex);
  
-       kref_get(&ctrl->kref);
+       nvme_get_ctrl(ctrl);
  
        kfree(id);
  
        device_add_disk(ctrl->device, ns->disk);
        if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
-                                       &nvme_ns_attr_group))
+                                       &nvme_ns_id_attr_group))
                pr_warn("%s: failed to create sysfs group for identification\n",
                        ns->disk->disk_name);
        if (ns->ndev && nvme_nvm_register_sysfs(ns))
                pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
                        ns->disk->disk_name);
+       if (new)
+               nvme_mpath_add_disk(ns->head);
+       nvme_mpath_add_disk_links(ns);
        return;
+  out_unlink_ns:
+       mutex_lock(&ctrl->subsys->lock);
+       list_del_rcu(&ns->siblings);
+       mutex_unlock(&ctrl->subsys->lock);
   out_free_id:
        kfree(id);
   out_free_queue:
        blk_cleanup_queue(ns->queue);
-  out_release_instance:
-       ida_simple_remove(&ctrl->ns_ida, ns->instance);
   out_free_ns:
        kfree(ns);
  }
  
  static void nvme_ns_remove(struct nvme_ns *ns)
  {
+       struct nvme_ns_head *head = ns->head;
        if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
                return;
  
        if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
                if (blk_get_integrity(ns->disk))
                        blk_integrity_unregister(ns->disk);
+               nvme_mpath_remove_disk_links(ns);
                sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
-                                       &nvme_ns_attr_group);
+                                       &nvme_ns_id_attr_group);
                if (ns->ndev)
                        nvme_nvm_unregister_sysfs(ns);
                del_gendisk(ns->disk);
                blk_cleanup_queue(ns->queue);
        }
  
+       mutex_lock(&ns->ctrl->subsys->lock);
+       nvme_mpath_clear_current_path(ns);
+       if (head)
+               list_del_rcu(&ns->siblings);
+       mutex_unlock(&ns->ctrl->subsys->lock);
        mutex_lock(&ns->ctrl->namespaces_mutex);
        list_del_init(&ns->list);
        mutex_unlock(&ns->ctrl->namespaces_mutex);
  
+       synchronize_srcu(&head->srcu);
        nvme_put_ns(ns);
  }
  
@@@ -2467,7 -3010,7 +3011,7 @@@ static void nvme_remove_invalid_namespa
        struct nvme_ns *ns, *next;
  
        list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
-               if (ns->ns_id > nsid)
+               if (ns->head->ns_id > nsid)
                        nvme_ns_remove(ns);
        }
  }
@@@ -2583,20 -3126,29 +3127,29 @@@ void nvme_remove_namespaces(struct nvme
  }
  EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
  
+ static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
+ {
+       char *envp[2] = { NULL, NULL };
+       u32 aen_result = ctrl->aen_result;
+       ctrl->aen_result = 0;
+       if (!aen_result)
+               return;
+       envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
+       if (!envp[0])
+               return;
+       kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
+       kfree(envp[0]);
+ }
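nvme_aen_uevent() above hands AEN results to userspace as a KOBJ_CHANGE uevent on the controller device (class "nvme") carrying an NVME_AEN=<result> property. A minimal listener sketch using libudev (an external library, link with -ludev); only the "nvme" subsystem name and the NVME_AEN key come from the code above, the rest is generic monitor boilerplate:

    #include <stdio.h>
    #include <poll.h>
    #include <libudev.h>

    int main(void)
    {
            struct udev *udev = udev_new();
            struct udev_monitor *mon;
            struct pollfd pfd;

            if (!udev)
                    return 1;
            mon = udev_monitor_new_from_netlink(udev, "udev");
            if (!mon)
                    return 1;

            /* only watch events coming from nvme class devices */
            udev_monitor_filter_add_match_subsystem_devtype(mon, "nvme", NULL);
            udev_monitor_enable_receiving(mon);
            pfd.fd = udev_monitor_get_fd(mon);
            pfd.events = POLLIN;

            while (poll(&pfd, 1, -1) > 0) {
                    struct udev_device *dev = udev_monitor_receive_device(mon);
                    const char *aen;

                    if (!dev)
                            continue;
                    aen = udev_device_get_property_value(dev, "NVME_AEN");
                    if (aen)
                            printf("%s: NVME_AEN=%s\n",
                                   udev_device_get_sysname(dev), aen);
                    udev_device_unref(dev);
            }

            udev_monitor_unref(mon);
            udev_unref(udev);
            return 0;
    }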
  static void nvme_async_event_work(struct work_struct *work)
  {
        struct nvme_ctrl *ctrl =
                container_of(work, struct nvme_ctrl, async_event_work);
  
-       spin_lock_irq(&ctrl->lock);
-       while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
-               int aer_idx = --ctrl->event_limit;
-               spin_unlock_irq(&ctrl->lock);
-               ctrl->ops->submit_async_event(ctrl, aer_idx);
-               spin_lock_irq(&ctrl->lock);
-       }
-       spin_unlock_irq(&ctrl->lock);
+       nvme_aen_uevent(ctrl);
+       ctrl->ops->submit_async_event(ctrl);
  }
  
  static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
  
  static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
  {
-       struct nvme_command c = { };
        struct nvme_fw_slot_info_log *log;
  
        log = kmalloc(sizeof(*log), GFP_KERNEL);
        if (!log)
                return;
  
-       c.common.opcode = nvme_admin_get_log_page;
-       c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
-       c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
-       if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
+       if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
                dev_warn(ctrl->device,
                                "Get FW SLOT INFO log error\n");
        kfree(log);
@@@ -2660,7 -3207,7 +3208,7 @@@ static void nvme_fw_act_work(struct wor
                return;
  
        nvme_start_queues(ctrl);
-       /* read FW slot informationi to clear the AER*/
+       /* read FW slot information to clear the AER */
        nvme_get_fw_slot_info(ctrl);
  }
  
@@@ -2668,24 -3215,21 +3216,21 @@@ void nvme_complete_async_event(struct n
                union nvme_result *res)
  {
        u32 result = le32_to_cpu(res->u32);
-       bool done = true;
  
-       switch (le16_to_cpu(status) >> 1) {
-       case NVME_SC_SUCCESS:
-               done = false;
-               /*FALLTHRU*/
-       case NVME_SC_ABORT_REQ:
-               ++ctrl->event_limit;
-               if (ctrl->state == NVME_CTRL_LIVE)
-                       queue_work(nvme_wq, &ctrl->async_event_work);
+       if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
+               return;
+       switch (result & 0x7) {
+       case NVME_AER_ERROR:
+       case NVME_AER_SMART:
+       case NVME_AER_CSS:
+       case NVME_AER_VS:
+               ctrl->aen_result = result;
                break;
        default:
                break;
        }
  
-       if (done)
-               return;
        switch (result & 0xff07) {
        case NVME_AER_NOTICE_NS_CHANGED:
                dev_info(ctrl->device, "rescanning\n");
        default:
                dev_warn(ctrl->device, "async event result %08x\n", result);
        }
- }
- EXPORT_SYMBOL_GPL(nvme_complete_async_event);
- void nvme_queue_async_events(struct nvme_ctrl *ctrl)
- {
-       ctrl->event_limit = NVME_NR_AERS;
        queue_work(nvme_wq, &ctrl->async_event_work);
  }
- EXPORT_SYMBOL_GPL(nvme_queue_async_events);
- static DEFINE_IDA(nvme_instance_ida);
- static int nvme_set_instance(struct nvme_ctrl *ctrl)
- {
-       int instance, error;
-       do {
-               if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
-                       return -ENODEV;
-               spin_lock(&dev_list_lock);
-               error = ida_get_new(&nvme_instance_ida, &instance);
-               spin_unlock(&dev_list_lock);
-       } while (error == -EAGAIN);
-       if (error)
-               return -ENODEV;
-       ctrl->instance = instance;
-       return 0;
- }
- static void nvme_release_instance(struct nvme_ctrl *ctrl)
- {
-       spin_lock(&dev_list_lock);
-       ida_remove(&nvme_instance_ida, ctrl->instance);
-       spin_unlock(&dev_list_lock);
- }
+ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
  
  void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
  {
@@@ -2752,7 -3261,7 +3262,7 @@@ void nvme_start_ctrl(struct nvme_ctrl *
  
        if (ctrl->queue_count > 1) {
                nvme_queue_scan(ctrl);
-               nvme_queue_async_events(ctrl);
+               queue_work(nvme_wq, &ctrl->async_event_work);
                nvme_start_queues(ctrl);
        }
  }
@@@ -2760,30 -3269,31 +3270,31 @@@ EXPORT_SYMBOL_GPL(nvme_start_ctrl)
  
  void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
  {
-       device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
-       spin_lock(&dev_list_lock);
-       list_del(&ctrl->node);
-       spin_unlock(&dev_list_lock);
+       cdev_device_del(&ctrl->cdev, ctrl->device);
  }
  EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
  
- static void nvme_free_ctrl(struct kref *kref)
+ static void nvme_free_ctrl(struct device *dev)
  {
-       struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
+       struct nvme_ctrl *ctrl =
+               container_of(dev, struct nvme_ctrl, ctrl_device);
+       struct nvme_subsystem *subsys = ctrl->subsys;
  
-       put_device(ctrl->device);
-       nvme_release_instance(ctrl);
-       ida_destroy(&ctrl->ns_ida);
+       ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+       kfree(ctrl->effects);
+       if (subsys) {
+               mutex_lock(&subsys->lock);
+               list_del(&ctrl->subsys_entry);
+               mutex_unlock(&subsys->lock);
+               sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+       }
  
        ctrl->ops->free_ctrl(ctrl);
- }
  
- void nvme_put_ctrl(struct nvme_ctrl *ctrl)
- {
-       kref_put(&ctrl->kref, nvme_free_ctrl);
+       if (subsys)
+               nvme_put_subsystem(subsys);
  }
- EXPORT_SYMBOL_GPL(nvme_put_ctrl);
  
  /*
   * Initialize a NVMe controller structures.  This needs to be called during
@@@ -2799,32 -3309,36 +3310,36 @@@ int nvme_init_ctrl(struct nvme_ctrl *ct
        spin_lock_init(&ctrl->lock);
        INIT_LIST_HEAD(&ctrl->namespaces);
        mutex_init(&ctrl->namespaces_mutex);
-       kref_init(&ctrl->kref);
        ctrl->dev = dev;
        ctrl->ops = ops;
        ctrl->quirks = quirks;
        INIT_WORK(&ctrl->scan_work, nvme_scan_work);
        INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
        INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
+       INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
  
-       ret = nvme_set_instance(ctrl);
-       if (ret)
+       ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
+       if (ret < 0)
                goto out;
-       ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
-                               MKDEV(nvme_char_major, ctrl->instance),
-                               ctrl, nvme_dev_attr_groups,
-                               "nvme%d", ctrl->instance);
-       if (IS_ERR(ctrl->device)) {
-               ret = PTR_ERR(ctrl->device);
+       ctrl->instance = ret;
+       device_initialize(&ctrl->ctrl_device);
+       ctrl->device = &ctrl->ctrl_device;
+       ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
+       ctrl->device->class = nvme_class;
+       ctrl->device->parent = ctrl->dev;
+       ctrl->device->groups = nvme_dev_attr_groups;
+       ctrl->device->release = nvme_free_ctrl;
+       dev_set_drvdata(ctrl->device, ctrl);
+       ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
+       if (ret)
                goto out_release_instance;
-       }
-       get_device(ctrl->device);
-       ida_init(&ctrl->ns_ida);
  
-       spin_lock(&dev_list_lock);
-       list_add_tail(&ctrl->node, &nvme_ctrl_list);
-       spin_unlock(&dev_list_lock);
+       cdev_init(&ctrl->cdev, &nvme_dev_fops);
+       ctrl->cdev.owner = ops->module;
+       ret = cdev_device_add(&ctrl->cdev, ctrl->device);
+       if (ret)
+               goto out_free_name;
  
        /*
         * Initialize latency tolerance controls.  The sysfs files won't
                min(default_ps_max_latency_us, (unsigned long)S32_MAX));
  
        return 0;
+ out_free_name:
+       kfree_const(dev->kobj.name);
  out_release_instance:
-       nvme_release_instance(ctrl);
+       ida_simple_remove(&nvme_instance_ida, ctrl->instance);
  out:
        return ret;
  }
@@@ -2945,6 -3461,16 +3462,16 @@@ void nvme_start_queues(struct nvme_ctr
  }
  EXPORT_SYMBOL_GPL(nvme_start_queues);
  
+ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
+ {
+       if (!ctrl->ops->reinit_request)
+               return 0;
+       return blk_mq_tagset_iter(set, set->driver_data,
+                       ctrl->ops->reinit_request);
+ }
+ EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
+ 
  int __init nvme_core_init(void)
  {
        int result;
        if (!nvme_wq)
                return -ENOMEM;
  
-       result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
-                                                       &nvme_dev_fops);
+       result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
        if (result < 0)
                goto destroy_wq;
-       else if (result > 0)
-               nvme_char_major = result;
  
        nvme_class = class_create(THIS_MODULE, "nvme");
        if (IS_ERR(nvme_class)) {
                goto unregister_chrdev;
        }
  
+       nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
+       if (IS_ERR(nvme_subsys_class)) {
+               result = PTR_ERR(nvme_subsys_class);
+               goto destroy_class;
+       }
        return 0;
  
+ destroy_class:
+       class_destroy(nvme_class);
  unregister_chrdev:
-       __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+       unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
  destroy_wq:
        destroy_workqueue(nvme_wq);
        return result;
  
  void nvme_core_exit(void)
  {
+       ida_destroy(&nvme_subsystems_ida);
+       class_destroy(nvme_subsys_class);
        class_destroy(nvme_class);
-       __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+       unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
        destroy_workqueue(nvme_wq);
  }
  
diff --combined drivers/nvme/host/fc.c
index be49d0f793816cae0a9629665230acf248ab85ec,6eb460b117d6faf5d753319782a29a4f266f0599..7ab0be55c7d063b31f1a9525a2961308b0d8a274
  /* *************************** Data Structures/Defines ****************** */
  
  
- /*
-  * We handle AEN commands ourselves and don't even let the
-  * block layer know about them.
-  */
- #define NVME_FC_NR_AEN_COMMANDS       1
- #define NVME_FC_AQ_BLKMQ_DEPTH        \
-       (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
- #define AEN_CMDID_BASE                (NVME_FC_AQ_BLKMQ_DEPTH + 1)
  enum nvme_fc_queue_flags {
        NVME_FC_Q_CONNECTED = (1 << 0),
  };
  
  #define NVMEFC_QUEUE_DELAY    3               /* ms units */
  
+ #define NVME_FC_DEFAULT_DEV_LOSS_TMO  60      /* seconds */
+ 
  struct nvme_fc_queue {
        struct nvme_fc_ctrl     *ctrl;
        struct device           *dev;
        struct blk_mq_hw_ctx    *hctx;
        void                    *lldd_handle;
-       int                     queue_size;
        size_t                  cmnd_capsule_len;
        u32                     qnum;
        u32                     rqcnt;
@@@ -124,6 -116,7 +116,7 @@@ struct nvme_fc_lport 
        struct device                   *dev;   /* physical device for dma */
        struct nvme_fc_port_template    *ops;
        struct kref                     ref;
+       atomic_t                        act_rport_cnt;
  } __aligned(sizeof(u64));     /* alignment for other things alloc'd with */
  
  struct nvme_fc_rport {
        struct nvme_fc_lport            *lport;
        spinlock_t                      lock;
        struct kref                     ref;
+       atomic_t                        act_ctrl_cnt;
+       unsigned long                   dev_loss_end;
  } __aligned(sizeof(u64));     /* alignment for other things alloc'd with */
  
  enum nvme_fcctrl_flags {
@@@ -150,6 -145,7 +145,7 @@@ struct nvme_fc_ctrl 
        struct nvme_fc_rport    *rport;
        u32                     cnum;
  
+       bool                    assoc_active;
        u64                     association_id;
  
        struct list_head        ctrl_list;      /* rport->ctrl_list */
        struct blk_mq_tag_set   admin_tag_set;
        struct blk_mq_tag_set   tag_set;
  
-       struct work_struct      delete_work;
        struct delayed_work     connect_work;
  
        struct kref             ref;
        u32                     iocnt;
        wait_queue_head_t       ioabort_wait;
  
-       struct nvme_fc_fcp_op   aen_ops[NVME_FC_NR_AEN_COMMANDS];
+       struct nvme_fc_fcp_op   aen_ops[NVME_NR_AEN_COMMANDS];
  
        struct nvme_ctrl        ctrl;
  };
@@@ -213,10 -208,16 +208,16 @@@ static DEFINE_IDA(nvme_fc_ctrl_cnt)
  
  
  
+ /*
+  * These items are short-term. They will eventually be moved into
+  * a generic FC class. See comments in module init.
+  */
+ static struct class *fc_class;
+ static struct device *fc_udev_device;
  
  /* *********************** FC-NVME Port Management ************************ */
  
- static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *);
  static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
                        struct nvme_fc_queue *, unsigned int);
  
@@@ -235,9 -236,6 +236,6 @@@ nvme_fc_free_lport(struct kref *ref
        list_del(&lport->port_list);
        spin_unlock_irqrestore(&nvme_fc_lock, flags);
  
-       /* let the LLDD know we've finished tearing it down */
-       lport->ops->localport_delete(&lport->localport);
        ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
        ida_destroy(&lport->endp_cnt);
  
@@@ -260,7 -258,9 +258,9 @@@ nvme_fc_lport_get(struct nvme_fc_lport 
  
  
  static struct nvme_fc_lport *
- nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo)
+ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo,
+                       struct nvme_fc_port_template *ops,
+                       struct device *dev)
  {
        struct nvme_fc_lport *lport;
        unsigned long flags;
                    lport->localport.port_name != pinfo->port_name)
                        continue;
  
+               if (lport->dev != dev) {
+                       lport = ERR_PTR(-EXDEV);
+                       goto out_done;
+               }
                if (lport->localport.port_state != FC_OBJSTATE_DELETED) {
                        lport = ERR_PTR(-EEXIST);
                        goto out_done;
  
                /* resume the lport */
  
+               lport->ops = ops;
                lport->localport.port_role = pinfo->port_role;
                lport->localport.port_id = pinfo->port_id;
                lport->localport.port_state = FC_OBJSTATE_ONLINE;
@@@ -348,7 -354,7 +354,7 @@@ nvme_fc_register_localport(struct nvme_
         * expired, we can simply re-enable the localport. Remoteports
         * and controller reconnections should resume naturally.
         */
-       newrec = nvme_fc_attach_to_unreg_lport(pinfo);
+       newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev);
  
        /* found an lport, but something about its state is bad */
        if (IS_ERR(newrec)) {
        INIT_LIST_HEAD(&newrec->port_list);
        INIT_LIST_HEAD(&newrec->endp_list);
        kref_init(&newrec->ref);
+       atomic_set(&newrec->act_rport_cnt, 0);
        newrec->ops = template;
        newrec->dev = dev;
        ida_init(&newrec->endp_cnt);
@@@ -446,12 -453,177 +453,177 @@@ nvme_fc_unregister_localport(struct nvm
  
        spin_unlock_irqrestore(&nvme_fc_lock, flags);
  
+       if (atomic_read(&lport->act_rport_cnt) == 0)
+               lport->ops->localport_delete(&lport->localport);
        nvme_fc_lport_put(lport);
  
        return 0;
  }
  EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport);
  
+ /*
+  * TRADDR strings, per FC-NVME, are fixed format:
+  *   "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters
+  * The udev event strings differ only by the prefix naming the
+  * field being specified:
+  *    "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters
+  *  19 + 43 + null_fudge = 64 characters
+  */
+ #define FCNVME_TRADDR_LENGTH          64
+ static void
+ nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport,
+               struct nvme_fc_rport *rport)
+ {
+       char hostaddr[FCNVME_TRADDR_LENGTH];    /* NVMEFC_HOST_TRADDR=...*/
+       char tgtaddr[FCNVME_TRADDR_LENGTH];     /* NVMEFC_TRADDR=...*/
+       char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL };
+       if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY))
+               return;
+       snprintf(hostaddr, sizeof(hostaddr),
+               "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx",
+               lport->localport.node_name, lport->localport.port_name);
+       snprintf(tgtaddr, sizeof(tgtaddr),
+               "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx",
+               rport->remoteport.node_name, rport->remoteport.port_name);
+       kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp);
+ }
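For illustration, the uevent environment built above would look roughly like the following (the WWN values are hypothetical):

	/*
	 *   FC_EVENT=nvmediscovery
	 *   NVMEFC_HOST_TRADDR=nn-0x20000090fa945678:pn-0x10000090fa945678
	 *   NVMEFC_TRADDR=nn-0x200000109b1234ff:pn-0x100000109b1234ff
	 *
	 * Each string is at most 19 (prefix) + 43 (fixed-format address)
	 * characters plus the terminating NUL, which is why
	 * FCNVME_TRADDR_LENGTH is rounded up to 64.
	 */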
+ static void
+ nvme_fc_free_rport(struct kref *ref)
+ {
+       struct nvme_fc_rport *rport =
+               container_of(ref, struct nvme_fc_rport, ref);
+       struct nvme_fc_lport *lport =
+                       localport_to_lport(rport->remoteport.localport);
+       unsigned long flags;
+       WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
+       WARN_ON(!list_empty(&rport->ctrl_list));
+       /* remove from lport list */
+       spin_lock_irqsave(&nvme_fc_lock, flags);
+       list_del(&rport->endp_list);
+       spin_unlock_irqrestore(&nvme_fc_lock, flags);
+       ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
+       kfree(rport);
+       nvme_fc_lport_put(lport);
+ }
+ static void
+ nvme_fc_rport_put(struct nvme_fc_rport *rport)
+ {
+       kref_put(&rport->ref, nvme_fc_free_rport);
+ }
+ static int
+ nvme_fc_rport_get(struct nvme_fc_rport *rport)
+ {
+       return kref_get_unless_zero(&rport->ref);
+ }
+ static void
+ nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl)
+ {
+       switch (ctrl->ctrl.state) {
+       case NVME_CTRL_NEW:
+       case NVME_CTRL_RECONNECTING:
+               /*
+                * As all reconnects were suppressed, schedule a
+                * connect.
+                */
+               dev_info(ctrl->ctrl.device,
+                       "NVME-FC{%d}: connectivity re-established. "
+                       "Attempting reconnect\n", ctrl->cnum);
+               queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
+               break;
+       case NVME_CTRL_RESETTING:
+               /*
+                * Controller is already in the process of terminating the
+                * association. No need to do anything further. The reconnect
+                * step will naturally occur after the reset completes.
+                */
+               break;
+       default:
+               /* no action to take - let it delete */
+               break;
+       }
+ }
+ static struct nvme_fc_rport *
+ nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport,
+                               struct nvme_fc_port_info *pinfo)
+ {
+       struct nvme_fc_rport *rport;
+       struct nvme_fc_ctrl *ctrl;
+       unsigned long flags;
+       spin_lock_irqsave(&nvme_fc_lock, flags);
+       list_for_each_entry(rport, &lport->endp_list, endp_list) {
+               if (rport->remoteport.node_name != pinfo->node_name ||
+                   rport->remoteport.port_name != pinfo->port_name)
+                       continue;
+               if (!nvme_fc_rport_get(rport)) {
+                       rport = ERR_PTR(-ENOLCK);
+                       goto out_done;
+               }
+               spin_unlock_irqrestore(&nvme_fc_lock, flags);
+               spin_lock_irqsave(&rport->lock, flags);
+               /* has it been unregistered */
+               if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) {
+                       /* means lldd called us twice */
+                       spin_unlock_irqrestore(&rport->lock, flags);
+                       nvme_fc_rport_put(rport);
+                       return ERR_PTR(-ESTALE);
+               }
+               rport->remoteport.port_state = FC_OBJSTATE_ONLINE;
+               rport->dev_loss_end = 0;
+               /*
+                * kick off a reconnect attempt on all associations to the
+                * remote port. A successful reconnect will resume i/o.
+                */
+               list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
+                       nvme_fc_resume_controller(ctrl);
+               spin_unlock_irqrestore(&rport->lock, flags);
+               return rport;
+       }
+       rport = NULL;
+ out_done:
+       spin_unlock_irqrestore(&nvme_fc_lock, flags);
+       return rport;
+ }
+ static inline void
+ __nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport,
+                       struct nvme_fc_port_info *pinfo)
+ {
+       if (pinfo->dev_loss_tmo)
+               rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo;
+       else
+               rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO;
+ }
  /**
   * nvme_fc_register_remoteport - transport entry point called by an
   *                              LLDD to register the existence of a NVME
@@@ -478,28 -650,52 +650,52 @@@ nvme_fc_register_remoteport(struct nvme
        unsigned long flags;
        int ret, idx;
  
+       if (!nvme_fc_lport_get(lport)) {
+               ret = -ESHUTDOWN;
+               goto out_reghost_failed;
+       }
+       /*
+        * look to see if there is already a remoteport that is waiting
+        * for a reconnect (within dev_loss_tmo) with the same WWNs.
+        * If so, transition to it and reconnect.
+        */
+       newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo);
+       /* found an rport, but something about its state is bad */
+       if (IS_ERR(newrec)) {
+               ret = PTR_ERR(newrec);
+               goto out_lport_put;
+       /* found existing rport, which was resumed */
+       } else if (newrec) {
+               nvme_fc_lport_put(lport);
+               __nvme_fc_set_dev_loss_tmo(newrec, pinfo);
+               nvme_fc_signal_discovery_scan(lport, newrec);
+               *portptr = &newrec->remoteport;
+               return 0;
+       }
+       /* nothing found - allocate a new remoteport struct */
        newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz),
                         GFP_KERNEL);
        if (!newrec) {
                ret = -ENOMEM;
-               goto out_reghost_failed;
-       }
-       if (!nvme_fc_lport_get(lport)) {
-               ret = -ESHUTDOWN;
-               goto out_kfree_rport;
+               goto out_lport_put;
        }
  
        idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL);
        if (idx < 0) {
                ret = -ENOSPC;
-               goto out_lport_put;
+               goto out_kfree_rport;
        }
  
        INIT_LIST_HEAD(&newrec->endp_list);
        INIT_LIST_HEAD(&newrec->ctrl_list);
        INIT_LIST_HEAD(&newrec->ls_req_list);
        kref_init(&newrec->ref);
+       atomic_set(&newrec->act_ctrl_cnt, 0);
        spin_lock_init(&newrec->lock);
        newrec->remoteport.localport = &lport->localport;
        newrec->dev = lport->dev;
        newrec->remoteport.port_id = pinfo->port_id;
        newrec->remoteport.port_state = FC_OBJSTATE_ONLINE;
        newrec->remoteport.port_num = idx;
+       __nvme_fc_set_dev_loss_tmo(newrec, pinfo);
  
        spin_lock_irqsave(&nvme_fc_lock, flags);
        list_add_tail(&newrec->endp_list, &lport->endp_list);
        spin_unlock_irqrestore(&nvme_fc_lock, flags);
  
+       nvme_fc_signal_discovery_scan(lport, newrec);
        *portptr = &newrec->remoteport;
        return 0;
  
- out_lport_put:
-       nvme_fc_lport_put(lport);
  out_kfree_rport:
        kfree(newrec);
+ out_lport_put:
+       nvme_fc_lport_put(lport);
  out_reghost_failed:
        *portptr = NULL;
        return ret;
  }
  EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
  
- static void
- nvme_fc_free_rport(struct kref *ref)
- {
-       struct nvme_fc_rport *rport =
-               container_of(ref, struct nvme_fc_rport, ref);
-       struct nvme_fc_lport *lport =
-                       localport_to_lport(rport->remoteport.localport);
-       unsigned long flags;
-       WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
-       WARN_ON(!list_empty(&rport->ctrl_list));
-       /* remove from lport list */
-       spin_lock_irqsave(&nvme_fc_lock, flags);
-       list_del(&rport->endp_list);
-       spin_unlock_irqrestore(&nvme_fc_lock, flags);
-       /* let the LLDD know we've finished tearing it down */
-       lport->ops->remoteport_delete(&rport->remoteport);
-       ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
-       kfree(rport);
-       nvme_fc_lport_put(lport);
- }
- static void
- nvme_fc_rport_put(struct nvme_fc_rport *rport)
- {
-       kref_put(&rport->ref, nvme_fc_free_rport);
- }
- static int
- nvme_fc_rport_get(struct nvme_fc_rport *rport)
- {
-       return kref_get_unless_zero(&rport->ref);
- }
  static int
  nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
  {
@@@ -592,6 -752,58 +752,58 @@@ restart
        return 0;
  }
  
+ static void
+ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
+ {
+       dev_info(ctrl->ctrl.device,
+               "NVME-FC{%d}: controller connectivity lost. Awaiting "
+               "Reconnect", ctrl->cnum);
+       switch (ctrl->ctrl.state) {
+       case NVME_CTRL_NEW:
+       case NVME_CTRL_LIVE:
+               /*
+                * Schedule a controller reset. The reset will terminate the
+                * association and schedule the reconnect timer.  Reconnects
+                * will be attempted until either the ctlr_loss_tmo
+                * (max_retries * connect_delay) expires or the remoteport's
+                * dev_loss_tmo expires.
+                */
+               if (nvme_reset_ctrl(&ctrl->ctrl)) {
+                       dev_warn(ctrl->ctrl.device,
+                               "NVME-FC{%d}: Couldn't schedule reset. "
+                               "Deleting controller.\n",
+                               ctrl->cnum);
+                       nvme_delete_ctrl(&ctrl->ctrl);
+               }
+               break;
+       case NVME_CTRL_RECONNECTING:
+               /*
+                * The association has already been terminated and the
+                * controller is attempting reconnects.  No need to do anything
+                * further.  Reconnects will be attempted until either the
+                * ctlr_loss_tmo (max_retries * connect_delay) expires or the
+                * remoteport's dev_loss_tmo expires.
+                */
+               break;
+       case NVME_CTRL_RESETTING:
+               /*
+                * Controller is already in the process of terminating the
+                * association.  No need to do anything further. The reconnect
+                * step will kick in naturally after the association is
+                * terminated.
+                */
+               break;
+       case NVME_CTRL_DELETING:
+       default:
+               /* no action to take - let it delete */
+               break;
+       }
+ }
  /**
   * nvme_fc_unregister_remoteport - transport entry point called by an
   *                              LLDD to deregister/remove a previously
@@@ -621,19 -833,78 +833,78 @@@ nvme_fc_unregister_remoteport(struct nv
        }
        portptr->port_state = FC_OBJSTATE_DELETED;
  
-       /* tear down all associations to the remote port */
-       list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
-               __nvme_fc_del_ctrl(ctrl);
+       rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ);
+       list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+               /* if dev_loss_tmo==0, dev loss is immediate */
+               if (!portptr->dev_loss_tmo) {
+                       dev_warn(ctrl->ctrl.device,
+                               "NVME-FC{%d}: controller connectivity lost. "
+                               "Deleting controller.\n",
+                               ctrl->cnum);
+                       nvme_delete_ctrl(&ctrl->ctrl);
+               } else
+                       nvme_fc_ctrl_connectivity_loss(ctrl);
+       }
  
        spin_unlock_irqrestore(&rport->lock, flags);
  
        nvme_fc_abort_lsops(rport);
  
+       if (atomic_read(&rport->act_ctrl_cnt) == 0)
+               rport->lport->ops->remoteport_delete(portptr);
+       /*
+        * Release the reference. Once all controllers go away, which
+        * should only occur after dev_loss_tmo expires, the rport can
+        * be torn down.
+        */
        nvme_fc_rport_put(rport);
        return 0;
  }
  EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport);
  
+ /**
+  * nvme_fc_rescan_remoteport - transport entry point called by an
+  *                              LLDD to request an nvme device rescan.
+  * @remoteport: pointer to the (registered) remote port that is to be
+  *              rescanned.
+  *
+  * Returns: N/A
+  */
+ void
+ nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport)
+ {
+       struct nvme_fc_rport *rport = remoteport_to_rport(remoteport);
+       nvme_fc_signal_discovery_scan(rport->lport, rport);
+ }
+ EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport);
+ int
+ nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr,
+                       u32 dev_loss_tmo)
+ {
+       struct nvme_fc_rport *rport = remoteport_to_rport(portptr);
+       unsigned long flags;
+       spin_lock_irqsave(&rport->lock, flags);
+       if (portptr->port_state != FC_OBJSTATE_ONLINE) {
+               spin_unlock_irqrestore(&rport->lock, flags);
+               return -EINVAL;
+       }
+       /* a dev_loss_tmo of 0 (immediate) is allowed to be set */
+       rport->remoteport.dev_loss_tmo = dev_loss_tmo;
+       spin_unlock_irqrestore(&rport->lock, flags);
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss);
  
  /* *********************** FC-NVME DMA Handling **************************** */
  
@@@ -723,7 -994,6 +994,6 @@@ fc_dma_unmap_sg(struct device *dev, str
                dma_unmap_sg(dev, sg, nents, dir);
  }
  
  /* *********************** FC-NVME LS Handling **************************** */
  
  static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
@@@ -1266,7 -1536,7 +1536,7 @@@ nvme_fc_abort_aen_ops(struct nvme_fc_ct
        unsigned long flags;
        int i, ret;
  
-       for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+       for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
                if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE)
                        continue;
  
@@@ -1331,7 -1601,7 +1601,7 @@@ nvme_fc_fcpio_done(struct nvmefc_fcp_re
        struct nvme_command *sqe = &op->cmd_iu.sqe;
        __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
        union nvme_result result;
-       bool complete_rq, terminate_assoc = true;
+       bool terminate_assoc = true;
  
        /*
         * WARNING:
        fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
                                sizeof(op->rsp_iu), DMA_FROM_DEVICE);
  
-       if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
-               status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
+       if (atomic_read(&op->state) == FCPOP_STATE_ABORTED ||
+                       op->flags & FCOP_FLAGS_TERMIO)
+               status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
        else if (freq->status)
                status = cpu_to_le16(NVME_SC_INTERNAL << 1);
  
  done:
        if (op->flags & FCOP_FLAGS_AEN) {
                nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
-               complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
+               __nvme_fc_fcpop_chk_teardowns(ctrl, op);
                atomic_set(&op->state, FCPOP_STATE_IDLE);
                op->flags = FCOP_FLAGS_AEN;     /* clear other flags */
                nvme_fc_ctrl_put(ctrl);
                goto check_error;
        }
  
-       complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
-       if (!complete_rq) {
-               if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
-                       status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
-                       if (blk_queue_dying(rq->q))
-                               status |= cpu_to_le16(NVME_SC_DNR << 1);
-               }
-               nvme_end_request(rq, status, result);
-       } else
+       /*
+        * Force failures of commands if we're killing the controller
+        * or have an error on a command used to create a new association.
+        */
+       if (status &&
+           (blk_queue_dying(rq->q) ||
+            ctrl->ctrl.state == NVME_CTRL_NEW ||
+            ctrl->ctrl.state == NVME_CTRL_RECONNECTING))
+               status |= cpu_to_le16(NVME_SC_DNR << 1);
+       if (__nvme_fc_fcpop_chk_teardowns(ctrl, op))
                __nvme_fc_final_op_cleanup(rq);
+       else
+               nvme_end_request(rq, status, result);
  
  check_error:
        if (terminate_assoc)
@@@ -1531,7 -1806,7 +1806,7 @@@ nvme_fc_init_aen_ops(struct nvme_fc_ctr
        int i, ret;
  
        aen_op = ctrl->aen_ops;
-       for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+       for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
                private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
                                                GFP_KERNEL);
                if (!private)
                sqe = &cmdiu->sqe;
                ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
                                aen_op, (struct request *)NULL,
-                               (AEN_CMDID_BASE + i));
+                               (NVME_AQ_BLK_MQ_DEPTH + i));
                if (ret) {
                        kfree(private);
                        return ret;
                memset(sqe, 0, sizeof(*sqe));
                sqe->common.opcode = nvme_admin_async_event;
                /* Note: core layer may overwrite the sqe.command_id value */
-               sqe->common.command_id = AEN_CMDID_BASE + i;
+               sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i;
        }
        return 0;
  }
@@@ -1566,7 -1841,7 +1841,7 @@@ nvme_fc_term_aen_ops(struct nvme_fc_ctr
        int i;
  
        aen_op = ctrl->aen_ops;
-       for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+       for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
                if (!aen_op->fcp_req.private)
                        continue;
  
@@@ -1610,7 -1885,7 +1885,7 @@@ nvme_fc_init_admin_hctx(struct blk_mq_h
  }
  
  static void
- nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size)
+ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx)
  {
        struct nvme_fc_queue *queue;
  
        else
                queue->cmnd_capsule_len = sizeof(struct nvme_command);
  
-       queue->queue_size = queue_size;
        /*
         * Considered whether we should allocate buffers for all SQEs
         * and CQEs and dma map them - mapping their respective entries
@@@ -1751,7 -2024,7 +2024,7 @@@ nvme_fc_init_io_queues(struct nvme_fc_c
        int i;
  
        for (i = 1; i < ctrl->ctrl.queue_count; i++)
-               nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize);
+               nvme_fc_init_queue(ctrl, i);
  }
  
  static void
@@@ -1825,13 -2098,6 +2098,6 @@@ nvme_fc_error_recovery(struct nvme_fc_c
        dev_warn(ctrl->ctrl.device,
                "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
  
-       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
-               dev_err(ctrl->ctrl.device,
-                       "NVME-FC{%d}: error_recovery: Couldn't change state "
-                       "to RECONNECTING\n", ctrl->cnum);
-               return;
-       }
        nvme_reset_ctrl(&ctrl->ctrl);
  }
  
@@@ -1842,13 -2108,14 +2108,14 @@@ nvme_fc_timeout(struct request *rq, boo
        struct nvme_fc_ctrl *ctrl = op->ctrl;
        int ret;
  
-       if (reserved)
+       if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
+                       atomic_read(&op->state) == FCPOP_STATE_ABORTED)
                return BLK_EH_RESET_TIMER;
  
        ret = __nvme_fc_abort_op(ctrl, op);
        if (ret)
-               /* io wasn't active to abort consider it done */
-               return BLK_EH_HANDLED;
+               /* io wasn't active to abort */
+               return BLK_EH_NOT_HANDLED;
  
        /*
         * we can't individually ABTS an io without affecting the queue,
         */
        nvme_fc_error_recovery(ctrl, "io timeout error");
  
-       return BLK_EH_HANDLED;
+       /*
+        * The io abort has been initiated. Restart the reset timer;
+        * the abort completion will complete the io shortly. This
+        * avoids a synchronous wait while the abort finishes.
+        */
+       return BLK_EH_RESET_TIMER;
  }
  
  static int
@@@ -2110,7 -2382,7 +2382,7 @@@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx
  }
  
  static void
- nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+ nvme_fc_submit_async_event(struct nvme_ctrl *arg)
  {
        struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
        struct nvme_fc_fcp_op *aen_op;
        bool terminating = false;
        blk_status_t ret;
  
-       if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
-               return;
        spin_lock_irqsave(&ctrl->lock, flags);
        if (ctrl->flags & FCCTRL_TERMIO)
                terminating = true;
        if (terminating)
                return;
  
-       aen_op = &ctrl->aen_ops[aer_idx];
+       aen_op = &ctrl->aen_ops[0];
  
        ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
                                        NVMEFC_FCP_NODATA);
        if (ret)
                dev_err(ctrl->ctrl.device,
-                       "failed async event work [%d]\n", aer_idx);
+                       "failed async event work\n");
  }
  
  static void
@@@ -2337,7 -2606,7 +2606,7 @@@ nvme_fc_reinit_io_queues(struct nvme_fc
  
        nvme_fc_init_io_queues(ctrl);
  
-       ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request);
+       ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
        if (ret)
                goto out_free_io_queues;
  
@@@ -2360,6 -2629,61 +2629,61 @@@ out_free_io_queues
        return ret;
  }
  
+ static void
+ nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport)
+ {
+       struct nvme_fc_lport *lport = rport->lport;
+       atomic_inc(&lport->act_rport_cnt);
+ }
+ static void
+ nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport)
+ {
+       struct nvme_fc_lport *lport = rport->lport;
+       u32 cnt;
+       cnt = atomic_dec_return(&lport->act_rport_cnt);
+       if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED)
+               lport->ops->localport_delete(&lport->localport);
+ }
+ static int
+ nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl)
+ {
+       struct nvme_fc_rport *rport = ctrl->rport;
+       u32 cnt;
+       if (ctrl->assoc_active)
+               return 1;
+       ctrl->assoc_active = true;
+       cnt = atomic_inc_return(&rport->act_ctrl_cnt);
+       if (cnt == 1)
+               nvme_fc_rport_active_on_lport(rport);
+       return 0;
+ }
+ static int
+ nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl)
+ {
+       struct nvme_fc_rport *rport = ctrl->rport;
+       struct nvme_fc_lport *lport = rport->lport;
+       u32 cnt;
+       /* ctrl->assoc_active=false will be set independently */
+       cnt = atomic_dec_return(&rport->act_ctrl_cnt);
+       if (cnt == 0) {
+               if (rport->remoteport.port_state == FC_OBJSTATE_DELETED)
+                       lport->ops->remoteport_delete(&rport->remoteport);
+               nvme_fc_rport_inactive_on_lport(rport);
+       }
+       return 0;
+ }
  /*
   * This routine restarts the controller on the host side, and
   * on the link side, recreates the controller association.
@@@ -2368,26 -2692,31 +2692,31 @@@ static in
  nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
  {
        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
-       u32 segs;
        int ret;
        bool changed;
  
        ++ctrl->ctrl.nr_reconnects;
  
+       if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+               return -ENODEV;
+       if (nvme_fc_ctlr_active_on_rport(ctrl))
+               return -ENOTUNIQ;
        /*
         * Create the admin queue
         */
  
-       nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+       nvme_fc_init_queue(ctrl, 0);
  
        ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
-                               NVME_FC_AQ_BLKMQ_DEPTH);
+                               NVME_AQ_BLK_MQ_DEPTH);
        if (ret)
                goto out_free_queue;
  
        ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
-                               NVME_FC_AQ_BLKMQ_DEPTH,
-                               (NVME_FC_AQ_BLKMQ_DEPTH / 4));
+                               NVME_AQ_BLK_MQ_DEPTH,
+                               (NVME_AQ_BLK_MQ_DEPTH / 4));
        if (ret)
                goto out_delete_hw_queue;
  
        if (ret)
                goto out_disconnect_admin_queue;
  
-       segs = min_t(u32, NVME_FC_MAX_SEGMENTS,
-                       ctrl->lport->ops->max_sgl_segments);
-       ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
+       ctrl->ctrl.max_hw_sectors =
+               (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
  
        ret = nvme_init_identify(&ctrl->ctrl);
        if (ret)
        }
  
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       WARN_ON_ONCE(!changed);
  
        ctrl->ctrl.nr_reconnects = 0;
  
-       nvme_start_ctrl(&ctrl->ctrl);
+       if (changed)
+               nvme_start_ctrl(&ctrl->ctrl);
  
        return 0;       /* Success */
  
@@@ -2482,6 -2810,8 +2810,8 @@@ out_delete_hw_queue
        __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
  out_free_queue:
        nvme_fc_free_queue(&ctrl->queues[0]);
+       ctrl->assoc_active = false;
+       nvme_fc_ctlr_inactive_on_rport(ctrl);
  
        return ret;
  }
@@@ -2497,6 -2827,10 +2827,10 @@@ nvme_fc_delete_association(struct nvme_
  {
        unsigned long flags;
  
+       if (!ctrl->assoc_active)
+               return;
+       ctrl->assoc_active = false;
        spin_lock_irqsave(&ctrl->lock, flags);
        ctrl->flags |= FCCTRL_TERMIO;
        ctrl->iocnt = 0;
         * use blk_mq_tagset_busy_iter() and the transport routine to
         * terminate the exchanges.
         */
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       if (ctrl->ctrl.state != NVME_CTRL_NEW)
+               blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_fc_terminate_exchange, &ctrl->ctrl);
  
        nvme_fc_abort_aen_ops(ctrl);
  
        /* wait for all io that had to be aborted */
 -      spin_lock_irqsave(&ctrl->lock, flags);
 +      spin_lock_irq(&ctrl->lock);
        wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
        ctrl->flags &= ~FCCTRL_TERMIO;
 -      spin_unlock_irqrestore(&ctrl->lock, flags);
 +      spin_unlock_irq(&ctrl->lock);
  
        nvme_fc_term_aen_ops(ctrl);
  
  
        __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
        nvme_fc_free_queue(&ctrl->queues[0]);
+       nvme_fc_ctlr_inactive_on_rport(ctrl);
  }
  
  static void
- nvme_fc_delete_ctrl_work(struct work_struct *work)
+ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
  {
-       struct nvme_fc_ctrl *ctrl =
-               container_of(work, struct nvme_fc_ctrl, delete_work);
+       struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
  
-       cancel_work_sync(&ctrl->ctrl.reset_work);
        cancel_delayed_work_sync(&ctrl->connect_work);
-       nvme_stop_ctrl(&ctrl->ctrl);
-       nvme_remove_namespaces(&ctrl->ctrl);
        /*
         * kill the association on the link side.  this will block
         * waiting for io to terminate
         */
        nvme_fc_delete_association(ctrl);
-       /*
-        * tear down the controller
-        * After the last reference on the nvme ctrl is removed,
-        * the transport nvme_fc_nvme_ctrl_freed() callback will be
-        * invoked. From there, the transport will tear down it's
-        * logical queues and association.
-        */
-       nvme_uninit_ctrl(&ctrl->ctrl);
-       nvme_put_ctrl(&ctrl->ctrl);
- }
- static bool
- __nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl)
- {
-       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
-               return true;
-       if (!queue_work(nvme_wq, &ctrl->delete_work))
-               return true;
-       return false;
- }
- static int
- __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
- {
-       return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0;
- }
- /*
-  * Request from nvme core layer to delete the controller
-  */
- static int
- nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
- {
-       struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
-       int ret;
-       if (!kref_get_unless_zero(&ctrl->ctrl.kref))
-               return -EBUSY;
-       ret = __nvme_fc_del_ctrl(ctrl);
-       if (!ret)
-               flush_workqueue(nvme_wq);
-       nvme_put_ctrl(&ctrl->ctrl);
-       return ret;
  }
  
  static void
  nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
  {
-       /* If we are resetting/deleting then do nothing */
-       if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
-               WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
-                       ctrl->ctrl.state == NVME_CTRL_LIVE);
-               return;
-       }
+       struct nvme_fc_rport *rport = ctrl->rport;
+       struct nvme_fc_remote_port *portptr = &rport->remoteport;
+       unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ;
+       bool recon = true;
  
-       dev_info(ctrl->ctrl.device,
-               "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
-               ctrl->cnum, status);
+       if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING)
+               return;
  
-       if (nvmf_should_reconnect(&ctrl->ctrl)) {
+       if (portptr->port_state == FC_OBJSTATE_ONLINE)
                dev_info(ctrl->ctrl.device,
-                       "NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
-                       ctrl->cnum, ctrl->ctrl.opts->reconnect_delay);
-               queue_delayed_work(nvme_wq, &ctrl->connect_work,
-                               ctrl->ctrl.opts->reconnect_delay * HZ);
+                       "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
+                       ctrl->cnum, status);
+       else if (time_after_eq(jiffies, rport->dev_loss_end))
+               recon = false;
+       if (recon && nvmf_should_reconnect(&ctrl->ctrl)) {
+               if (portptr->port_state == FC_OBJSTATE_ONLINE)
+                       dev_info(ctrl->ctrl.device,
+                               "NVME-FC{%d}: Reconnect attempt in %ld "
+                               "seconds\n",
+                               ctrl->cnum, recon_delay / HZ);
+               else if (time_after(jiffies + recon_delay, rport->dev_loss_end))
+                       recon_delay = rport->dev_loss_end - jiffies;
+               queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay);
        } else {
-               dev_warn(ctrl->ctrl.device,
+               if (portptr->port_state == FC_OBJSTATE_ONLINE)
+                       dev_warn(ctrl->ctrl.device,
                                "NVME-FC{%d}: Max reconnect attempts (%d) "
                                "reached. Removing controller\n",
                                ctrl->cnum, ctrl->ctrl.nr_reconnects);
-               WARN_ON(__nvme_fc_schedule_delete_work(ctrl));
+               else
+                       dev_warn(ctrl->ctrl.device,
+                               "NVME-FC{%d}: dev_loss_tmo (%d) expired "
+                               "while waiting for remoteport connectivity. "
+                               "Removing controller\n", ctrl->cnum,
+                               portptr->dev_loss_tmo);
+               WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
        }
  }
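A worked example of the reconnect window handled above, with hypothetical timings:

	/*
	 * Assume reconnect_delay = 10 s (hypothetical fabric option) and the
	 * default dev_loss_tmo of 60 s. When the remoteport is unregistered
	 * at t = 0, dev_loss_end = jiffies + 60 * HZ and reconnect attempts
	 * run at roughly t = 10, 20, ... If an attempt fails at t = 56 s,
	 * the usual 10 s delay would overshoot dev_loss_end, so recon_delay
	 * is clamped to the remaining 4 s. Once jiffies reaches dev_loss_end,
	 * recon is false and the controller is deleted instead.
	 */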
  
@@@ -2675,15 -2972,28 +2972,28 @@@ nvme_fc_reset_ctrl_work(struct work_str
        int ret;
  
        nvme_stop_ctrl(&ctrl->ctrl);
        /* will block while waiting for io to terminate */
        nvme_fc_delete_association(ctrl);
  
-       ret = nvme_fc_create_association(ctrl);
+       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+               dev_err(ctrl->ctrl.device,
+                       "NVME-FC{%d}: error_recovery: Couldn't change state "
+                       "to RECONNECTING\n", ctrl->cnum);
+               return;
+       }
+       if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
+               ret = nvme_fc_create_association(ctrl);
+       else
+               ret = -ENOTCONN;
        if (ret)
                nvme_fc_reconnect_or_delete(ctrl, ret);
        else
                dev_info(ctrl->ctrl.device,
-                       "NVME-FC{%d}: controller reset complete\n", ctrl->cnum);
+                       "NVME-FC{%d}: controller reset complete\n",
+                       ctrl->cnum);
  }
  
  static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
        .reg_write32            = nvmf_reg_write32,
        .free_ctrl              = nvme_fc_nvme_ctrl_freed,
        .submit_async_event     = nvme_fc_submit_async_event,
-       .delete_ctrl            = nvme_fc_del_nvme_ctrl,
+       .delete_ctrl            = nvme_fc_delete_ctrl,
        .get_address            = nvmf_get_address,
+       .reinit_request         = nvme_fc_reinit_request,
  };
  
  static void
@@@ -2728,13 -3039,40 +3039,40 @@@ static const struct blk_mq_ops nvme_fc_
  };
  
  
+ /*
+  * Fails a controller request if it matches an existing controller
+  * (association) with the same tuple:
+  * <Host NQN, Host ID, local FC port, remote FC port, SUBSYS NQN>
+  *
+  * The ports don't need to be compared as they are intrinsically
+  * already matched by the port pointers supplied.
+  */
+ static bool
+ nvme_fc_existing_controller(struct nvme_fc_rport *rport,
+               struct nvmf_ctrl_options *opts)
+ {
+       struct nvme_fc_ctrl *ctrl;
+       unsigned long flags;
+       bool found = false;
+       spin_lock_irqsave(&rport->lock, flags);
+       list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+               found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts);
+               if (found)
+                       break;
+       }
+       spin_unlock_irqrestore(&rport->lock, flags);
+       return found;
+ }
  static struct nvme_ctrl *
  nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
        struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
  {
        struct nvme_fc_ctrl *ctrl;
        unsigned long flags;
 -      int ret, idx;
 +      int ret, idx, retry;
  
        if (!(rport->remoteport.port_role &
            (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) {
                goto out_fail;
        }
  
+       if (!opts->duplicate_connect &&
+           nvme_fc_existing_controller(rport, opts)) {
+               ret = -EALREADY;
+               goto out_fail;
+       }
        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl) {
                ret = -ENOMEM;
        ctrl->rport = rport;
        ctrl->dev = lport->dev;
        ctrl->cnum = idx;
+       ctrl->assoc_active = false;
 +      init_waitqueue_head(&ctrl->ioabort_wait);
  
        get_device(ctrl->dev);
        kref_init(&ctrl->ref);
  
-       INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
        INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
        INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
        spin_lock_init(&ctrl->lock);
  
        memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
        ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
-       ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
+       ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
        ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
        ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
        ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
        ctrl->admin_tag_set.driver_data = ctrl;
        ctrl->admin_tag_set.nr_hw_queues = 1;
        ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+       ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
  
        ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
        if (ret)
        list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
        spin_unlock_irqrestore(&rport->lock, flags);
  
 -      ret = nvme_fc_create_association(ctrl);
 +      /*
 +       * It's possible that transactions used to create the association
 +       * may fail. Examples: CreateAssociation LS or CreateIOConnection
 +       * LS gets dropped/corrupted/fails; or a frame gets dropped or a
 +       * command times out for one of the actions to init the controller
 +       * (Connect, Get/Set_Property, Set_Features, etc). Many of these
 +       * transport errors (frame drop, LS failure) inherently must kill
 +       * the association. The transport is coded so that any command used
 +       * to create the association (prior to a LIVE state transition
 +       * while NEW or RECONNECTING) will fail if it completes in error or
 +       * times out.
 +       *
 +       * As such: since the connect request was most likely due to a
 +       * udev event that discovered the remote port, meaning there is
 +       * not an admin or script there to restart if the connect
 +       * request fails, retry the initial connection creation up to
 +       * three times before giving up and declaring failure.
 +       */
 +      for (retry = 0; retry < 3; retry++) {
 +              ret = nvme_fc_create_association(ctrl);
 +              if (!ret)
 +                      break;
 +      }
 +
        if (ret) {
 +              /* couldn't schedule retry - fail out */
 +              dev_err(ctrl->ctrl.device,
 +                      "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum);
 +
                ctrl->ctrl.opts = NULL;
 +
                /* initiate nvme ctrl ref counting teardown */
                nvme_uninit_ctrl(&ctrl->ctrl);
                nvme_put_ctrl(&ctrl->ctrl);
                return ERR_PTR(ret);
        }
  
-       kref_get(&ctrl->ctrl.kref);
+       nvme_get_ctrl(&ctrl->ctrl);
  
        dev_info(ctrl->ctrl.device,
                "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
@@@ -3026,7 -3342,50 +3371,50 @@@ static struct nvmf_transport_ops nvme_f
  
  static int __init nvme_fc_init_module(void)
  {
-       return nvmf_register_transport(&nvme_fc_transport);
+       int ret;
+       /*
+        * NOTE:
+        * It is expected that in the future the kernel will combine
+        * the FC-isms that currently live under scsi, and are now being
+        * added to by NVME, into a new standalone FC class. The SCSI
+        * and NVME protocols and their devices would be under this
+        * new FC class.
+        *
+        * As we need something to post FC-specific udev events to,
+        * specifically for nvme probe events, start by creating the
+        * new device class.  When the new standalone FC class is
+        * put in place, this code will move to a more generic
+        * location for the class.
+        */
+       fc_class = class_create(THIS_MODULE, "fc");
+       if (IS_ERR(fc_class)) {
+               pr_err("couldn't register class fc\n");
+               return PTR_ERR(fc_class);
+       }
+       /*
+        * Create a device for the FC-centric udev events
+        */
+       fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL,
+                               "fc_udev_device");
+       if (IS_ERR(fc_udev_device)) {
+               pr_err("couldn't create fc_udev device!\n");
+               ret = PTR_ERR(fc_udev_device);
+               goto out_destroy_class;
+       }
+       ret = nvmf_register_transport(&nvme_fc_transport);
+       if (ret)
+               goto out_destroy_device;
+       return 0;
+ out_destroy_device:
+       device_destroy(fc_class, MKDEV(0, 0));
+ out_destroy_class:
+       class_destroy(fc_class);
+       return ret;
  }
  
  static void __exit nvme_fc_exit_module(void)
  
        ida_destroy(&nvme_fc_local_port_cnt);
        ida_destroy(&nvme_fc_ctrl_cnt);
+       device_destroy(fc_class, MKDEV(0, 0));
+       class_destroy(fc_class);
  }
  
  module_init(nvme_fc_init_module);
diff --combined drivers/nvme/host/pci.c
index 3f5a04c586cefdc8096469ba38d325004963b42d,762b8402e04c591df728e6e5e5bffa16ccf0d907..a11cfd470089226cffd01c9c6104afdc876c341a
@@@ -13,7 -13,6 +13,6 @@@
   */
  
  #include <linux/aer.h>
- #include <linux/bitops.h>
  #include <linux/blkdev.h>
  #include <linux/blk-mq.h>
  #include <linux/blk-mq-pci.h>
  #include <linux/mutex.h>
  #include <linux/once.h>
  #include <linux/pci.h>
- #include <linux/poison.h>
  #include <linux/t10-pi.h>
- #include <linux/timer.h>
  #include <linux/types.h>
  #include <linux/io-64-nonatomic-lo-hi.h>
- #include <asm/unaligned.h>
  #include <linux/sed-opal.h>
  
  #include "nvme.h"
  #define SQ_SIZE(depth)                (depth * sizeof(struct nvme_command))
  #define CQ_SIZE(depth)                (depth * sizeof(struct nvme_completion))
  
- /*
-  * We handle AEN commands ourselves and don't even let the
-  * block layer know about them.
-  */
- #define NVME_AQ_BLKMQ_DEPTH   (NVME_AQ_DEPTH - NVME_NR_AERS)
+ #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
  
  static int use_threaded_interrupts;
  module_param(use_threaded_interrupts, int, 0);
@@@ -57,6 -49,12 +49,12 @@@ module_param(max_host_mem_size_mb, uint
  MODULE_PARM_DESC(max_host_mem_size_mb,
        "Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
  
+ static unsigned int sgl_threshold = SZ_32K;
+ module_param(sgl_threshold, uint, 0644);
+ MODULE_PARM_DESC(sgl_threshold,
+               "Use SGLs when average request segment size is larger than or equal to "
+               "this size. Use 0 to disable SGLs.");
  static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
  static const struct kernel_param_ops io_queue_depth_ops = {
        .set = io_queue_depth_set,
@@@ -94,7 -92,7 +92,7 @@@ struct nvme_dev 
        struct mutex shutdown_lock;
        bool subsystem;
        void __iomem *cmb;
 -      dma_addr_t cmb_dma_addr;
 +      pci_bus_addr_t cmb_bus_addr;
        u64 cmb_size;
        u32 cmbsz;
        u32 cmbloc;
@@@ -178,6 -176,7 +176,7 @@@ struct nvme_queue 
  struct nvme_iod {
        struct nvme_request req;
        struct nvme_queue *nvmeq;
+       bool use_sgl;
        int aborted;
        int npages;             /* In the PRP list. 0 means small pool in use */
        int nents;              /* Used in scatterlist */
@@@ -331,17 -330,35 +330,35 @@@ static int nvme_npages(unsigned size, s
        return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
  }
  
- static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
-               unsigned int size, unsigned int nseg)
+ /*
+  * Calculates the number of pages needed for the SGL segments. For example a 4k
+  * page can accommodate 256 SGL descriptors.
+  */
+ static int nvme_pci_npages_sgl(unsigned int num_seg)
  {
-       return sizeof(__le64 *) * nvme_npages(size, dev) +
-                       sizeof(struct scatterlist) * nseg;
+       return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
  }
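As a quick check of the arithmetic, assuming the usual 16-byte SGL descriptor and 4 KiB pages:

	/*
	 * With sizeof(struct nvme_sgl_desc) == 16 and PAGE_SIZE == 4096:
	 *   nvme_pci_npages_sgl(256) == DIV_ROUND_UP(256 * 16, 4096) == 1
	 *   nvme_pci_npages_sgl(257) == DIV_ROUND_UP(257 * 16, 4096) == 2
	 * i.e. one additional descriptor page per 256 segments.
	 */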
  
- static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+ static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
+               unsigned int size, unsigned int nseg, bool use_sgl)
  {
-       return sizeof(struct nvme_iod) +
-               nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
+       size_t alloc_size;
+       if (use_sgl)
+               alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
+       else
+               alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
+       return alloc_size + sizeof(struct scatterlist) * nseg;
+ }
+ static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
+ {
+       unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
+                                   NVME_INT_BYTES(dev), NVME_INT_PAGES,
+                                   use_sgl);
+       return sizeof(struct nvme_iod) + alloc_size;
  }
  
  static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@@ -425,10 -442,10 +442,10 @@@ static void __nvme_submit_cmd(struct nv
        nvmeq->sq_tail = tail;
  }
  
- static __le64 **iod_list(struct request *req)
+ static void **nvme_pci_iod_list(struct request *req)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
+       return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
  }
  
  static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
        unsigned int size = blk_rq_payload_bytes(rq);
  
        if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
-               iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
+               size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
+                               iod->use_sgl);
+               iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
                if (!iod->sg)
                        return BLK_STS_RESOURCE;
        } else {
  static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       const int last_prp = dev->ctrl.page_size / 8 - 1;
+       const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
+       dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
        int i;
-       __le64 **list = iod_list(req);
-       dma_addr_t prp_dma = iod->first_dma;
  
        if (iod->npages == 0)
-               dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+               dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+                       dma_addr);
        for (i = 0; i < iod->npages; i++) {
-               __le64 *prp_list = list[i];
-               dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
-               dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
-               prp_dma = next_prp_dma;
+               void *addr = nvme_pci_iod_list(req)[i];
+               if (iod->use_sgl) {
+                       struct nvme_sgl_desc *sg_list = addr;
+                       next_dma_addr =
+                           le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
+               } else {
+                       __le64 *prp_list = addr;
+                       next_dma_addr = le64_to_cpu(prp_list[last_prp]);
+               }
+               dma_pool_free(dev->prp_page_pool, addr, dma_addr);
+               dma_addr = next_dma_addr;
        }
  
        if (iod->sg != iod->inline_sg)
@@@ -555,7 -588,8 +588,8 @@@ static void nvme_print_sgl(struct scatt
        }
  }
  
- static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
+ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+               struct request *req, struct nvme_rw_command *cmnd)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct dma_pool *pool;
        u32 page_size = dev->ctrl.page_size;
        int offset = dma_addr & (page_size - 1);
        __le64 *prp_list;
-       __le64 **list = iod_list(req);
+       void **list = nvme_pci_iod_list(req);
        dma_addr_t prp_dma;
        int nprps, i;
  
+       iod->use_sgl = false;
        length -= (page_size - offset);
        if (length <= 0) {
                iod->first_dma = 0;
-               return BLK_STS_OK;
+               goto done;
        }
  
        dma_len -= (page_size - offset);
  
        if (length <= page_size) {
                iod->first_dma = dma_addr;
-               return BLK_STS_OK;
+               goto done;
        }
  
        nprps = DIV_ROUND_UP(length, page_size);
                dma_len = sg_dma_len(sg);
        }
  
+ done:
+       cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+       cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
        return BLK_STS_OK;
  
   bad_sgl:
        return BLK_STS_IOERR;
  }
  
+ static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
+               struct scatterlist *sg)
+ {
+       sge->addr = cpu_to_le64(sg_dma_address(sg));
+       sge->length = cpu_to_le32(sg_dma_len(sg));
+       sge->type = NVME_SGL_FMT_DATA_DESC << 4;
+ }
+ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
+               dma_addr_t dma_addr, int entries)
+ {
+       sge->addr = cpu_to_le64(dma_addr);
+       if (entries < SGES_PER_PAGE) {
+               sge->length = cpu_to_le32(entries * sizeof(*sge));
+               sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
+       } else {
+               sge->length = cpu_to_le32(PAGE_SIZE);
+               sge->type = NVME_SGL_FMT_SEG_DESC << 4;
+       }
+ }
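For example, with hypothetical entry counts and the same 16-byte descriptor / 4 KiB page assumptions (SGES_PER_PAGE == 256):

	/*
	 *   entries == 3   -> length = 3 * 16 = 48, type = NVME_SGL_FMT_LAST_SEG_DESC
	 *   entries >= 256 -> length = PAGE_SIZE,   type = NVME_SGL_FMT_SEG_DESC
	 * so only the final descriptor page in the chain is marked as the
	 * last segment.
	 */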
+ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+               struct request *req, struct nvme_rw_command *cmd)
+ {
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       int length = blk_rq_payload_bytes(req);
+       struct dma_pool *pool;
+       struct nvme_sgl_desc *sg_list;
+       struct scatterlist *sg = iod->sg;
+       int entries = iod->nents, i = 0;
+       dma_addr_t sgl_dma;
+       iod->use_sgl = true;
+       /* setting the transfer type as SGL */
+       cmd->flags = NVME_CMD_SGL_METABUF;
+       if (length == sg_dma_len(sg)) {
+               nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+               return BLK_STS_OK;
+       }
+       if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+               pool = dev->prp_small_pool;
+               iod->npages = 0;
+       } else {
+               pool = dev->prp_page_pool;
+               iod->npages = 1;
+       }
+       sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+       if (!sg_list) {
+               iod->npages = -1;
+               return BLK_STS_RESOURCE;
+       }
+       nvme_pci_iod_list(req)[0] = sg_list;
+       iod->first_dma = sgl_dma;
+       nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
+       do {
+               if (i == SGES_PER_PAGE) {
+                       struct nvme_sgl_desc *old_sg_desc = sg_list;
+                       struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
+                       sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+                       if (!sg_list)
+                               return BLK_STS_RESOURCE;
+                       i = 0;
+                       nvme_pci_iod_list(req)[iod->npages++] = sg_list;
+                       sg_list[i++] = *link;
+                       nvme_pci_sgl_set_seg(link, sgl_dma, entries);
+               }
+               nvme_pci_sgl_set_data(&sg_list[i++], sg);
+               length -= sg_dma_len(sg);
+               sg = sg_next(sg);
+               entries--;
+       } while (length > 0);
+       WARN_ON(entries > 0);
+       return BLK_STS_OK;
+ }
+ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+ {
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       unsigned int avg_seg_size;
+       avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
+                       blk_rq_nr_phys_segments(req));
+       if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
+               return false;
+       if (!iod->nvmeq->qid)
+               return false;
+       if (!sgl_threshold || avg_seg_size < sgl_threshold)
+               return false;
+       return true;
+ }
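A worked example of the threshold check, for a hypothetical request:

	/*
	 * A 128 KiB read split into four 32 KiB physical segments gives
	 *   avg_seg_size = DIV_ROUND_UP(131072, 4) = 32768
	 * which is not below the default sgl_threshold (SZ_32K), so SGLs
	 * are used, provided the controller reports SGL support
	 * (ctrl.sgls bits 0/1) and the request is on an I/O queue
	 * (qid != 0); admin commands always fall back to PRPs.
	 */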
  static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                struct nvme_command *cmnd)
  {
                                DMA_ATTR_NO_WARN))
                goto out;
  
-       ret = nvme_setup_prps(dev, req);
+       if (nvme_pci_use_sgls(dev, req))
+               ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+       else
+               ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
        if (ret != BLK_STS_OK)
                goto out_unmap;
  
                        goto out_unmap;
        }
  
-       cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-       cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
        if (blk_integrity_rq(req))
                cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
        return BLK_STS_OK;
@@@ -804,7 -950,7 +950,7 @@@ static inline void nvme_handle_cqe(stru
         * for them but rather special case them here.
         */
        if (unlikely(nvmeq->qid == 0 &&
-                       cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) {
+                       cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
                nvme_complete_async_event(&nvmeq->dev->ctrl,
                                cqe->status, &cqe->result);
                return;
@@@ -897,7 -1043,7 +1043,7 @@@ static int nvme_poll(struct blk_mq_hw_c
        return __nvme_poll(nvmeq, tag);
  }
  
- static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
+ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
  {
        struct nvme_dev *dev = to_nvme_dev(ctrl);
        struct nvme_queue *nvmeq = dev->queues[0];
  
        memset(&c, 0, sizeof(c));
        c.common.opcode = nvme_admin_async_event;
-       c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx;
+       c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
  
        spin_lock_irq(&nvmeq->q_lock);
        __nvme_submit_cmd(nvmeq, &c);
@@@ -930,7 -1076,7 +1076,7 @@@ static int adapter_alloc_cq(struct nvme
        int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
  
        /*
-        * Note: we (ab)use the fact the the prp fields survive if no data
+        * Note: we (ab)use the fact that the prp fields survive if no data
         * is attached to the request.
         */
        memset(&c, 0, sizeof(c));
@@@ -951,7 -1097,7 +1097,7 @@@ static int adapter_alloc_sq(struct nvme
        int flags = NVME_QUEUE_PHYS_CONTIG;
  
        /*
-        * Note: we (ab)use the fact the the prp fields survive if no data
+        * Note: we (ab)use the fact that the prp fields survive if no data
         * is attached to the request.
         */
        memset(&c, 0, sizeof(c));
@@@ -1226,7 -1372,7 +1372,7 @@@ static int nvme_alloc_sq_cmds(struct nv
        if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
                unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
                                                      dev->ctrl.page_size);
 -              nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
 +              nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
                nvmeq->sq_cmds_io = dev->cmb + offset;
        } else {
                nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
@@@ -1372,14 -1518,10 +1518,10 @@@ static int nvme_alloc_admin_tags(struc
                dev->admin_tagset.ops = &nvme_mq_admin_ops;
                dev->admin_tagset.nr_hw_queues = 1;
  
-               /*
-                * Subtract one to leave an empty queue entry for 'Full Queue'
-                * condition. See NVM-Express 1.2 specification, section 4.1.2.
-                */
-               dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
+               dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
                dev->admin_tagset.timeout = ADMIN_TIMEOUT;
                dev->admin_tagset.numa_node = dev_to_node(dev->dev);
-               dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
+               dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
                dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
                dev->admin_tagset.driver_data = dev;
  
@@@ -1527,7 -1669,7 +1669,7 @@@ static void __iomem *nvme_map_cmb(struc
        resource_size_t bar_size;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        void __iomem *cmb;
 -      dma_addr_t dma_addr;
 +      int bar;
  
        dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
        if (!(NVME_CMB_SZ(dev->cmbsz)))
        szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
        size = szu * NVME_CMB_SZ(dev->cmbsz);
        offset = szu * NVME_CMB_OFST(dev->cmbloc);
 -      bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc));
 +      bar = NVME_CMB_BIR(dev->cmbloc);
 +      bar_size = pci_resource_len(pdev, bar);
  
        if (offset > bar_size)
                return NULL;
        if (size > bar_size - offset)
                size = bar_size - offset;
  
 -      dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset;
 -      cmb = ioremap_wc(dma_addr, size);
 +      cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
        if (!cmb)
                return NULL;
  
 -      dev->cmb_dma_addr = dma_addr;
 +      dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
        dev->cmb_size = size;
        return cmb;
  }
@@@ -1906,7 -2048,11 +2048,11 @@@ static int nvme_dev_add(struct nvme_de
                dev->tagset.numa_node = dev_to_node(dev->dev);
                dev->tagset.queue_depth =
                                min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-               dev->tagset.cmd_size = nvme_cmd_size(dev);
+               dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
+               if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
+                       dev->tagset.cmd_size = max(dev->tagset.cmd_size,
+                                       nvme_pci_cmd_size(dev, true));
+               }
                dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
                dev->tagset.driver_data = dev;
  
@@@ -2132,9 -2278,9 +2278,9 @@@ static void nvme_remove_dead_ctrl(struc
  {
        dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
  
-       kref_get(&dev->ctrl.kref);
+       nvme_get_ctrl(&dev->ctrl);
        nvme_dev_disable(dev, false);
-       if (!schedule_work(&dev->remove_work))
+       if (!queue_work(nvme_wq, &dev->remove_work))
                nvme_put_ctrl(&dev->ctrl);
  }
  
@@@ -2557,6 -2703,7 +2703,7 @@@ static int __init nvme_init(void
  static void __exit nvme_exit(void)
  {
        pci_unregister_driver(&nvme_driver);
+       flush_workqueue(nvme_wq);
        _nvme_check_size();
  }
  
diff --combined drivers/nvme/host/rdma.c
index 0ebb539f3bd3a7d7a6e18753bc4fefa402a1b626,c8d854474a5b6ce50b37d4e63ef248ff219cce0a..4f9bf2f815c399f3f7f39d5b6d485dbe75a2466f
  
  #define NVME_RDMA_MAX_INLINE_SEGMENTS 1
  
- /*
-  * We handle AEN commands ourselves and don't even let the
-  * block layer know about them.
-  */
- #define NVME_RDMA_NR_AEN_COMMANDS      1
- #define NVME_RDMA_AQ_BLKMQ_DEPTH       \
-       (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
  struct nvme_rdma_device {
-       struct ib_device       *dev;
-       struct ib_pd           *pd;
+       struct ib_device        *dev;
+       struct ib_pd            *pd;
        struct kref             ref;
        struct list_head        entry;
  };
@@@ -79,8 -71,8 +71,8 @@@ struct nvme_rdma_request 
  };
  
  enum nvme_rdma_queue_flags {
-       NVME_RDMA_Q_LIVE                = 0,
-       NVME_RDMA_Q_DELETING            = 1,
+       NVME_RDMA_Q_ALLOCATED           = 0,
+       NVME_RDMA_Q_LIVE                = 1,
  };
  
  struct nvme_rdma_queue {
@@@ -105,7 -97,6 +97,6 @@@ struct nvme_rdma_ctrl 
  
        /* other member variables */
        struct blk_mq_tag_set   tag_set;
-       struct work_struct      delete_work;
        struct work_struct      err_work;
  
        struct nvme_rdma_qe     async_event_sqe;
@@@ -274,6 -265,9 +265,9 @@@ static int nvme_rdma_reinit_request(voi
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        int ret = 0;
  
+       if (WARN_ON_ONCE(!req->mr))
+               return 0;
        ib_dereg_mr(req->mr);
  
        req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
@@@ -434,11 -428,9 +428,9 @@@ out_err
  
  static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
  {
-       struct nvme_rdma_device *dev;
-       struct ib_device *ibdev;
+       struct nvme_rdma_device *dev = queue->device;
+       struct ib_device *ibdev = dev->dev;
  
-       dev = queue->device;
-       ibdev = dev->dev;
        rdma_destroy_qp(queue->cm_id);
        ib_free_cq(queue->ib_cq);
  
@@@ -493,7 -485,7 +485,7 @@@ static int nvme_rdma_create_queue_ib(st
        return 0;
  
  out_destroy_qp:
-       ib_destroy_qp(queue->qp);
+       rdma_destroy_qp(queue->cm_id);
  out_destroy_ib_cq:
        ib_free_cq(queue->ib_cq);
  out_put_dev:
@@@ -544,11 -536,11 +536,11 @@@ static int nvme_rdma_alloc_queue(struc
        ret = nvme_rdma_wait_for_cm(queue);
        if (ret) {
                dev_info(ctrl->ctrl.device,
-                       "rdma_resolve_addr wait failed (%d).\n", ret);
+                       "rdma connection establishment failed (%d)\n", ret);
                goto out_destroy_cm_id;
        }
  
-       clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
+       set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
  
        return 0;
  
@@@ -568,15 -560,9 +560,15 @@@ static void nvme_rdma_stop_queue(struc
  
  static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
  {
-       if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
+       if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
                return;
  
 +      if (nvme_rdma_queue_idx(queue) == 0) {
 +              nvme_rdma_free_qe(queue->device->dev,
 +                      &queue->ctrl->async_event_sqe,
 +                      sizeof(struct nvme_command), DMA_TO_DEVICE);
 +      }
 +
        nvme_rdma_destroy_queue_ib(queue);
        rdma_destroy_id(queue->cm_id);
  }
@@@ -676,11 -662,10 +668,10 @@@ out_free_queues
        return ret;
  }
  
- static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin)
+ static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
+               struct blk_mq_tag_set *set)
  {
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-       struct blk_mq_tag_set *set = admin ?
-                       &ctrl->admin_tag_set : &ctrl->tag_set;
  
        blk_mq_free_tag_set(set);
        nvme_rdma_dev_put(ctrl->device);
@@@ -697,7 -682,7 +688,7 @@@ static struct blk_mq_tag_set *nvme_rdma
                set = &ctrl->admin_tag_set;
                memset(set, 0, sizeof(*set));
                set->ops = &nvme_rdma_admin_mq_ops;
-               set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
+               set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
                set->reserved_tags = 2; /* connect + keep-alive */
                set->numa_node = NUMA_NO_NODE;
                set->cmd_size = sizeof(struct nvme_rdma_request) +
                set->driver_data = ctrl;
                set->nr_hw_queues = 1;
                set->timeout = ADMIN_TIMEOUT;
+               set->flags = BLK_MQ_F_NO_SCHED;
        } else {
                set = &ctrl->tag_set;
                memset(set, 0, sizeof(*set));
@@@ -745,10 -731,12 +737,10 @@@ out
  static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
                bool remove)
  {
 -      nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
 -                      sizeof(struct nvme_command), DMA_TO_DEVICE);
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        if (remove) {
                blk_cleanup_queue(ctrl->ctrl.admin_q);
-               nvme_rdma_free_tagset(&ctrl->ctrl, true);
+               nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
        }
        nvme_rdma_free_queue(&ctrl->queues[0]);
  }
@@@ -769,10 -757,8 +761,10 @@@ static int nvme_rdma_configure_admin_qu
  
        if (new) {
                ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
 -              if (IS_ERR(ctrl->ctrl.admin_tagset))
 +              if (IS_ERR(ctrl->ctrl.admin_tagset)) {
 +                      error = PTR_ERR(ctrl->ctrl.admin_tagset);
                        goto out_free_queue;
 +              }
  
                ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
                if (IS_ERR(ctrl->ctrl.admin_q)) {
                        goto out_free_tagset;
                }
        } else {
-               error = blk_mq_reinit_tagset(&ctrl->admin_tag_set,
-                                            nvme_rdma_reinit_request);
+               error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
                if (error)
                        goto out_free_queue;
        }
@@@ -825,7 -810,7 +816,7 @@@ out_cleanup_queue
                blk_cleanup_queue(ctrl->ctrl.admin_q);
  out_free_tagset:
        if (new)
-               nvme_rdma_free_tagset(&ctrl->ctrl, true);
+               nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
  out_free_queue:
        nvme_rdma_free_queue(&ctrl->queues[0]);
        return error;
@@@ -837,7 -822,7 +828,7 @@@ static void nvme_rdma_destroy_io_queues
        nvme_rdma_stop_io_queues(ctrl);
        if (remove) {
                blk_cleanup_queue(ctrl->ctrl.connect_q);
-               nvme_rdma_free_tagset(&ctrl->ctrl, false);
+               nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
        }
        nvme_rdma_free_io_queues(ctrl);
  }
@@@ -852,10 -837,8 +843,10 @@@ static int nvme_rdma_configure_io_queue
  
        if (new) {
                ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
 -              if (IS_ERR(ctrl->ctrl.tagset))
 +              if (IS_ERR(ctrl->ctrl.tagset)) {
 +                      ret = PTR_ERR(ctrl->ctrl.tagset);
                        goto out_free_io_queues;
 +              }
  
                ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
                if (IS_ERR(ctrl->ctrl.connect_q)) {
                        goto out_free_tag_set;
                }
        } else {
-               ret = blk_mq_reinit_tagset(&ctrl->tag_set,
-                                          nvme_rdma_reinit_request);
+               ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
                if (ret)
                        goto out_free_io_queues;
  
@@@ -883,7 -865,7 +873,7 @@@ out_cleanup_connect_q
                blk_cleanup_queue(ctrl->ctrl.connect_q);
  out_free_tag_set:
        if (new)
-               nvme_rdma_free_tagset(&ctrl->ctrl, false);
+               nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
  out_free_io_queues:
        nvme_rdma_free_io_queues(ctrl);
        return ret;
@@@ -922,7 -904,7 +912,7 @@@ static void nvme_rdma_reconnect_or_remo
                                ctrl->ctrl.opts->reconnect_delay * HZ);
        } else {
                dev_info(ctrl->ctrl.device, "Removing controller...\n");
-               queue_work(nvme_wq, &ctrl->delete_work);
+               nvme_delete_ctrl(&ctrl->ctrl);
        }
  }
  
@@@ -935,10 -917,6 +925,6 @@@ static void nvme_rdma_reconnect_ctrl_wo
  
        ++ctrl->ctrl.nr_reconnects;
  
-       if (ctrl->ctrl.queue_count > 1)
-               nvme_rdma_destroy_io_queues(ctrl, false);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
        ret = nvme_rdma_configure_admin_queue(ctrl, false);
        if (ret)
                goto requeue;
        if (ctrl->ctrl.queue_count > 1) {
                ret = nvme_rdma_configure_io_queues(ctrl, false);
                if (ret)
-                       goto requeue;
+                       goto destroy_admin;
        }
  
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
                return;
        }
  
-       ctrl->ctrl.nr_reconnects = 0;
        nvme_start_ctrl(&ctrl->ctrl);
  
-       dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
+       dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
+                       ctrl->ctrl.nr_reconnects);
+       ctrl->ctrl.nr_reconnects = 0;
  
        return;
  
+ destroy_admin:
+       nvme_rdma_destroy_admin_queue(ctrl, false);
  requeue:
        dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
                        ctrl->ctrl.nr_reconnects);
@@@ -979,17 -960,15 +968,15 @@@ static void nvme_rdma_error_recovery_wo
  
        if (ctrl->ctrl.queue_count > 1) {
                nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-       }
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       /* We must take care of fastfail/requeue all our inflight requests */
-       if (ctrl->ctrl.queue_count > 1)
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
+               nvme_rdma_destroy_io_queues(ctrl, false);
+       }
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
+       nvme_rdma_destroy_admin_queue(ctrl, false);
  
        /*
         * queues are not live anymore, so restart the queues to fail fast
@@@ -1065,7 -1044,7 +1052,7 @@@ static void nvme_rdma_unmap_data(struc
        if (!blk_rq_bytes(rq))
                return;
  
-       if (req->mr->need_inval) {
+       if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) {
                res = nvme_rdma_inv_rkey(queue, req);
                if (unlikely(res < 0)) {
                        dev_err(ctrl->ctrl.device,
@@@ -1314,7 -1293,7 +1301,7 @@@ static struct blk_mq_tags *nvme_rdma_ta
        return queue->ctrl->tag_set.tags[queue_idx - 1];
  }
  
- static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
  {
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
        struct nvme_rdma_queue *queue = &ctrl->queues[0];
        struct ib_sge sge;
        int ret;
  
-       if (WARN_ON_ONCE(aer_idx != 0))
-               return;
        ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
  
        memset(cmd, 0, sizeof(*cmd));
        cmd->common.opcode = nvme_admin_async_event;
-       cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
+       cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
        cmd->common.flags |= NVME_CMD_SGL_METABUF;
        nvme_rdma_set_sg_null(cmd);
  
@@@ -1393,7 -1369,7 +1377,7 @@@ static int __nvme_rdma_recv_done(struc
         * for them but rather special case them here.
         */
        if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
-                       cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
+                       cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
                                &cqe->result);
        else
@@@ -1590,6 -1566,10 +1574,10 @@@ nvme_rdma_timeout(struct request *rq, b
  {
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
  
+       dev_warn(req->queue->ctrl->ctrl.device,
+                "I/O %d QID %d timeout, reset controller\n",
+                rq->tag, nvme_rdma_queue_idx(req->queue));
        /* queue error recovery */
        nvme_rdma_error_recovery(req->queue->ctrl);
  
@@@ -1614,15 -1594,12 +1602,15 @@@ nvme_rdma_queue_is_ready(struct nvme_rd
                        /*
                         * reconnecting state means transport disruption, which
                         * can take a long time and even might fail permanently,
 -                       * so we can't let incoming I/O be requeued forever.
 -                       * fail it fast to allow upper layers a chance to
 -                       * failover.
 +                       * fail fast to give upper layers a chance to failover.
 +                       * deleting state means that the ctrl will never accept
 +                       * commands again, fail it permanently.
                         */
 -                      if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
 +                      if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING ||
 +                          queue->ctrl->ctrl.state == NVME_CTRL_DELETING) {
 +                              nvme_req(rq)->status = NVME_SC_ABORT_REQ;
                                return BLK_STS_IOERR;
 +                      }
                        return BLK_STS_RESOURCE; /* try again later */
                }
        }
@@@ -1767,50 -1744,9 +1755,9 @@@ static void nvme_rdma_shutdown_ctrl(str
        nvme_rdma_destroy_admin_queue(ctrl, shutdown);
  }
  
- static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl)
+ static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
  {
-       nvme_remove_namespaces(&ctrl->ctrl);
-       nvme_rdma_shutdown_ctrl(ctrl, true);
-       nvme_uninit_ctrl(&ctrl->ctrl);
-       nvme_put_ctrl(&ctrl->ctrl);
- }
- static void nvme_rdma_del_ctrl_work(struct work_struct *work)
- {
-       struct nvme_rdma_ctrl *ctrl = container_of(work,
-                               struct nvme_rdma_ctrl, delete_work);
-       nvme_stop_ctrl(&ctrl->ctrl);
-       nvme_rdma_remove_ctrl(ctrl);
- }
- static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
- {
-       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
-               return -EBUSY;
-       if (!queue_work(nvme_wq, &ctrl->delete_work))
-               return -EBUSY;
-       return 0;
- }
- static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
- {
-       struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-       int ret = 0;
-       /*
-        * Keep a reference until all work is flushed since
-        * __nvme_rdma_del_ctrl can free the ctrl mem
-        */
-       if (!kref_get_unless_zero(&ctrl->ctrl.kref))
-               return -EBUSY;
-       ret = __nvme_rdma_del_ctrl(ctrl);
-       if (!ret)
-               flush_work(&ctrl->delete_work);
-       nvme_put_ctrl(&ctrl->ctrl);
-       return ret;
+       nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
  }
  
  static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
        }
  
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       WARN_ON_ONCE(!changed);
+       if (!changed) {
+               /* state change failure is ok if we're in DELETING state */
+               WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
+               return;
+       }
  
        nvme_start_ctrl(&ctrl->ctrl);
  
  
  out_fail:
        dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
-       nvme_rdma_remove_ctrl(ctrl);
+       nvme_remove_namespaces(&ctrl->ctrl);
+       nvme_rdma_shutdown_ctrl(ctrl, true);
+       nvme_uninit_ctrl(&ctrl->ctrl);
+       nvme_put_ctrl(&ctrl->ctrl);
  }
  
  static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
        .reg_write32            = nvmf_reg_write32,
        .free_ctrl              = nvme_rdma_free_ctrl,
        .submit_async_event     = nvme_rdma_submit_async_event,
-       .delete_ctrl            = nvme_rdma_del_ctrl,
+       .delete_ctrl            = nvme_rdma_delete_ctrl,
        .get_address            = nvmf_get_address,
+       .reinit_request         = nvme_rdma_reinit_request,
  };
  
+ static inline bool
+ __nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
+       struct nvmf_ctrl_options *opts)
+ {
+       char *stdport = __stringify(NVME_RDMA_IP_PORT);
+       if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
+           strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
+               return false;
+       if (opts->mask & NVMF_OPT_TRSVCID &&
+           ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
+               if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
+                       return false;
+       } else if (opts->mask & NVMF_OPT_TRSVCID) {
+               if (strcmp(opts->trsvcid, stdport))
+                       return false;
+       } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
+               if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
+                       return false;
+       }
+       /* else, it's a match as both have stdport. Fall to next checks */
+       /*
+        * checking the local address is rough. In most cases, one
+        * is not specified and the host port is selected by the stack.
+        *
+        * Assume no match if:
+        *  local address is specified and address is not the same
+        *  local address is not specified but remote is, or vice versa
+        *    (admin using specific host_traddr when it matters).
+        */
+       if (opts->mask & NVMF_OPT_HOST_TRADDR &&
+           ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+               if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
+                       return false;
+       } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
+                  ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
+               return false;
+       /*
+        * if neither controller had a host port specified, assume it's
+        * a match as everything else matched.
+        */
+       return true;
+ }
+ /*
+  * Fails a connection request if it matches an existing controller
+  * (association) with the same tuple:
+  * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
+  *
+  * if local address is not specified in the request, it will match an
+  * existing controller with all the other parameters the same and no
+  * local port address specified as well.
+  *
+  * The ports don't need to be compared as they are intrinsically
+  * already matched by the port pointers supplied.
+  */
+ static bool
+ nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
+ {
+       struct nvme_rdma_ctrl *ctrl;
+       bool found = false;
+       mutex_lock(&nvme_rdma_ctrl_mutex);
+       list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
+               found = __nvme_rdma_options_match(ctrl, opts);
+               if (found)
+                       break;
+       }
+       mutex_unlock(&nvme_rdma_ctrl_mutex);
+       return found;
+ }
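One detail worth calling out in __nvme_rdma_options_match() above: a side that does not specify a transport service ID is treated as if it had specified the default NVMe/RDMA port, so the three trsvcid branches are equivalent to comparing both sides with the default filled in. A rough, compilable userspace sketch of that idea; trsvcid_match is a made-up name and "4420" merely stands in for __stringify(NVME_RDMA_IP_PORT):

#include <stdbool.h>
#include <string.h>

static bool trsvcid_match(const char *requested, const char *existing)
{
        const char *def = "4420";       /* assumed default NVMe/RDMA service id */

        return strcmp(requested ? requested : def,
                      existing ? existing : def) == 0;
}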
  static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
  {
                }
        }
  
+       if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
+               ret = -EALREADY;
+               goto out_free_ctrl;
+       }
        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
                                0 /* no quirks, we're perfect! */);
        if (ret)
        INIT_DELAYED_WORK(&ctrl->reconnect_work,
                        nvme_rdma_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
-       INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
        INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
  
        ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
  
-       kref_get(&ctrl->ctrl.kref);
+       nvme_get_ctrl(&ctrl->ctrl);
  
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
@@@ -2006,7 -2031,7 +2042,7 @@@ static void nvme_rdma_remove_one(struc
                dev_info(ctrl->ctrl.device,
                        "Removing ctrl: NQN \"%s\", addr %pISp\n",
                        ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
-               __nvme_rdma_del_ctrl(ctrl);
+               nvme_delete_ctrl(&ctrl->ctrl);
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);
  
diff --combined drivers/nvme/target/core.c
index 645ba7eee35db7a66a0249d39c7adba514173229,22a2a2bb40f9e58a04902e8f5d617be39af15332..b54748ad5f4800cdda6ee83372fd16f2eb574fd3
@@@ -57,6 -57,17 +57,17 @@@ u16 nvmet_copy_from_sgl(struct nvmet_re
        return 0;
  }
  
+ static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
+ {
+       struct nvmet_ns *ns;
+       if (list_empty(&subsys->namespaces))
+               return 0;
+       ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
+       return ns->nsid;
+ }
  static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
  {
        return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
@@@ -334,6 -345,8 +345,8 @@@ void nvmet_ns_disable(struct nvmet_ns *
  
        ns->enabled = false;
        list_del_rcu(&ns->dev_link);
+       if (ns->nsid == subsys->max_nsid)
+               subsys->max_nsid = nvmet_max_nsid(subsys);
        mutex_unlock(&subsys->lock);
  
        /*
@@@ -387,21 -400,12 +400,21 @@@ struct nvmet_ns *nvmet_ns_alloc(struct 
  
  static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
  {
 +      u32 old_sqhd, new_sqhd;
 +      u16 sqhd;
 +
        if (status)
                nvmet_set_status(req, status);
  
 -      if (req->sq->size)
 -              req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size;
 -      req->rsp->sq_head = cpu_to_le16(req->sq->sqhd);
 +      if (req->sq->size) {
 +              do {
 +                      old_sqhd = req->sq->sqhd;
 +                      new_sqhd = (old_sqhd + 1) % req->sq->size;
 +              } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
 +                                      old_sqhd);
 +      }
 +      sqhd = req->sq->sqhd & 0x0000FFFF;
 +      req->rsp->sq_head = cpu_to_le16(sqhd);
        req->rsp->sq_id = cpu_to_le16(req->sq->qid);
        req->rsp->command_id = req->cmd->common.command_id;
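The new submission-queue-head update in __nvmet_req_complete() above is a lock-free compare-and-swap retry loop on a wrapping counter: sqhd is widened to u32 (see the nvmet.h hunk below) so cmpxchg() can be used, and only the low 16 bits are placed in the completion entry. A self-contained userspace analogue using the GCC/Clang __atomic builtins instead of the kernel's cmpxchg(); sqhd_advance is a made-up name:

#include <stdint.h>

static uint32_t sqhd_advance(uint32_t *sqhd, uint32_t size)
{
        uint32_t old = __atomic_load_n(sqhd, __ATOMIC_RELAXED);
        uint32_t next;

        do {
                next = (old + 1) % size;        /* wrap at the queue size */
        } while (!__atomic_compare_exchange_n(sqhd, &old, next, 0,
                                              __ATOMIC_RELAXED,
                                              __ATOMIC_RELAXED));

        return next & 0xffff;                   /* low 16 bits go on the wire */
}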
  
@@@ -497,6 -501,7 +510,7 @@@ bool nvmet_req_init(struct nvmet_req *r
        req->ops = ops;
        req->sg = NULL;
        req->sg_cnt = 0;
+       req->transfer_len = 0;
        req->rsp->status = 0;
  
        /* no support for fused commands yet */
@@@ -546,6 -551,15 +560,15 @@@ void nvmet_req_uninit(struct nvmet_req 
  }
  EXPORT_SYMBOL_GPL(nvmet_req_uninit);
  
+ void nvmet_req_execute(struct nvmet_req *req)
+ {
+       if (unlikely(req->data_len != req->transfer_len))
+               nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
+       else
+               req->execute(req);
+ }
+ EXPORT_SYMBOL_GPL(nvmet_req_execute);
  static inline bool nvmet_cc_en(u32 cc)
  {
        return (cc >> NVME_CC_EN_SHIFT) & 0x1;
diff --combined drivers/nvme/target/nvmet.h
index 87e429bfcd8a0c918f2aae018c247bb0014d3d0b,194ebffc688c3b38793d59d1afaa9bb383dad82c..417f6c0331cc9c45311ddd85b118b327cce16f77
@@@ -74,7 -74,7 +74,7 @@@ struct nvmet_sq 
        struct percpu_ref       ref;
        u16                     qid;
        u16                     size;
 -      u16                     sqhd;
 +      u32                     sqhd;
        struct completion       free_done;
        struct completion       confirm_done;
  };
@@@ -223,7 -223,10 +223,10 @@@ struct nvmet_req 
        struct bio              inline_bio;
        struct bio_vec          inline_bvec[NVMET_MAX_INLINE_BIOVEC];
        int                     sg_cnt;
+       /* data length as parsed from the command: */
        size_t                  data_len;
+       /* data length as parsed from the SGL descriptor: */
+       size_t                  transfer_len;
  
        struct nvmet_port       *port;
  
@@@ -266,6 -269,7 +269,7 @@@ u16 nvmet_parse_fabrics_cmd(struct nvme
  bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
                struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
  void nvmet_req_uninit(struct nvmet_req *req);
+ void nvmet_req_execute(struct nvmet_req *req);
  void nvmet_req_complete(struct nvmet_req *req, u16 status);
  
  void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
@@@ -314,7 -318,7 +318,7 @@@ u16 nvmet_copy_from_sgl(struct nvmet_re
  u32 nvmet_get_log_page_len(struct nvme_command *cmd);
  
  #define NVMET_QUEUE_SIZE      1024
- #define NVMET_NR_QUEUES               64
+ #define NVMET_NR_QUEUES               128
  #define NVMET_MAX_CMD         NVMET_QUEUE_SIZE
  #define NVMET_KAS             10
  #define NVMET_DISC_KATO               120
diff --combined drivers/scsi/scsi_lib.c
index bcc1694cebcd3e184f40bba43f3a2200ea56c6e8,f907e2f8c1ddb83fdff588cec9fc0daedd5dbc72..54de24c785dd9573593845d53ba1a4bf77500de9
@@@ -252,9 -252,9 +252,9 @@@ int scsi_execute(struct scsi_device *sd
        struct scsi_request *rq;
        int ret = DRIVER_ERROR << 24;
  
-       req = blk_get_request(sdev->request_queue,
+       req = blk_get_request_flags(sdev->request_queue,
                        data_direction == DMA_TO_DEVICE ?
-                       REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
+                       REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT);
        if (IS_ERR(req))
                return ret;
        rq = scsi_req(req);
        rq->retries = retries;
        req->timeout = timeout;
        req->cmd_flags |= flags;
-       req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
+       req->rq_flags |= rq_flags | RQF_QUIET;
  
        /*
         * head injection *required* here otherwise quiesce won't work
@@@ -1301,7 -1301,7 +1301,7 @@@ scsi_prep_state_check(struct scsi_devic
                        /*
                         * If the devices is blocked we defer normal commands.
                         */
-                       if (!(req->rq_flags & RQF_PREEMPT))
+                       if (req && !(req->rq_flags & RQF_PREEMPT))
                                ret = BLKPREP_DEFER;
                        break;
                default:
                         * special commands.  In particular any user initiated
                         * command is not allowed.
                         */
-                       if (!(req->rq_flags & RQF_PREEMPT))
+                       if (req && !(req->rq_flags & RQF_PREEMPT))
                                ret = BLKPREP_KILL;
                        break;
                }
@@@ -1379,6 -1379,8 +1379,6 @@@ static int scsi_prep_fn(struct request_
  
        ret = scsi_setup_cmnd(sdev, req);
  out:
 -      if (ret != BLKPREP_OK)
 -              cmd->flags &= ~SCMD_INITIALIZED;
        return scsi_prep_return(q, req, ret);
  }
  
@@@ -1898,6 -1900,7 +1898,6 @@@ static int scsi_mq_prep_fn(struct reque
        struct scsi_device *sdev = req->q->queuedata;
        struct Scsi_Host *shost = sdev->host;
        struct scatterlist *sg;
 -      int ret;
  
        scsi_init_command(sdev, cmd);
  
  
        blk_mq_start_request(req);
  
 -      ret = scsi_setup_cmnd(sdev, req);
 -      if (ret != BLK_STS_OK)
 -              cmd->flags &= ~SCMD_INITIALIZED;
 -      return ret;
 +      return scsi_setup_cmnd(sdev, req);
  }
  
  static void scsi_mq_done(struct scsi_cmnd *cmd)
        blk_mq_complete_request(cmd->request);
  }
  
+ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
+ {
+       struct request_queue *q = hctx->queue;
+       struct scsi_device *sdev = q->queuedata;
+       atomic_dec(&sdev->device_busy);
+       put_device(&sdev->sdev_gendev);
+ }
+ static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+ {
+       struct request_queue *q = hctx->queue;
+       struct scsi_device *sdev = q->queuedata;
+       if (!get_device(&sdev->sdev_gendev))
+               goto out;
+       if (!scsi_dev_queue_ready(q, sdev))
+               goto out_put_device;
+       return true;
+ out_put_device:
+       put_device(&sdev->sdev_gendev);
+ out:
+       return false;
+ }
  static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
  {
  
        ret = prep_to_mq(scsi_prep_state_check(sdev, req));
        if (ret != BLK_STS_OK)
-               goto out;
+               goto out_put_budget;
  
        ret = BLK_STS_RESOURCE;
-       if (!get_device(&sdev->sdev_gendev))
-               goto out;
-       if (!scsi_dev_queue_ready(q, sdev))
-               goto out_put_device;
        if (!scsi_target_queue_ready(shost, sdev))
-               goto out_dec_device_busy;
+               goto out_put_budget;
        if (!scsi_host_queue_ready(q, shost, sdev))
                goto out_dec_target_busy;
  
        return BLK_STS_OK;
  
  out_dec_host_busy:
-       atomic_dec(&shost->host_busy);
+        atomic_dec(&shost->host_busy);
  out_dec_target_busy:
        if (scsi_target(sdev)->can_queue > 0)
                atomic_dec(&scsi_target(sdev)->target_busy);
- out_dec_device_busy:
-       atomic_dec(&sdev->device_busy);
- out_put_device:
-       put_device(&sdev->sdev_gendev);
- out:
+ out_put_budget:
+       scsi_mq_put_budget(hctx);
        switch (ret) {
        case BLK_STS_OK:
                break;
@@@ -2205,6 -2230,8 +2224,8 @@@ struct request_queue *scsi_old_alloc_qu
  }
  
  static const struct blk_mq_ops scsi_mq_ops = {
+       .get_budget     = scsi_mq_get_budget,
+       .put_budget     = scsi_mq_put_budget,
        .queue_rq       = scsi_queue_rq,
        .complete       = scsi_softirq_done,
        .timeout        = scsi_timeout,
@@@ -2685,6 -2712,7 +2706,6 @@@ scsi_device_set_state(struct scsi_devic
  
        }
        sdev->sdev_state = state;
 -      sysfs_notify(&sdev->sdev_gendev.kobj, NULL, "state");
        return 0;
  
   illegal:
@@@ -2919,21 -2947,37 +2940,37 @@@ static void scsi_wait_for_queuecommand(
  int
  scsi_device_quiesce(struct scsi_device *sdev)
  {
+       struct request_queue *q = sdev->request_queue;
        int err;
  
+       /*
+        * It is allowed to call scsi_device_quiesce() multiple times from
+        * the same context but concurrent scsi_device_quiesce() calls are
+        * not allowed.
+        */
+       WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
+       blk_set_preempt_only(q);
+       blk_mq_freeze_queue(q);
+       /*
+        * Ensure that the effect of blk_set_preempt_only() will be visible
+        * for percpu_ref_tryget() callers that occur after the queue
+        * unfreeze even if the queue was already frozen before this function
+        * was called. See also https://lwn.net/Articles/573497/.
+        */
+       synchronize_rcu();
+       blk_mq_unfreeze_queue(q);
        mutex_lock(&sdev->state_mutex);
        err = scsi_device_set_state(sdev, SDEV_QUIESCE);
+       if (err == 0)
+               sdev->quiesced_by = current;
+       else
+               blk_clear_preempt_only(q);
        mutex_unlock(&sdev->state_mutex);
  
-       if (err)
-               return err;
-       scsi_run_queue(sdev->request_queue);
-       while (atomic_read(&sdev->device_busy)) {
-               msleep_interruptible(200);
-               scsi_run_queue(sdev->request_queue);
-       }
-       return 0;
+       return err;
  }
  EXPORT_SYMBOL(scsi_device_quiesce);
  
@@@ -2953,9 -2997,11 +2990,11 @@@ void scsi_device_resume(struct scsi_dev
         * device deleted during suspend)
         */
        mutex_lock(&sdev->state_mutex);
-       if (sdev->sdev_state == SDEV_QUIESCE &&
-           scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
-               scsi_run_queue(sdev->request_queue);
+       WARN_ON_ONCE(!sdev->quiesced_by);
+       sdev->quiesced_by = NULL;
+       blk_clear_preempt_only(sdev->request_queue);
+       if (sdev->sdev_state == SDEV_QUIESCE)
+               scsi_device_set_state(sdev, SDEV_RUNNING);
        mutex_unlock(&sdev->state_mutex);
  }
  EXPORT_SYMBOL(scsi_device_resume);
@@@ -3108,6 -3154,7 +3147,6 @@@ int scsi_internal_device_unblock_nowait
        case SDEV_BLOCK:
        case SDEV_TRANSPORT_OFFLINE:
                sdev->sdev_state = new_state;
 -              sysfs_notify(&sdev->sdev_gendev.kobj, NULL, "state");
                break;
        case SDEV_CREATED_BLOCK:
                if (new_state == SDEV_TRANSPORT_OFFLINE ||
                        sdev->sdev_state = new_state;
                else
                        sdev->sdev_state = SDEV_CREATED;
 -              sysfs_notify(&sdev->sdev_gendev.kobj, NULL, "state");
                break;
        case SDEV_CANCEL:
        case SDEV_OFFLINE:
diff --combined drivers/scsi/sg.c
index aa28874e8fb92f5090d64c9ceb9523fce224eabe,92fd870e13156bb46fb42ac4da77000c11dc00b4..f098877eed4aa83fb4c1245e49b378a7699389d5
@@@ -217,7 -217,7 +217,7 @@@ static int sg_allow_access(struct file 
        if (sfp->parentdp->device->type == TYPE_SCANNER)
                return 0;
  
-       return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE);
+       return blk_verify_command(cmd, filp->f_mode);
  }
  
  static int
@@@ -837,7 -837,7 +837,7 @@@ sg_fill_request_table(Sg_fd *sfp, sg_re
  
        val = 0;
        list_for_each_entry(srp, &sfp->rq_list, entry) {
 -              if (val > SG_MAX_QUEUE)
 +              if (val >= SG_MAX_QUEUE)
                        break;
                rinfo[val].req_state = srp->done + 1;
                rinfo[val].problem =
diff --combined fs/block_dev.c
index 789f55e851aeffb6b1212403188638d12a1d2540,04973f48442243a441364b2f4cf75b1c8d50257c..4a181fcb51751dc2cbc8fda10930a47bc883380e
@@@ -54,18 -54,6 +54,6 @@@ struct block_device *I_BDEV(struct inod
  }
  EXPORT_SYMBOL(I_BDEV);
  
- void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
- {
-       struct va_format vaf;
-       va_list args;
-       va_start(args, fmt);
-       vaf.fmt = fmt;
-       vaf.va = &args;
-       printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
-       va_end(args);
- }
  static void bdev_write_inode(struct block_device *bdev)
  {
        struct inode *inode = bdev->bd_inode;
@@@ -249,7 -237,7 +237,7 @@@ __blkdev_direct_IO_simple(struct kiocb 
                if (!READ_ONCE(bio.bi_private))
                        break;
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_mq_poll(bdev_get_queue(bdev), qc))
+                   !blk_poll(bdev_get_queue(bdev), qc))
                        io_schedule();
        }
        __set_current_state(TASK_RUNNING);
@@@ -414,7 -402,7 +402,7 @@@ __blkdev_direct_IO(struct kiocb *iocb, 
                        break;
  
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_mq_poll(bdev_get_queue(bdev), qc))
+                   !blk_poll(bdev_get_queue(bdev), qc))
                        io_schedule();
        }
        __set_current_state(TASK_RUNNING);
@@@ -674,7 -662,7 +662,7 @@@ int bdev_read_page(struct block_device 
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return result;
  
-       result = blk_queue_enter(bdev->bd_queue, false);
+       result = blk_queue_enter(bdev->bd_queue, 0);
        if (result)
                return result;
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@@ -710,18 -698,16 +698,18 @@@ int bdev_write_page(struct block_devic
  
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
-       result = blk_queue_enter(bdev->bd_queue, false);
+       result = blk_queue_enter(bdev->bd_queue, 0);
        if (result)
                return result;
  
        set_page_writeback(page);
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
 -      if (result)
 +      if (result) {
                end_page_writeback(page);
 -      else
 +      } else {
 +              clean_page_buffers(page);
                unlock_page(page);
 +      }
        blk_queue_exit(bdev->bd_queue);
        return result;
  }
diff --combined fs/buffer.c
index 49b7e9bdcd1d34b0815142991e956f3d6fb3d6b2,bcabb69e7462e23caf2eab6cdb67c3bbb25b2410..1c18a22a6013b2c0ff3b4c0b31ac416f31973ca4
@@@ -252,27 -252,6 +252,6 @@@ out
        return ret;
  }
  
- /*
-  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
-  */
- static void free_more_memory(void)
- {
-       struct zoneref *z;
-       int nid;
-       wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
-       yield();
-       for_each_online_node(nid) {
-               z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-                                               gfp_zone(GFP_NOFS), NULL);
-               if (z->zone)
-                       try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-                                               GFP_NOFS, NULL);
-       }
- }
  /*
   * I/O completion handler for block_read_full_page() - pages
   * which come unlocked at the end of I/O.
@@@ -861,16 -840,19 +840,19 @@@ int remove_inode_buffers(struct inode *
   * which may not fail from ordinary buffer allocations.
   */
  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-               int retry)
+               bool retry)
  {
        struct buffer_head *bh, *head;
+       gfp_t gfp = GFP_NOFS;
        long offset;
  
- try_again:
+       if (retry)
+               gfp |= __GFP_NOFAIL;
        head = NULL;
        offset = PAGE_SIZE;
        while ((offset -= size) >= 0) {
-               bh = alloc_buffer_head(GFP_NOFS);
+               bh = alloc_buffer_head(gfp);
                if (!bh)
                        goto no_grow;
  
@@@ -896,23 -878,7 +878,7 @@@ no_grow
                } while (head);
        }
  
-       /*
-        * Return failure for non-async IO requests.  Async IO requests
-        * are not allowed to fail, so we have to wait until buffer heads
-        * become available.  But we don't want tasks sleeping with 
-        * partially complete buffers, so all were released above.
-        */
-       if (!retry)
-               return NULL;
-       /* We're _really_ low on memory. Now we just
-        * wait for old buffer heads to become free due to
-        * finishing IO.  Since this is an async request and
-        * the reserve list is empty, we're sure there are 
-        * async buffer heads in use.
-        */
-       free_more_memory();
-       goto try_again;
+       return NULL;
  }
  EXPORT_SYMBOL_GPL(alloc_page_buffers);
  
@@@ -1001,8 -967,6 +967,6 @@@ grow_dev_page(struct block_device *bdev
        gfp_mask |= __GFP_NOFAIL;
  
        page = find_or_create_page(inode->i_mapping, index, gfp_mask);
-       if (!page)
-               return ret;
  
        BUG_ON(!PageLocked(page));
  
        /*
         * Allocate some buffers for this page
         */
-       bh = alloc_page_buffers(page, size, 0);
-       if (!bh)
-               goto failed;
+       bh = alloc_page_buffers(page, size, true);
  
        /*
         * Link the page to the buffers and initialise them.  Take the
@@@ -1103,8 -1065,6 +1065,6 @@@ __getblk_slow(struct block_device *bdev
                ret = grow_buffers(bdev, block, size, gfp);
                if (ret < 0)
                        return NULL;
-               if (ret == 0)
-                       free_more_memory();
        }
  }
  
@@@ -1575,7 -1535,7 +1535,7 @@@ void create_empty_buffers(struct page *
  {
        struct buffer_head *bh, *head, *tail;
  
-       head = alloc_page_buffers(page, blocksize, 1);
+       head = alloc_page_buffers(page, blocksize, true);
        bh = head;
        do {
                bh->b_state |= b_state;
@@@ -1692,8 -1652,7 +1652,8 @@@ static struct buffer_head *create_page_
        BUG_ON(!PageLocked(page));
  
        if (!page_has_buffers(page))
 -              create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
 +              create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
 +                                   b_state);
        return page_buffers(page);
  }
  
@@@ -1979,8 -1938,8 +1939,8 @@@ iomap_to_bh(struct inode *inode, sector
        case IOMAP_MAPPED:
                if (offset >= i_size_read(inode))
                        set_buffer_new(bh);
 -              bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
 -                              ((offset - iomap->offset) >> inode->i_blkbits);
 +              bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
 +                              inode->i_blkbits;
                set_buffer_mapped(bh);
                break;
        }
@@@ -2639,7 -2598,7 +2599,7 @@@ int nobh_write_begin(struct address_spa
         * Be careful: the buffer linked list is a NULL terminated one, rather
         * than the circular one we're used to.
         */
-       head = alloc_page_buffers(page, blocksize, 0);
+       head = alloc_page_buffers(page, blocksize, false);
        if (!head) {
                ret = -ENOMEM;
                goto out_release;
@@@ -3056,8 -3015,16 +3016,16 @@@ void guard_bio_eod(int op, struct bio *
        sector_t maxsector;
        struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
        unsigned truncated_bytes;
+       struct hd_struct *part;
+       rcu_read_lock();
+       part = __disk_get_part(bio->bi_disk, bio->bi_partno);
+       if (part)
+               maxsector = part_nr_sects_read(part);
+       else
+               maxsector = get_capacity(bio->bi_disk);
+       rcu_read_unlock();
  
-       maxsector = get_capacity(bio->bi_disk);
        if (!maxsector)
                return;
  
diff --combined fs/direct-io.c
index 98fe1325da9d07e52135eb728c3d0dd381d35649,d2bc339cb1e98cb8c29808758db912c0752580d4..3aafb3343a65c76fb66211228564d73ce9097485
   */
  #define DIO_PAGES     64
  
 +/*
 + * Flags for dio_complete()
 + */
 +#define DIO_COMPLETE_ASYNC            0x01    /* This is async IO */
 +#define DIO_COMPLETE_INVALIDATE               0x02    /* Can invalidate pages */
 +
  /*
   * This code generally works in units of "dio_blocks".  A dio_block is
   * somewhere between the hard sector size and the filesystem block size.  it
@@@ -231,7 -225,7 +231,7 @@@ static inline struct page *dio_get_page
   * filesystems can use it to hold additional state between get_block calls and
   * dio_complete.
   */
 -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
  {
        loff_t offset = dio->iocb->ki_pos;
        ssize_t transferred = 0;
        if (ret == 0)
                ret = transferred;
  
 +      if (dio->end_io) {
 +              // XXX: ki_pos??
 +              err = dio->end_io(dio->iocb, offset, ret, dio->private);
 +              if (err)
 +                      ret = err;
 +      }
 +
        /*
         * Try again to invalidate clean pages which might have been cached by
         * non-direct readahead, or faulted in by get_user_pages() if the source
         * of the write was an mmap'ed region of the file we're writing.  Either
         * one is a pretty crazy thing to do, so we don't support it 100%.  If
         * this invalidation fails, tough, the write still worked...
 +       *
 +       * And this page cache invalidation has to be after dio->end_io(), as
 +       * some filesystems convert unwritten extents to real allocations in
 +       * end_io() when necessary, otherwise a racing buffer read would cache
 +       * zeros from unwritten extents.
         */
 -      if (ret > 0 && dio->op == REQ_OP_WRITE &&
 +      if (flags & DIO_COMPLETE_INVALIDATE &&
 +          ret > 0 && dio->op == REQ_OP_WRITE &&
            dio->inode->i_mapping->nrpages) {
                err = invalidate_inode_pages2_range(dio->inode->i_mapping,
                                        offset >> PAGE_SHIFT,
                WARN_ON_ONCE(err);
        }
  
 -      if (dio->end_io) {
 -
 -              // XXX: ki_pos??
 -              err = dio->end_io(dio->iocb, offset, ret, dio->private);
 -              if (err)
 -                      ret = err;
 -      }
 -
        if (!(dio->flags & DIO_SKIP_DIO_COUNT))
                inode_dio_end(dio->inode);
  
 -      if (is_async) {
 +      if (flags & DIO_COMPLETE_ASYNC) {
                /*
                 * generic_write_sync expects ki_pos to have been updated
                 * already, but the submission path only does this for
@@@ -317,7 -306,7 +317,7 @@@ static void dio_aio_complete_work(struc
  {
        struct dio *dio = container_of(work, struct dio, complete_work);
  
 -      dio_complete(dio, 0, true);
 +      dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
  }
  
  static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
@@@ -359,7 -348,7 +359,7 @@@ static void dio_bio_end_aio(struct bio 
                        queue_work(dio->inode->i_sb->s_dio_done_wq,
                                   &dio->complete_work);
                } else {
 -                      dio_complete(dio, 0, true);
 +                      dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
                }
        }
  }
@@@ -497,7 -486,7 +497,7 @@@ static struct bio *dio_await_one(struc
                dio->waiter = current;
                spin_unlock_irqrestore(&dio->bio_lock, flags);
                if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie))
+                   !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
                        io_schedule();
                /* wake up sets us TASK_RUNNING */
                spin_lock_irqsave(&dio->bio_lock, flags);
@@@ -877,8 -866,7 +877,8 @@@ out
         */
        if (sdio->boundary) {
                ret = dio_send_cur_page(dio, sdio, map_bh);
 -              dio_bio_submit(dio, sdio);
 +              if (sdio->bio)
 +                      dio_bio_submit(dio, sdio);
                put_page(sdio->cur_page);
                sdio->cur_page = NULL;
        }
@@@ -1152,7 -1140,7 +1152,7 @@@ do_blockdev_direct_IO(struct kiocb *ioc
                      get_block_t get_block, dio_iodone_t end_io,
                      dio_submit_t submit_io, int flags)
  {
 -      unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
 +      unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
        unsigned blkbits = i_blkbits;
        unsigned blocksize_mask = (1 << blkbits) - 1;
        ssize_t retval = -EINVAL;
                dio_await_completion(dio);
  
        if (drop_refcount(dio) == 0) {
 -              retval = dio_complete(dio, retval, false);
 +              retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
        } else
                BUG_ON(retval != -EIOCBQUEUED);
  
diff --combined fs/iomap.c
index 5011a964a5501ece5a42185d666ae9c6a69d9624,4241bac905b19bea8b66c17cd684ab28906743a7..b9f74803e56c08ce9655c96c5c32260e0e925efd
@@@ -350,8 -350,8 +350,8 @@@ static int iomap_zero(struct inode *ino
  static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
                struct iomap *iomap)
  {
 -      sector_t sector = iomap->blkno +
 -              (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
 +      sector_t sector = (iomap->addr +
 +                         (pos & PAGE_MASK) - iomap->offset) >> 9;
  
        return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
                        offset, bytes);
@@@ -510,12 -510,11 +510,12 @@@ static int iomap_to_fiemap(struct fiema
                flags |= FIEMAP_EXTENT_MERGED;
        if (iomap->flags & IOMAP_F_SHARED)
                flags |= FIEMAP_EXTENT_SHARED;
 +      if (iomap->flags & IOMAP_F_DATA_INLINE)
 +              flags |= FIEMAP_EXTENT_DATA_INLINE;
  
        return fiemap_fill_next_extent(fi, iomap->offset,
 -                      iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
 +                      iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
                        iomap->length, flags);
 -
  }
  
  static loff_t
@@@ -715,9 -714,23 +715,9 @@@ static ssize_t iomap_dio_complete(struc
  {
        struct kiocb *iocb = dio->iocb;
        struct inode *inode = file_inode(iocb->ki_filp);
 +      loff_t offset = iocb->ki_pos;
        ssize_t ret;
  
 -      /*
 -       * Try again to invalidate clean pages which might have been cached by
 -       * non-direct readahead, or faulted in by get_user_pages() if the source
 -       * of the write was an mmap'ed region of the file we're writing.  Either
 -       * one is a pretty crazy thing to do, so we don't support it 100%.  If
 -       * this invalidation fails, tough, the write still worked...
 -       */
 -      if (!dio->error &&
 -          (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
 -              ret = invalidate_inode_pages2_range(inode->i_mapping,
 -                              iocb->ki_pos >> PAGE_SHIFT,
 -                              (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
 -              WARN_ON_ONCE(ret);
 -      }
 -
        if (dio->end_io) {
                ret = dio->end_io(iocb,
                                dio->error ? dio->error : dio->size,
        if (likely(!ret)) {
                ret = dio->size;
                /* check for short read */
 -              if (iocb->ki_pos + ret > dio->i_size &&
 +              if (offset + ret > dio->i_size &&
                    !(dio->flags & IOMAP_DIO_WRITE))
 -                      ret = dio->i_size - iocb->ki_pos;
 +                      ret = dio->i_size - offset;
                iocb->ki_pos += ret;
        }
  
 +      /*
 +       * Try again to invalidate clean pages which might have been cached by
 +       * non-direct readahead, or faulted in by get_user_pages() if the source
 +       * of the write was an mmap'ed region of the file we're writing.  Either
 +       * one is a pretty crazy thing to do, so we don't support it 100%.  If
 +       * this invalidation fails, tough, the write still worked...
 +       *
 +       * And this page cache invalidation has to be after dio->end_io(), as
 +       * some filesystems convert unwritten extents to real allocations in
 +       * end_io() when necessary, otherwise a racing buffer read would cache
 +       * zeros from unwritten extents.
 +       */
 +      if (!dio->error &&
 +          (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
 +              int err;
 +              err = invalidate_inode_pages2_range(inode->i_mapping,
 +                              offset >> PAGE_SHIFT,
 +                              (offset + dio->size - 1) >> PAGE_SHIFT);
 +              WARN_ON_ONCE(err);
 +      }
 +
        inode_dio_end(file_inode(iocb->ki_filp));
        kfree(dio);
  
@@@ -831,7 -823,7 +831,7 @@@ iomap_dio_zero(struct iomap_dio *dio, s
        bio = bio_alloc(GFP_KERNEL, 1);
        bio_set_dev(bio, iomap->bdev);
        bio->bi_iter.bi_sector =
 -              iomap->blkno + ((pos - iomap->offset) >> 9);
 +              (iomap->addr + pos - iomap->offset) >> 9;
        bio->bi_private = dio;
        bio->bi_end_io = iomap_dio_bio_end_io;
  
@@@ -910,7 -902,7 +910,7 @@@ iomap_dio_actor(struct inode *inode, lo
                bio = bio_alloc(GFP_KERNEL, nr_pages);
                bio_set_dev(bio, iomap->bdev);
                bio->bi_iter.bi_sector =
 -                      iomap->blkno + ((pos - iomap->offset) >> 9);
 +                      (iomap->addr + pos - iomap->offset) >> 9;
                bio->bi_write_hint = dio->iocb->ki_hint;
                bio->bi_private = dio;
                bio->bi_end_io = iomap_dio_bio_end_io;
@@@ -1017,13 -1009,6 +1017,13 @@@ iomap_dio_rw(struct kiocb *iocb, struc
        WARN_ON_ONCE(ret);
        ret = 0;
  
 +      if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
 +          !inode->i_sb->s_dio_done_wq) {
 +              ret = sb_init_dio_done_wq(inode->i_sb);
 +              if (ret < 0)
 +                      goto out_free_dio;
 +      }
 +
        inode_dio_begin(inode);
  
        blk_start_plug(&plug);
        if (ret < 0)
                iomap_dio_set_error(dio, ret);
  
 -      if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
 -                      !inode->i_sb->s_dio_done_wq) {
 -              ret = sb_init_dio_done_wq(inode->i_sb);
 -              if (ret < 0)
 -                      iomap_dio_set_error(dio, ret);
 -      }
 -
        if (!atomic_dec_and_test(&dio->ref)) {
                if (!is_sync_kiocb(iocb))
                        return -EIOCBQUEUED;
  
                        if (!(iocb->ki_flags & IOCB_HIPRI) ||
                            !dio->submit.last_queue ||
-                           !blk_mq_poll(dio->submit.last_queue,
+                           !blk_poll(dio->submit.last_queue,
                                         dio->submit.cookie))
                                io_schedule();
                }
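
The ordering comment added above is worth a concrete illustration. Below is a minimal sketch of the same completion ordering, written against made-up names (my_dio, my_dio_complete) rather than the real fs/iomap.c structures: the filesystem's end_io hook runs first, and only then is the written range kicked out of the page cache.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/bug.h>

/* Illustrative only, not the in-tree iomap code. */
struct my_dio {
        struct kiocb    *iocb;
        loff_t          offset;         /* starting file offset of the write */
        size_t          size;           /* bytes transferred */
        int             error;
        int             (*end_io)(struct kiocb *iocb, ssize_t ret, unsigned flags);
};

static ssize_t my_dio_complete(struct my_dio *dio)
{
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        ssize_t ret = dio->error ? dio->error : dio->size;

        /* 1. Let the filesystem convert unwritten extents first. */
        if (dio->end_io)
                dio->end_io(dio->iocb, ret, 0);

        /*
         * 2. Only now drop clean pages covering the written range, so a
         *    racing buffered read cannot cache zeros from extents that
         *    were still unwritten when the write completed.
         */
        if (!dio->error && inode->i_mapping->nrpages) {
                int err = invalidate_inode_pages2_range(inode->i_mapping,
                                dio->offset >> PAGE_SHIFT,
                                (dio->offset + dio->size - 1) >> PAGE_SHIFT);
                WARN_ON_ONCE(err);
        }
        return ret;
}
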
diff --combined fs/sync.c
index 83ac79a960dd1aea9aa79932bbb08de662e7abab,09f96a18dd930b48a8cc7ec9760c52f97c2f1c6a..6e0a2cbaf6dedb495b91e16f16ab1d44eee3ea46
+++ b/fs/sync.c
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   * High-level sync()-related operations
   */
@@@ -109,7 -108,7 +109,7 @@@ SYSCALL_DEFINE0(sync
  {
        int nowait = 0, wait = 1;
  
-       wakeup_flusher_threads(0, WB_REASON_SYNC);
+       wakeup_flusher_threads(WB_REASON_SYNC);
        iterate_supers(sync_inodes_one_sb, NULL);
        iterate_supers(sync_fs_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &wait);
diff --combined include/linux/backing-dev-defs.h
index fff4cfa0c21df2117f99d93f7d6c7130d02c9d87,b7c7be6f5986ac7a3adba7f23c21a92ecd34a270..bfe86b54f6c149a6c8718f13417ecf7a796b7193
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef __LINUX_BACKING_DEV_DEFS_H
  #define __LINUX_BACKING_DEV_DEFS_H
  
@@@ -25,6 -24,7 +25,7 @@@ enum wb_state 
        WB_shutting_down,       /* wb_shutdown() in progress */
        WB_writeback_running,   /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
+       WB_start_all,           /* nr_pages == 0 (all) work pending */
  };
  
  enum wb_congested_state {
@@@ -44,6 -44,28 +45,28 @@@ enum wb_stat_item 
  
  #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
  
+ /*
+  * why some writeback work was initiated
+  */
+ enum wb_reason {
+       WB_REASON_BACKGROUND,
+       WB_REASON_VMSCAN,
+       WB_REASON_SYNC,
+       WB_REASON_PERIODIC,
+       WB_REASON_LAPTOP_TIMER,
+       WB_REASON_FREE_MORE_MEM,
+       WB_REASON_FS_FREE_SPACE,
+       /*
+        * There is no bdi forker thread any more and works are done
+        * by emergency worker, however, this is TPs userland visible
+        * and we'll be exposing exactly the same information,
+        * so it has a mismatch name.
+        */
+       WB_REASON_FORKER_THREAD,
+       WB_REASON_MAX,
+ };
  /*
   * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
   * wb's can operate mostly independently but should share the congested
@@@ -116,6 -138,7 +139,7 @@@ struct bdi_writeback 
  
        struct fprop_local_percpu completions;
        int dirty_exceeded;
+       enum wb_reason start_all_reason;
  
        spinlock_t work_lock;           /* protects work_list & dwork scheduling */
        struct list_head work_list;
diff --combined include/linux/backing-dev.h
index 16621579a3db313bf4a5f315a732ae6121e8b8c2,872afa41abc2d78b4d01b279ae0d92bf887adfaa..f41ca8486e0272c71a59f68d33394e72ac0407e3
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  /*
   * include/linux/backing-dev.h
   *
@@@ -39,8 -38,6 +39,6 @@@ static inline struct backing_dev_info *
        return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
  }
  
- void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
-                       bool range_cyclic, enum wb_reason reason);
  void wb_start_background_writeback(struct bdi_writeback *wb);
  void wb_workfn(struct work_struct *work);
  void wb_wakeup_delayed(struct bdi_writeback *wb);
@@@ -175,8 -172,6 +173,6 @@@ static inline int wb_congested(struct b
  
  long congestion_wait(int sync, long timeout);
  long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
- int pdflush_proc_obsolete(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp, loff_t *ppos);
  
  static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
  {
diff --combined include/linux/blk-cgroup.h
index 8bbc3716507ac254c38aebeda44fa69a300aa47d,f57e54d645297f3421a260f79a956b3866c83756..e9825ff57b155d75153a7217b2354fcbfda914a6
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _BLK_CGROUP_H
  #define _BLK_CGROUP_H
  /*
@@@ -20,6 -19,7 +20,7 @@@
  #include <linux/radix-tree.h>
  #include <linux/blkdev.h>
  #include <linux/atomic.h>
+ #include <linux/kthread.h>
  
  /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
  #define BLKG_STAT_CPU_BATCH   (INT_MAX / 2)
@@@ -224,22 -224,16 +225,16 @@@ static inline struct blkcg *css_to_blkc
        return css ? container_of(css, struct blkcg, css) : NULL;
  }
  
- static inline struct blkcg *task_blkcg(struct task_struct *tsk)
- {
-       return css_to_blkcg(task_css(tsk, io_cgrp_id));
- }
  static inline struct blkcg *bio_blkcg(struct bio *bio)
  {
+       struct cgroup_subsys_state *css;
        if (bio && bio->bi_css)
                return css_to_blkcg(bio->bi_css);
-       return task_blkcg(current);
- }
- static inline struct cgroup_subsys_state *
- task_get_blkcg_css(struct task_struct *task)
- {
-       return task_get_css(task, io_cgrp_id);
+       css = kthread_blkcg();
+       if (css)
+               return css_to_blkcg(css);
+       return css_to_blkcg(task_css(current, io_cgrp_id));
  }
  
  /**
@@@ -736,12 -730,6 +731,6 @@@ struct blkcg_policy 
  
  #define blkcg_root_css        ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
  
- static inline struct cgroup_subsys_state *
- task_get_blkcg_css(struct task_struct *task)
- {
-       return NULL;
- }
  #ifdef CONFIG_BLOCK
  
  static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
diff --combined include/linux/blk-mq.h
index 994cbb0f7ffca5a38771c4dd24eb46eccda6abde,eb1e2cdffb317a0ae7383c000e51b6b721de1417..95c9a5c862e2545b26922b3cbb2103200a29a888
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef BLK_MQ_H
  #define BLK_MQ_H
  
@@@ -31,10 -30,12 +31,12 @@@ struct blk_mq_hw_ctx 
  
        struct sbitmap          ctx_map;
  
+       struct blk_mq_ctx       *dispatch_from;
        struct blk_mq_ctx       **ctxs;
        unsigned int            nr_ctx;
  
-       wait_queue_entry_t              dispatch_wait;
+       wait_queue_entry_t      dispatch_wait;
        atomic_t                wait_index;
  
        struct blk_mq_tags      *tags;
@@@ -91,6 -92,8 +93,8 @@@ struct blk_mq_queue_data 
  
  typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
                const struct blk_mq_queue_data *);
+ typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
+ typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
  typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
  typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
  typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@@ -112,6 -115,15 +116,15 @@@ struct blk_mq_ops 
         */
        queue_rq_fn             *queue_rq;
  
+       /*
+        * Reserve budget before queue request, once .queue_rq is
+        * run, it is driver's responsibility to release the
+        * reserved budget. Also we have to handle failure case
+        * of .get_budget for avoiding I/O deadlock.
+        */
+       get_budget_fn           *get_budget;
+       put_budget_fn           *put_budget;
        /*
         * Called on request timeout
         */
@@@ -169,8 -181,7 +182,7 @@@ enum 
        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE     = 1,
        BLK_MQ_S_SCHED_RESTART  = 2,
-       BLK_MQ_S_TAG_WAITING    = 3,
-       BLK_MQ_S_START_ON_RUN   = 4,
+       BLK_MQ_S_START_ON_RUN   = 3,
  
        BLK_MQ_MAX_DEPTH        = 10240,
  
@@@ -198,15 -209,21 +210,21 @@@ void blk_mq_free_request(struct reques
  bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
  
  enum {
-       BLK_MQ_REQ_NOWAIT       = (1 << 0), /* return when out of requests */
-       BLK_MQ_REQ_RESERVED     = (1 << 1), /* allocate from reserved pool */
-       BLK_MQ_REQ_INTERNAL     = (1 << 2), /* allocate internal/sched tag */
+       /* return when out of requests */
+       BLK_MQ_REQ_NOWAIT       = (__force blk_mq_req_flags_t)(1 << 0),
+       /* allocate from reserved pool */
+       BLK_MQ_REQ_RESERVED     = (__force blk_mq_req_flags_t)(1 << 1),
+       /* allocate internal/sched tag */
+       BLK_MQ_REQ_INTERNAL     = (__force blk_mq_req_flags_t)(1 << 2),
+       /* set RQF_PREEMPT */
+       BLK_MQ_REQ_PREEMPT      = (__force blk_mq_req_flags_t)(1 << 3),
  };
  
  struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
-               unsigned int flags);
+               blk_mq_req_flags_t flags);
  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
-               unsigned int op, unsigned int flags, unsigned int hctx_idx);
+               unsigned int op, blk_mq_req_flags_t flags,
+               unsigned int hctx_idx);
  struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
  
  enum {
@@@ -249,7 -266,7 +267,7 @@@ void blk_mq_start_stopped_hw_queues(str
  void blk_mq_quiesce_queue(struct request_queue *q);
  void blk_mq_unquiesce_queue(struct request_queue *q);
  void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
- void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
+ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
  void blk_mq_run_hw_queues(struct request_queue *q, bool async);
  void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
  void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
@@@ -260,8 -277,8 +278,8 @@@ void blk_freeze_queue_start(struct requ
  void blk_mq_freeze_queue_wait(struct request_queue *q);
  int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout);
- int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
-                        int (reinit_request)(void *, struct request *));
+ int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data,
+               int (reinit_request)(void *, struct request *));
  
  int blk_mq_map_queues(struct blk_mq_tag_set *set);
  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
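
A rough illustration of the new .get_budget/.put_budget contract described in the comment above. The toy_* names are invented for this sketch and do not correspond to any in-tree driver; the point is that a driver which can only keep a bounded number of commands in flight returns false from .get_budget so blk-mq stops dispatching instead of deadlocking, and releases the budget itself once .queue_rq has run.

#include <linux/blk-mq.h>
#include <linux/atomic.h>

struct toy_hba {
        atomic_t        inflight;
        int             max_inflight;
};

static bool toy_get_budget(struct blk_mq_hw_ctx *hctx)
{
        struct toy_hba *hba = hctx->queue->queuedata;

        if (atomic_inc_return(&hba->inflight) > hba->max_inflight) {
                atomic_dec(&hba->inflight);
                return false;           /* no budget, don't dispatch */
        }
        return true;
}

static void toy_put_budget(struct blk_mq_hw_ctx *hctx)
{
        struct toy_hba *hba = hctx->queue->queuedata;

        atomic_dec(&hba->inflight);
}

static blk_status_t toy_queue_rq(struct blk_mq_hw_ctx *hctx,
                                 const struct blk_mq_queue_data *bd)
{
        /*
         * ... submit bd->rq to hardware here.  Per the comment above,
         * once .queue_rq has run, releasing the reserved budget (by
         * calling toy_put_budget()) is the driver's responsibility,
         * including on every failure path.
         */
        return BLK_STS_OK;
}

static const struct blk_mq_ops toy_mq_ops = {
        .queue_rq       = toy_queue_rq,
        .get_budget     = toy_get_budget,
        .put_budget     = toy_put_budget,
};
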
diff --combined include/linux/blk_types.h
index 96ac3815542c1484b6c9300dafdef53bacdf4903,13ccfc9b210ac78a2c99cecce68e1dce4eaf82fc..a1e628e032dad75bf1837a25e45b55a7f54ca2df
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  /*
   * Block data types and constants.  Directly include this file only to
   * break include dependency loop.
@@@ -163,6 -162,8 +163,8 @@@ struct bio 
   */
  #define BIO_RESET_BITS        BVEC_POOL_OFFSET
  
+ typedef __u32 __bitwise blk_mq_req_flags_t;
  /*
   * Operations and flags common to the bio and request structures.
   * We use 8 bits for encoding the operation, and the remaining 24 for flags.
@@@ -225,11 -226,14 +227,14 @@@ enum req_flag_bits 
        __REQ_PREFLUSH,         /* request for cache flush */
        __REQ_RAHEAD,           /* read ahead, can fail anytime */
        __REQ_BACKGROUND,       /* background IO */
+       __REQ_NOWAIT,           /* Don't wait if request will block */
  
        /* command specific flags for REQ_OP_WRITE_ZEROES: */
        __REQ_NOUNMAP,          /* do not free blocks when zeroing */
  
-       __REQ_NOWAIT,           /* Don't wait if request will block */
+       /* for driver use */
+       __REQ_DRV,
        __REQ_NR_BITS,          /* stops here */
  };
  
  #define REQ_PREFLUSH          (1ULL << __REQ_PREFLUSH)
  #define REQ_RAHEAD            (1ULL << __REQ_RAHEAD)
  #define REQ_BACKGROUND                (1ULL << __REQ_BACKGROUND)
+ #define REQ_NOWAIT            (1ULL << __REQ_NOWAIT)
  
  #define REQ_NOUNMAP           (1ULL << __REQ_NOUNMAP)
- #define REQ_NOWAIT            (1ULL << __REQ_NOWAIT)
+ #define REQ_DRV                       (1ULL << __REQ_DRV)
  
  #define REQ_FAILFAST_MASK \
        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@@ -330,11 -336,10 +337,10 @@@ static inline bool blk_qc_t_is_internal
  }
  
  struct blk_rq_stat {
-       s64 mean;
+       u64 mean;
        u64 min;
        u64 max;
-       s32 nr_samples;
-       s32 nr_batch;
+       u32 nr_samples;
        u64 batch;
  };
  
diff --combined include/linux/blkdev.h
index 8da66379f7ea7afceb9af2f32f998d7ca71c25a7,e80ea1d31343da36b808dfe9f547ad8466d8d091..8089ca17db9ac65998ec9cf82f65743bb5c5abb9
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _LINUX_BLKDEV_H
  #define _LINUX_BLKDEV_H
  
@@@ -267,6 -266,7 +267,7 @@@ struct blk_queue_ctx
  
  typedef void (request_fn_proc) (struct request_queue *q);
  typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
+ typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
  typedef int (prep_rq_fn) (struct request_queue *, struct request *);
  typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
  
@@@ -409,6 -409,7 +410,7 @@@ struct request_queue 
  
        request_fn_proc         *request_fn;
        make_request_fn         *make_request_fn;
+       poll_q_fn               *poll_fn;
        prep_rq_fn              *prep_rq_fn;
        unprep_rq_fn            *unprep_rq_fn;
        softirq_done_fn         *softirq_done_fn;
  #define QUEUE_FLAG_NOMERGES     5     /* disable merge attempts */
  #define QUEUE_FLAG_SAME_COMP  6       /* complete on same CPU-group */
  #define QUEUE_FLAG_FAIL_IO    7       /* fake timeout */
- #define QUEUE_FLAG_STACKABLE  8       /* supports request stacking */
  #define QUEUE_FLAG_NONROT     9       /* non-rotational device (SSD) */
  #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
  #define QUEUE_FLAG_IO_STAT     10     /* do IO stats */
  #define QUEUE_FLAG_REGISTERED  26     /* queue has been registered to a disk */
  #define QUEUE_FLAG_SCSI_PASSTHROUGH 27        /* queue supports SCSI commands */
  #define QUEUE_FLAG_QUIESCED    28     /* queue has been quiesced */
+ #define QUEUE_FLAG_PREEMPT_ONLY       29      /* only process REQ_PREEMPT requests */
  
  #define QUEUE_FLAG_DEFAULT    ((1 << QUEUE_FLAG_IO_STAT) |            \
-                                (1 << QUEUE_FLAG_STACKABLE)    |       \
                                 (1 << QUEUE_FLAG_SAME_COMP)    |       \
                                 (1 << QUEUE_FLAG_ADD_RANDOM))
  
  #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) |            \
-                                (1 << QUEUE_FLAG_STACKABLE)    |       \
                                 (1 << QUEUE_FLAG_SAME_COMP)    |       \
                                 (1 << QUEUE_FLAG_POLL))
  
@@@ -723,8 -722,6 +723,6 @@@ static inline void queue_flag_clear(uns
  #define blk_queue_nonrot(q)   test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
  #define blk_queue_io_stat(q)  test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
  #define blk_queue_add_random(q)       test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
- #define blk_queue_stackable(q)        \
-       test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
  #define blk_queue_discard(q)  test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
  #define blk_queue_secure_erase(q) \
        (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
        ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
                             REQ_FAILFAST_DRIVER))
  #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
+ #define blk_queue_preempt_only(q)                             \
+       test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags)
+ extern int blk_set_preempt_only(struct request_queue *q);
+ extern void blk_clear_preempt_only(struct request_queue *q);
  
  static inline bool blk_account_rq(struct request *rq)
  {
@@@ -923,24 -925,17 +926,17 @@@ static inline void rq_flush_dcache_page
  }
  #endif
  
- #ifdef CONFIG_PRINTK
- #define vfs_msg(sb, level, fmt, ...)                          \
-       __vfs_msg(sb, level, fmt, ##__VA_ARGS__)
- #else
- #define vfs_msg(sb, level, fmt, ...)                          \
- do {                                                          \
-       no_printk(fmt, ##__VA_ARGS__);                          \
-       __vfs_msg(sb, "", " ");                                 \
- } while (0)
- #endif
  extern int blk_register_queue(struct gendisk *disk);
  extern void blk_unregister_queue(struct gendisk *disk);
  extern blk_qc_t generic_make_request(struct bio *bio);
+ extern blk_qc_t direct_make_request(struct bio *bio);
  extern void blk_rq_init(struct request_queue *q, struct request *rq);
  extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
  extern void blk_put_request(struct request *);
  extern void __blk_put_request(struct request_queue *, struct request *);
+ extern struct request *blk_get_request_flags(struct request_queue *,
+                                            unsigned int op,
+                                            blk_mq_req_flags_t flags);
  extern struct request *blk_get_request(struct request_queue *, unsigned int op,
                                       gfp_t gfp_mask);
  extern void blk_requeue_request(struct request_queue *, struct request *);
@@@ -964,7 -959,7 +960,7 @@@ extern int scsi_cmd_ioctl(struct reques
  extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
  
- extern int blk_queue_enter(struct request_queue *q, bool nowait);
+ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
  extern void blk_queue_exit(struct request_queue *q);
  extern void blk_start_queue(struct request_queue *q);
  extern void blk_start_queue_async(struct request_queue *q);
@@@ -991,7 -986,7 +987,7 @@@ extern void blk_execute_rq_nowait(struc
  int blk_status_to_errno(blk_status_t status);
  blk_status_t errno_to_blk_status(int errno);
  
- bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+ bool blk_poll(struct request_queue *q, blk_qc_t cookie);
  
  static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
  {
@@@ -1110,6 -1105,8 +1106,8 @@@ extern struct request *blk_peek_request
  extern void blk_start_request(struct request *rq);
  extern struct request *blk_fetch_request(struct request_queue *q);
  
+ void blk_steal_bios(struct bio_list *list, struct request *rq);
  /*
   * Request completion related functions.
   *
@@@ -1372,7 -1369,7 +1370,7 @@@ static inline int sb_issue_zeroout(stru
                                    gfp_mask, 0);
  }
  
- extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
+ extern int blk_verify_command(unsigned char *cmd, fmode_t mode);
  
  enum blk_default_limits {
        BLK_MAX_SEGMENTS        = 128,
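
For readers unfamiliar with the polled-completion path that blk_poll() (renamed from blk_mq_poll() in this series) serves, here is a sketch of how a caller such as fs/iomap.c or mm/page_io.c waits for a bio. The demo_* names are invented, and a real caller would also check IOCB_HIPRI and whether the queue supports polling before spinning.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>
#include <linux/compiler.h>

static void demo_bio_end_io(struct bio *bio)
{
        WRITE_ONCE(*(bool *)bio->bi_private, true);
}

/* Submit 'bio' and busy-poll the queue for its completion. */
static void demo_submit_and_poll(struct request_queue *q, struct bio *bio)
{
        bool done = false;
        blk_qc_t qc;

        bio->bi_private = &done;
        bio->bi_end_io = demo_bio_end_io;
        qc = submit_bio(bio);

        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (READ_ONCE(done))
                        break;
                /* Poll the completion queue; sleep if polling is not
                 * supported or found nothing for this cookie. */
                if (!blk_poll(q, qc))
                        io_schedule();
        }
        __set_current_state(TASK_RUNNING);
}
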
diff --combined include/linux/buffer_head.h
index afa37f807f12c1065a44ba35ffe6c88423756fd2,ae2d25f01b98e66b4c35e29a8c5a1365f6c807ec..8b1bf8d3d4a202944969a1ae70b7659cd94484e1
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  /*
   * include/linux/buffer_head.h
   *
@@@ -157,7 -156,7 +157,7 @@@ void set_bh_page(struct buffer_head *bh
                struct page *page, unsigned long offset);
  int try_to_free_buffers(struct page *);
  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-               int retry);
+               bool retry);
  void create_empty_buffers(struct page *, unsigned long,
                        unsigned long b_state);
  void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
@@@ -233,7 -232,6 +233,7 @@@ int generic_write_end(struct file *, st
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
  void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 +void clean_page_buffers(struct page *page);
  int cont_write_begin(struct file *, struct address_space *, loff_t,
                        unsigned, unsigned, struct page **, void **,
                        get_block_t *, loff_t *);
diff --combined include/linux/elevator.h
index ddb7632d73b9532df238f6d727bafd19ec39ec60,6df8b14f1f6a04ea984ce3500ab9e0a200187006..3d794b3dc53236a9bff9b0f2ae69f085209ee385
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _LINUX_ELEVATOR_H
  #define _LINUX_ELEVATOR_H
  
@@@ -145,6 -144,7 +145,7 @@@ struct elevator_typ
        size_t icq_align;       /* ditto */
        struct elv_fs_entry *elevator_attrs;
        char elevator_name[ELV_NAME_MAX];
+       const char *elevator_alias;
        struct module *elevator_owner;
        bool uses_mq;
  #ifdef CONFIG_BLK_DEBUG_FS
diff --combined include/linux/genhd.h
index eaefb7a62f83707a9493f664c232f3514d2db880,ca10cc292187dcdb59f4974dda214fe21a0494da..5144ebe046c97aefed613bbdcea8dceeb2442ed3
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _LINUX_GENHD_H
  #define _LINUX_GENHD_H
  
@@@ -141,6 -140,7 +141,7 @@@ struct hd_struct 
  #define GENHD_FL_NATIVE_CAPACITY              128
  #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE   256
  #define GENHD_FL_NO_PART_SCAN                 512
+ #define GENHD_FL_HIDDEN                               1024
  
  enum {
        DISK_EVENT_MEDIA_CHANGE                 = 1 << 0, /* media changed */
@@@ -207,7 -207,6 +208,7 @@@ struct gendisk 
  #endif        /* CONFIG_BLK_DEV_INTEGRITY */
        int node_id;
        struct badblocks *bb;
 +      struct lockdep_map lockdep_map;
  };
  
  static inline struct gendisk *part_to_disk(struct hd_struct *part)
@@@ -236,7 -235,7 +237,7 @@@ static inline bool disk_part_scan_enabl
  
  static inline dev_t disk_devt(struct gendisk *disk)
  {
-       return disk_to_dev(disk)->devt;
+       return MKDEV(disk->major, disk->first_minor);
  }
  
  static inline dev_t part_devt(struct hd_struct *part)
        return part_to_dev(part)->devt;
  }
  
+ extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
  extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
  
  static inline void disk_put_part(struct hd_struct *part)
@@@ -592,7 -592,8 +594,7 @@@ extern void __delete_partition(struct p
  extern void delete_partition(struct gendisk *, int);
  extern void printk_all_partitions(void);
  
 -extern struct gendisk *alloc_disk_node(int minors, int node_id);
 -extern struct gendisk *alloc_disk(int minors);
 +extern struct gendisk *__alloc_disk_node(int minors, int node_id);
  extern struct kobject *get_disk(struct gendisk *disk);
  extern void put_disk(struct gendisk *disk);
  extern void blk_register_region(dev_t devt, unsigned long range,
@@@ -616,24 -617,6 +618,24 @@@ extern ssize_t part_fail_store(struct d
                               const char *buf, size_t count);
  #endif /* CONFIG_FAIL_MAKE_REQUEST */
  
 +#define alloc_disk_node(minors, node_id)                              \
 +({                                                                    \
 +      static struct lock_class_key __key;                             \
 +      const char *__name;                                             \
 +      struct gendisk *__disk;                                         \
 +                                                                      \
 +      __name = "(gendisk_completion)"#minors"("#node_id")";           \
 +                                                                      \
 +      __disk = __alloc_disk_node(minors, node_id);                    \
 +                                                                      \
 +      if (__disk)                                                     \
 +              lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \
 +                                                                      \
 +      __disk;                                                         \
 +})
 +
 +#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)
 +
  static inline int hd_ref_init(struct hd_struct *part)
  {
        if (percpu_ref_init(&part->ref, __delete_partition, 0,
diff --combined include/linux/kthread.h
index 86d53a3cb497ff0d587952f4b9073da2f71e98a2,fb201842c635d0497f4a24a8db81ecb91875071a..3203e36b2ee81f746b6d87c16701cdc567274ebd
@@@ -1,9 -1,9 +1,10 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _LINUX_KTHREAD_H
  #define _LINUX_KTHREAD_H
  /* Simple interface for creating and stopping kernel threads without mess. */
  #include <linux/err.h>
  #include <linux/sched.h>
+ #include <linux/cgroup.h>
  
  __printf(4, 5)
  struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@@ -76,7 -76,7 +77,7 @@@ extern int tsk_fork_get_node(struct tas
   */
  struct kthread_work;
  typedef void (*kthread_work_func_t)(struct kthread_work *work);
 -void kthread_delayed_work_timer_fn(unsigned long __data);
 +void kthread_delayed_work_timer_fn(struct timer_list *t);
  
  enum {
        KTW_FREEZABLE           = 1 << 0,       /* freeze during suspend */
@@@ -117,8 -117,8 +118,8 @@@ struct kthread_delayed_work 
  
  #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) {                                \
        .work = KTHREAD_WORK_INIT((dwork).work, (fn)),                  \
 -      .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,     \
 -                                   0, (unsigned long)&(dwork),        \
 +      .timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\
 +                                   (TIMER_DATA_TYPE)&(dwork.timer),   \
                                     TIMER_IRQSAFE),                    \
        }
  
@@@ -165,8 -165,8 +166,8 @@@ extern void __kthread_init_worker(struc
        do {                                                            \
                kthread_init_work(&(dwork)->work, (fn));                \
                __setup_timer(&(dwork)->timer,                          \
 -                            kthread_delayed_work_timer_fn,            \
 -                            (unsigned long)(dwork),                   \
 +                            (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\
 +                            (TIMER_DATA_TYPE)&(dwork)->timer,         \
                              TIMER_IRQSAFE);                           \
        } while (0)
  
@@@ -199,4 -199,14 +200,14 @@@ bool kthread_cancel_delayed_work_sync(s
  
  void kthread_destroy_worker(struct kthread_worker *worker);
  
+ #ifdef CONFIG_BLK_CGROUP
+ void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+ struct cgroup_subsys_state *kthread_blkcg(void);
+ #else
+ static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+ static inline struct cgroup_subsys_state *kthread_blkcg(void)
+ {
+       return NULL;
+ }
+ #endif
  #endif /* _LINUX_KTHREAD_H */
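
The kthread_delayed_work_timer_fn() change above is part of the tree-wide timer conversion: callbacks now receive the timer_list itself and recover their container with from_timer(). A minimal sketch with an invented demo_delayed structure; note that kthread.h still initializes the timer through __setup_timer() and a TIMER_FUNC_TYPE cast because the generic conversion was still in flight at this point.

#include <linux/timer.h>
#include <linux/workqueue.h>

struct demo_delayed {
        struct timer_list       timer;
        struct work_struct      work;
};

/* New-style callback: gets the timer, not an unsigned long cookie. */
static void demo_timer_fn(struct timer_list *t)
{
        /* recover the structure that embeds 't' */
        struct demo_delayed *dd = from_timer(dd, t, timer);

        schedule_work(&dd->work);
}
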
diff --combined include/linux/lightnvm.h
index a29a8db5cc2fcd3865ee99fb8e35520389aa8975,b7f111ff4d3b3ace306abc853f387f8dfd370926..2d1d9de06728d619d98b9b718a8e9cdb432d56de
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef NVM_H
  #define NVM_H
  
@@@ -57,6 -56,7 +57,7 @@@ typedef int (nvm_get_l2p_tbl_fn)(struc
  typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
  typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
  typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
+ typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
  typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
  typedef void (nvm_destroy_dma_pool_fn)(void *);
  typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@@ -70,6 -70,7 +71,7 @@@ struct nvm_dev_ops 
        nvm_op_set_bb_fn        *set_bb_tbl;
  
        nvm_submit_io_fn        *submit_io;
+       nvm_submit_io_sync_fn   *submit_io_sync;
  
        nvm_create_dma_pool_fn  *create_dma_pool;
        nvm_destroy_dma_pool_fn *destroy_dma_pool;
@@@ -461,10 -462,9 +463,9 @@@ struct nvm_tgt_type 
  
        /* For internal use */
        struct list_head list;
+       struct module *owner;
  };
  
- extern struct nvm_tgt_type *nvm_find_target_type(const char *, int);
  extern int nvm_register_tgt_type(struct nvm_tgt_type *);
  extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
  
@@@ -479,10 -479,8 +480,8 @@@ extern int nvm_set_tgt_bb_tbl(struct nv
                              int, int);
  extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
  extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
+ extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
  extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
- extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *,
-                                       const struct ppa_addr *, int, int);
- extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *);
  extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
                           void *);
  extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
@@@ -491,8 -489,6 +490,6 @@@ extern void nvm_end_io(struct nvm_rq *)
  extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
  extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
  
- extern int nvm_dev_factory(struct nvm_dev *, int flags);
  extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
  
  #else /* CONFIG_NVM */
diff --combined include/linux/writeback.h
index e12d92808e983c4b6e129c4304ac50f307d20f6d,e15ec14085ade47f366d6ec822daac637407ea3c..f42d85631d1711fd0085141fc1e61b0c31cd1ddd
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  /*
   * include/linux/writeback.h
   */
@@@ -42,28 -41,6 +42,6 @@@ enum writeback_sync_modes 
        WB_SYNC_ALL,    /* Wait on every mapping */
  };
  
- /*
-  * why some writeback work was initiated
-  */
- enum wb_reason {
-       WB_REASON_BACKGROUND,
-       WB_REASON_VMSCAN,
-       WB_REASON_SYNC,
-       WB_REASON_PERIODIC,
-       WB_REASON_LAPTOP_TIMER,
-       WB_REASON_FREE_MORE_MEM,
-       WB_REASON_FS_FREE_SPACE,
-       /*
-        * There is no bdi forker thread any more and works are done
-        * by emergency worker, however, this is TPs userland visible
-        * and we'll be exposing exactly the same information,
-        * so it has a mismatch name.
-        */
-       WB_REASON_FORKER_THREAD,
-       WB_REASON_MAX,
- };
  /*
   * A control structure which tells the writeback code what to do.  These are
   * always on the stack, and hence need no locking.  They are always initialised
@@@ -186,11 -163,11 +164,11 @@@ struct bdi_writeback
  void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
  void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
                                                        enum wb_reason reason);
- bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
- bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
-                                  enum wb_reason reason);
+ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
  void sync_inodes_sb(struct super_block *);
- void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
+ void wakeup_flusher_threads(enum wb_reason reason);
+ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
+                               enum wb_reason reason);
  void inode_wait_for_writeback(struct inode *inode);
  
  /* writeback.h requires fs.h; it, too, is not included from here. */
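
The writeback API change above removes the nr_pages argument: a flusher wakeup now always means "write back everything", tracked per-wb through WB_start_all, and callers only state a reason. A tiny sketch of the two entry points, with an invented caller name:

#include <linux/writeback.h>
#include <linux/backing-dev.h>

static void demo_low_on_memory(struct backing_dev_info *bdi)
{
        /* kick the flusher of every registered bdi ... */
        wakeup_flusher_threads(WB_REASON_VMSCAN);

        /* ... or only the one backing the device we care about */
        wakeup_flusher_threads_bdi(bdi, WB_REASON_VMSCAN);
}
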
diff --combined include/scsi/scsi_device.h
index 571ddb49b92693ef6f5f4eebc11985f51765a8ff,6f0f1e242e236da26c716887eb7d64519c721fc7..73af87dfbff8d6915b79777d5e483ec4af25a337
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _SCSI_SCSI_DEVICE_H
  #define _SCSI_SCSI_DEVICE_H
  
@@@ -193,7 -192,6 +193,7 @@@ struct scsi_device 
        unsigned no_dif:1;      /* T10 PI (DIF) should be disabled */
        unsigned broken_fua:1;          /* Don't set FUA bit */
        unsigned lun_in_cdb:1;          /* Store LUN bits in CDB[1] */
 +      unsigned unmap_limit_for_ws:1;  /* Use the UNMAP limit for WRITE SAME */
  
        atomic_t disk_events_disable_depth; /* disable depth for disk events */
  
        unsigned char           access_state;
        struct mutex            state_mutex;
        enum scsi_device_state sdev_state;
+       struct task_struct      *quiesced_by;
        unsigned long           sdev_data[0];
  } __attribute__((aligned(sizeof(unsigned long))));
  
diff --combined include/trace/events/writeback.h
index 2e1fa7910306d794abfabf23223d37206cb2e6b3,19a0ea08e098984c25b87707a24e37efcc76f82b..32db72c7c055fa2f566f70d89cce51402f213b28
@@@ -1,4 -1,3 +1,4 @@@
 +/* SPDX-License-Identifier: GPL-2.0 */
  #undef TRACE_SYSTEM
  #define TRACE_SYSTEM writeback
  
@@@ -287,7 -286,6 +287,6 @@@ DEFINE_EVENT(writeback_class, name, 
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))
  
- DEFINE_WRITEBACK_EVENT(writeback_nowork);
  DEFINE_WRITEBACK_EVENT(writeback_wake_background);
  
  TRACE_EVENT(writeback_bdi_register,
diff --combined kernel/kthread.c
index ba3992c8c3753bcc0785ecf998e457d21c013873,8dbe2454cb1deed1447450ea301e2b7ac481ea9d..8af313081b0d9a7f626f6b3b496119737e9e89a6
@@@ -20,7 -20,6 +20,6 @@@
  #include <linux/freezer.h>
  #include <linux/ptrace.h>
  #include <linux/uaccess.h>
- #include <linux/cgroup.h>
  #include <trace/events/sched.h>
  
  static DEFINE_SPINLOCK(kthread_create_lock);
@@@ -47,6 -46,9 +46,9 @@@ struct kthread 
        void *data;
        struct completion parked;
        struct completion exited;
+ #ifdef CONFIG_BLK_CGROUP
+       struct cgroup_subsys_state *blkcg_css;
+ #endif
  };
  
  enum KTHREAD_BITS {
@@@ -74,11 -76,17 +76,17 @@@ static inline struct kthread *to_kthrea
  
  void free_kthread_struct(struct task_struct *k)
  {
+       struct kthread *kthread;
        /*
         * Can be NULL if this kthread was created by kernel_thread()
         * or if kmalloc() in kthread() failed.
         */
-       kfree(to_kthread(k));
+       kthread = to_kthread(k);
+ #ifdef CONFIG_BLK_CGROUP
+       WARN_ON_ONCE(kthread && kthread->blkcg_css);
+ #endif
+       kfree(kthread);
  }
  
  /**
@@@ -196,7 -204,7 +204,7 @@@ static int kthread(void *_create
        struct kthread *self;
        int ret;
  
-       self = kmalloc(sizeof(*self), GFP_KERNEL);
+       self = kzalloc(sizeof(*self), GFP_KERNEL);
        set_kthread_struct(self);
  
        /* If user was SIGKILLed, I release the structure. */
                do_exit(-ENOMEM);
        }
  
-       self->flags = 0;
        self->data = data;
        init_completion(&self->exited);
        init_completion(&self->parked);
@@@ -798,14 -805,15 +805,14 @@@ EXPORT_SYMBOL_GPL(kthread_queue_work)
  /**
   * kthread_delayed_work_timer_fn - callback that queues the associated kthread
   *    delayed work when the timer expires.
 - * @__data: pointer to the data associated with the timer
 + * @t: pointer to the expired timer
   *
   * The format of the function is defined by struct timer_list.
   * It should have been called from irqsafe timer with irq already off.
   */
 -void kthread_delayed_work_timer_fn(unsigned long __data)
 +void kthread_delayed_work_timer_fn(struct timer_list *t)
  {
 -      struct kthread_delayed_work *dwork =
 -              (struct kthread_delayed_work *)__data;
 +      struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
  
@@@ -836,7 -844,8 +843,7 @@@ void __kthread_queue_delayed_work(struc
        struct timer_list *timer = &dwork->timer;
        struct kthread_work *work = &dwork->work;
  
 -      WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
 -                   timer->data != (unsigned long)dwork);
 +      WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
  
        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
@@@ -1152,3 -1161,54 +1159,54 @@@ void kthread_destroy_worker(struct kthr
        kfree(worker);
  }
  EXPORT_SYMBOL(kthread_destroy_worker);
+ #ifdef CONFIG_BLK_CGROUP
+ /**
+  * kthread_associate_blkcg - associate blkcg to current kthread
+  * @css: the cgroup info
+  *
+  * Current thread must be a kthread. The thread is running jobs on behalf of
+  * other threads. In some cases, we expect the jobs attach cgroup info of
+  * original threads instead of that of current thread. This function stores
+  * original thread's cgroup info in current kthread context for later
+  * retrieval.
+  */
+ void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+ {
+       struct kthread *kthread;
+       if (!(current->flags & PF_KTHREAD))
+               return;
+       kthread = to_kthread(current);
+       if (!kthread)
+               return;
+       if (kthread->blkcg_css) {
+               css_put(kthread->blkcg_css);
+               kthread->blkcg_css = NULL;
+       }
+       if (css) {
+               css_get(css);
+               kthread->blkcg_css = css;
+       }
+ }
+ EXPORT_SYMBOL(kthread_associate_blkcg);
+ /**
+  * kthread_blkcg - get associated blkcg css of current kthread
+  *
+  * Current thread must be a kthread.
+  */
+ struct cgroup_subsys_state *kthread_blkcg(void)
+ {
+       struct kthread *kthread;
+       if (current->flags & PF_KTHREAD) {
+               kthread = to_kthread(current);
+               if (kthread)
+                       return kthread->blkcg_css;
+       }
+       return NULL;
+ }
+ EXPORT_SYMBOL(kthread_blkcg);
+ #endif
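
kthread_associate_blkcg()/kthread_blkcg() exist so a helper kthread (the loop worker, for instance) can charge the I/O it issues to the cgroup of the thread that handed it the work, since bio_blkcg() now falls back to the kthread's associated css. A sketch of the intended usage, with an invented demo_issue_for() helper:

#include <linux/kthread.h>
#include <linux/blk-cgroup.h>
#include <linux/bio.h>

/*
 * Runs on a helper kthread: charge the I/O it issues to the cgroup of
 * the thread that originated the work ('css'), then drop the
 * association again.
 */
static void demo_issue_for(struct cgroup_subsys_state *css, struct bio *bio)
{
        kthread_associate_blkcg(css);   /* takes its own css reference */

        /* bio_blkcg(), and thus the block cgroup controllers, now
         * resolve to 'css' instead of the kthread's own cgroup. */
        submit_bio(bio);

        kthread_associate_blkcg(NULL);  /* drop the association */
}
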
diff --combined kernel/sysctl.c
index d9c31bc2eaea2c95a7a7be5a5321700b84d8f640,a5dd8d82c25385f2de8a4753ce49cb24777351dd..9576bd582d4a870f657d974e1e3beaca6ea635fa
@@@ -367,8 -367,7 +367,8 @@@ static struct ctl_table kern_table[] = 
                .data           = &sysctl_sched_time_avg,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
 -              .proc_handler   = proc_dointvec,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = &one,
        },
  #ifdef CONFIG_SCHEDSTATS
        {
  #if defined(CONFIG_LOCKUP_DETECTOR)
        {
                .procname       = "watchdog",
 -              .data           = &watchdog_user_enabled,
 -              .maxlen         = sizeof (int),
 -              .mode           = 0644,
 +              .data           = &watchdog_user_enabled,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0644,
                .proc_handler   = proc_watchdog,
                .extra1         = &zero,
                .extra2         = &one,
        },
        {
                .procname       = "nmi_watchdog",
 -              .data           = &nmi_watchdog_enabled,
 -              .maxlen         = sizeof (int),
 -              .mode           = 0644,
 +              .data           = &nmi_watchdog_user_enabled,
 +              .maxlen         = sizeof(int),
 +              .mode           = NMI_WATCHDOG_SYSCTL_PERM,
                .proc_handler   = proc_nmi_watchdog,
                .extra1         = &zero,
 -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
                .extra2         = &one,
 -#else
 -              .extra2         = &zero,
 -#endif
        },
        {
                .procname       = "watchdog_cpumask",
  #ifdef CONFIG_SOFTLOCKUP_DETECTOR
        {
                .procname       = "soft_watchdog",
 -              .data           = &soft_watchdog_enabled,
 -              .maxlen         = sizeof (int),
 -              .mode           = 0644,
 +              .data           = &soft_watchdog_user_enabled,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0644,
                .proc_handler   = proc_soft_watchdog,
                .extra1         = &zero,
                .extra2         = &one,
@@@ -1341,11 -1344,6 +1341,6 @@@ static struct ctl_table vm_table[] = 
                .proc_handler   = dirtytime_interval_handler,
                .extra1         = &zero,
        },
-       {
-               .procname       = "nr_pdflush_threads",
-               .mode           = 0444 /* read-only */,
-               .proc_handler   = pdflush_proc_obsolete,
-       },
        {
                .procname       = "swappiness",
                .data           = &vm_swappiness,
@@@ -2182,6 -2180,8 +2177,6 @@@ static int do_proc_douintvec_conv(unsig
                                  int write, void *data)
  {
        if (write) {
 -              if (*lvalp > UINT_MAX)
 -                      return -EINVAL;
                if (*lvalp > UINT_MAX)
                        return -EINVAL;
                *valp = *lvalp;
diff --combined mm/page_io.c
index 5d882de3fbfd2bf0f94a35d004205ceddc87b817,ff04de630c465538b65bfd0f14dcf4ec1ea5548b..cd52b9cc169bc3cf4821778af2bc159400ae9bb3
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   *  linux/mm/page_io.c
   *
@@@ -408,7 -407,7 +408,7 @@@ int swap_readpage(struct page *page, bo
                if (!READ_ONCE(bio->bi_private))
                        break;
  
-               if (!blk_mq_poll(disk->queue, qc))
+               if (!blk_poll(disk->queue, qc))
                        break;
        }
        __set_current_state(TASK_RUNNING);
diff --combined mm/vmscan.c
index eb2f0315b8c0e9549b98e51c2d5e5cd45b9920e1,42a7fdd52d8778e1025197b29151bcb36a4499f9..15b483ef6440d3a45ba69816159f7bf7ecc9915b
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0
  /*
   *  linux/mm/vmscan.c
   *
@@@ -1868,7 -1867,7 +1868,7 @@@ shrink_inactive_list(unsigned long nr_t
                 * also allow kswapd to start writing pages during reclaim.
                 */
                if (stat.nr_unqueued_dirty == nr_taken) {
-                       wakeup_flusher_threads(0, WB_REASON_VMSCAN);
+                       wakeup_flusher_threads(WB_REASON_VMSCAN);
                        set_bit(PGDAT_DIRTY, &pgdat->flags);
                }
  