From: Linus Torvalds
Date: Sun, 28 Oct 2018 18:35:40 +0000 (-0700)
Subject: Merge branch 'xarray' of git://git.infradead.org/users/willy/linux-dax
X-Git-Tag: v4.20-rc1~85
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/dad4f140edaa3f6bb452b6913d41af1ffd672e45?hp=-c

Merge branch 'xarray' of git://git.infradead.org/users/willy/linux-dax

Pull XArray conversion from Matthew Wilcox:
 "The XArray provides an improved interface to the radix tree data
  structure: it provides locking as part of the API, takes GFP flags at
  allocation time, eliminates preloading, reduces re-walking of the
  tree, makes iteration more efficient, and does not expose
  RCU-protected pointers to its users.

  This patch set:

   1. Introduces the XArray implementation
   2. Converts the page cache to use it
   3. Converts memremap to use it

  The page cache is the most complex and important user of the radix
  tree, so converting it was most important. Converting the memremap
  code removes the only other user of the multiorder code, which allows
  us to remove the radix tree code that supported it.

  I have 40+ followup patches to convert many other users of the radix
  tree over to the XArray, but I'd like to get this part in first. The
  other conversions haven't been in linux-next and aren't suitable for
  applying yet, but you can see them in the xarray-conv branch if you're
  interested"

* 'xarray' of git://git.infradead.org/users/willy/linux-dax: (90 commits)
  radix tree: Remove multiorder support
  radix tree test: Convert multiorder tests to XArray
  radix tree tests: Convert item_delete_rcu to XArray
  radix tree tests: Convert item_kill_tree to XArray
  radix tree tests: Move item_insert_order
  radix tree test suite: Remove multiorder benchmarking
  radix tree test suite: Remove __item_insert
  memremap: Convert to XArray
  xarray: Add range store functionality
  xarray: Move multiorder_check to in-kernel tests
  xarray: Move multiorder_shrink to kernel tests
  xarray: Move multiorder account test in-kernel
  radix tree test suite: Convert iteration test to XArray
  radix tree test suite: Convert tag_tagged_items to XArray
  radix tree: Remove radix_tree_clear_tags
  radix tree: Remove radix_tree_maybe_preload_order
  radix tree: Remove split/join code
  radix tree: Remove radix_tree_update_node_t
  page cache: Finish XArray conversion
  dax: Convert page fault handlers to XArray
  ...
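
For reference, a minimal sketch of the core XArray calls that the conversions in
the diff below rely on (for example, the btrfs hunk that replaces
radix_tree_lookup() with xa_load(), and the fs/buffer.c hunk that switches
radix_tree_tag_set() to __xa_set_mark()). The interfaces shown
(DEFINE_XARRAY, xa_store, xa_load, xa_erase, xa_is_value, xa_is_err) are the
ones introduced by this series; the example_array name and the example_*
wrappers are hypothetical, added only for illustration and not taken from the
patch.

	#include <linux/xarray.h>

	/* The XArray embeds its own spinlock, so basic operations need no
	 * external locking, and there is no preload step.  (Illustrative
	 * sketch; names below are not from the patch.) */
	static DEFINE_XARRAY(example_array);

	static int example_store(unsigned long index, struct page *page)
	{
		/* GFP flags are supplied at store time rather than via preloading. */
		void *old = xa_store(&example_array, index, page, GFP_KERNEL);

		return xa_is_err(old) ? xa_err(old) : 0;
	}

	static struct page *example_load(unsigned long index)
	{
		void *entry = xa_load(&example_array, index);

		/* Value entries (the old radix tree's "exceptional" entries) are
		 * distinguished from real pointers with xa_is_value(). */
		if (xa_is_value(entry))
			return NULL;
		return entry;
	}

	static void example_erase(unsigned long index)
	{
		xa_erase(&example_array, index);
	}

This is the pattern visible in the page cache conversions below: the btrfs
hunk collapses an open-coded rcu_read_lock()/radix_tree_lookup()/
rcu_read_unlock() sequence into a single xa_load() call and replaces the
radix_tree_exceptional_entry() test with xa_is_value().
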
--- dad4f140edaa3f6bb452b6913d41af1ffd672e45 diff --combined Documentation/core-api/index.rst index 29c790f571a5,df3105aa75f4..3adee82be311 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@@ -21,19 -21,17 +21,20 @@@ Core utilitie local_ops workqueue genericirq + xarray flexible-arrays librs genalloc errseq printk-formats circular-buffers + memory-allocation mm-api gfp_mask-from-fs-io timekeeping boot-time-mm + memory-hotplug + Interfaces for kernel debugging =============================== diff --combined MAINTAINERS index bfc9722a4932,e54349ea090b..a78d45755881 --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -324,6 -324,7 +324,6 @@@ F: Documentation/ABI/testing/sysfs-bus- F: Documentation/ABI/testing/configfs-acpi F: drivers/pci/*acpi* F: drivers/pci/*/*acpi* -F: drivers/pci/*/*/*acpi* F: tools/power/acpi/ ACPI APEI @@@ -535,7 -536,7 +535,7 @@@ F: Documentation/hwmon/adt747 F: drivers/hwmon/adt7475.c ADVANSYS SCSI DRIVER - M: Matthew Wilcox + M: Matthew Wilcox M: Hannes Reinecke L: linux-scsi@vger.kernel.org S: Maintained @@@ -839,7 -840,7 +839,7 @@@ ANALOG DEVICES INC ADGS1408 DRIVE M: Mircea Caprioru S: Supported F: drivers/mux/adgs1408.c -F: Documentation/devicetree/bindings/mux/adgs1408.txt +F: Documentation/devicetree/bindings/mux/adi,adgs1408.txt ANALOG DEVICES INC ADP5061 DRIVER M: Stefan Popa @@@ -932,7 -933,6 +932,7 @@@ M: Greg Kroah-Hartman M: Todd Kjos M: Martijn Coenen +M: Joel Fernandes T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git L: devel@driverdev.osuosl.org S: Supported @@@ -1181,7 -1181,7 +1181,7 @@@ N: ow F: arch/arm/mach-actions/ F: arch/arm/boot/dts/owl-* F: arch/arm64/boot/dts/actions/ -F: drivers/clocksource/owl-* +F: drivers/clocksource/timer-owl* F: drivers/pinctrl/actions/* F: drivers/soc/actions/ F: include/dt-bindings/power/owl-* @@@ -1251,7 -1251,7 +1251,7 @@@ N: meso ARM/Annapurna Labs ALPINE ARCHITECTURE M: Tsahee Zidenberg -M: Antoine Tenart +M: Antoine Tenart L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-alpine/ @@@ -1604,7 -1604,7 +1604,7 @@@ L: linux-arm-kernel@lists.infradead.or S: Maintained F: arch/arm/boot/dts/lpc43* F: drivers/clk/nxp/clk-lpc18xx* -F: drivers/clocksource/time-lpc32xx.c +F: drivers/clocksource/timer-lpc32xx.c F: drivers/i2c/busses/i2c-lpc2k.c F: drivers/memory/pl172.c F: drivers/mtd/spi-nor/nxp-spifi.c @@@ -2196,7 -2196,6 +2196,7 @@@ F: drivers/clk/uniphier F: drivers/gpio/gpio-uniphier.c F: drivers/i2c/busses/i2c-uniphier* F: drivers/irqchip/irq-uniphier-aidet.c +F: drivers/mmc/host/uniphier-sd.c F: drivers/pinctrl/uniphier/ F: drivers/reset/reset-uniphier.c F: drivers/tty/serial/8250/8250_uniphier.c @@@ -2221,7 -2220,7 +2221,7 @@@ F: arch/arm/mach-vexpress F: */*/vexpress* F: */*/*/vexpress* F: drivers/clk/versatile/clk-vexpress-osc.c -F: drivers/clocksource/versatile.c +F: drivers/clocksource/timer-versatile.c N: mps2 ARM/VFP SUPPORT @@@ -2243,7 -2242,7 +2243,7 @@@ M: Tony Prisk -M: Harish Patil M: Dept-GELinuxNICDev@cavium.com L: netdev@vger.kernel.org S: Supported @@@ -2977,7 -2977,6 +2977,7 @@@ F: drivers/scsi/bnx2i BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER M: Ariel Elior +M: Sudarsana Kalluru M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org S: Supported @@@ -3008,14 -3007,6 +3008,14 @@@ S: Supporte F: drivers/gpio/gpio-brcmstb.c F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt +BROADCOM BRCMSTB I2C DRIVER +M: Kamal Dasu +L: linux-i2c@vger.kernel.org +L: bcm-kernel-feedback-list@broadcom.com +S: 
Supported +F: drivers/i2c/busses/i2c-brcmstb.c +F: Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt + BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER M: Al Cooper L: linux-kernel@vger.kernel.org @@@ -3123,15 -3114,6 +3123,15 @@@ S: Maintaine F: Documentation/devicetree/bindings/memory-controllers/brcm,dpfe-cpu.txt F: drivers/memory/brcmstb_dpfe.c +BROADCOM SPI DRIVER +M: Kamal Dasu +M: bcm-kernel-feedback-list@broadcom.com +S: Maintained +F: Documentation/devicetree/bindings/spi/brcm,spi-bcm-qspi.txt +F: drivers/spi/spi-bcm-qspi.* +F: drivers/spi/spi-brcmstb-qspi.c +F: drivers/spi/spi-iproc-qspi.c + BROADCOM SYSTEMPORT ETHERNET DRIVER M: Florian Fainelli L: netdev@vger.kernel.org @@@ -3692,12 -3674,6 +3692,12 @@@ S: Maintaine F: Documentation/devicetree/bindings/media/coda.txt F: drivers/media/platform/coda/ +CODE OF CONDUCT +M: Greg Kroah-Hartman +S: Supported +F: Documentation/process/code-of-conduct.rst +F: Documentation/process/code-of-conduct-interpretation.rst + COMMON CLK FRAMEWORK M: Michael Turquette M: Stephen Boyd @@@ -4056,7 -4032,7 +4056,7 @@@ M: Uma Krishnan L: linux-input@vger.kernel.org S: Supported -F: drivers/input/dlink-dir685-touchkeys.c +F: drivers/input/keyboard/dlink-dir685-touchkeys.c DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK M: Joshua Kinard @@@ -4195,11 -4171,6 +4195,11 @@@ S: Maintaine F: drivers/platform/x86/dell-smbios-wmi.c F: tools/wmi/dell-smbios-example.c +DEFZA FDDI NETWORK DRIVER +M: "Maciej W. Rozycki" +S: Maintained +F: drivers/net/fddi/defza.* + DELL LAPTOP DRIVER M: Matthew Garrett M: Pali Rohár @@@ -4393,7 -4364,7 +4393,7 @@@ S: Maintaine F: drivers/i2c/busses/i2c-diolan-u2c.c FILESYSTEM DIRECT ACCESS (DAX) - M: Matthew Wilcox + M: Matthew Wilcox M: Ross Zwisler M: Jan Kara L: linux-fsdevel@vger.kernel.org @@@ -4515,12 -4486,11 +4515,12 @@@ S: Maintaine F: Documentation/ F: scripts/kernel-doc X: Documentation/ABI/ +X: Documentation/acpi/ X: Documentation/devicetree/ -X: Documentation/acpi -X: Documentation/power -X: Documentation/spi -X: Documentation/media +X: Documentation/i2c/ +X: Documentation/media/ +X: Documentation/power/ +X: Documentation/spi/ T: git git://git.lwn.net/linux.git docs-next DOCUMENTATION/ITALIAN @@@ -4558,13 -4528,9 +4558,13 @@@ F: drivers/soc/fsl/dpi DPAA2 ETHERNET DRIVER M: Ioana Radulescu -L: linux-kernel@vger.kernel.org +L: netdev@vger.kernel.org S: Maintained -F: drivers/staging/fsl-dpaa2/ethernet +F: drivers/net/ethernet/freescale/dpaa2/dpaa2-eth* +F: drivers/net/ethernet/freescale/dpaa2/dpni* +F: drivers/net/ethernet/freescale/dpaa2/dpkg.h +F: drivers/net/ethernet/freescale/dpaa2/Makefile +F: drivers/net/ethernet/freescale/dpaa2/Kconfig DPAA2 ETHERNET SWITCH DRIVER M: Ioana Radulescu @@@ -4575,10 -4541,9 +4575,10 @@@ F: drivers/staging/fsl-dpaa2/eths DPAA2 PTP CLOCK DRIVER M: Yangbo Lu -L: linux-kernel@vger.kernel.org +L: netdev@vger.kernel.org S: Maintained -F: drivers/staging/fsl-dpaa2/rtc +F: drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp* +F: drivers/net/ethernet/freescale/dpaa2/dprtc* DPT_I2O SCSI RAID DRIVER M: Adaptec OEM Raid Solutions @@@ -5365,8 -5330,7 +5365,8 @@@ S: Maintaine F: drivers/edac/r82600_edac.c EDAC-SBRIDGE -M: Mauro Carvalho Chehab +M: Tony Luck +R: Qiuxu Zhuo L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/sb_edac.c @@@ -5506,8 -5470,7 +5506,8 @@@ S: Odd Fixe F: drivers/net/ethernet/agere/ ETHERNET BRIDGE -M: Stephen Hemminger +M: Roopa Prabhu +M: Nikolay Aleksandrov L: bridge@lists.linux-foundation.org (moderated for non-subscribers) L: netdev@vger.kernel.org W: 
http://www.linuxfoundation.org/en/Net:Bridge @@@ -5551,7 -5514,7 +5551,7 @@@ W: http://ext4.wiki.kernel.or Q: http://patchwork.ozlabs.org/project/linux-ext4/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git S: Maintained -F: Documentation/filesystems/ext4.txt +F: Documentation/filesystems/ext4/ext4.rst F: fs/ext4/ Extended Verification Module (EVM) @@@ -6491,7 -6454,6 +6491,7 @@@ F: Documentation/devicetree/bindings/hw F: Documentation/hwmon/ F: drivers/hwmon/ F: include/linux/hwmon*.h +F: include/trace/events/hwmon*.h HARDWARE RANDOM NUMBER GENERATOR CORE M: Matt Mackall @@@ -6800,12 -6762,6 +6800,12 @@@ S: Maintaine F: mm/memory-failure.c F: mm/hwpoison-inject.c +HYGON PROCESSOR SUPPORT +M: Pu Wen +L: linux-kernel@vger.kernel.org +S: Maintained +F: arch/x86/kernel/cpu/hygon.c + Hyper-V CORE AND DRIVERS M: "K. Y. Srinivasan" M: Haiyang Zhang @@@ -7385,16 -7341,15 +7385,16 @@@ T: git git://git.kernel.org/pub/scm/lin S: Supported F: Documentation/networking/e100.rst F: Documentation/networking/e1000.rst -F: Documentation/networking/e1000e.txt -F: Documentation/networking/igb.txt -F: Documentation/networking/igbvf.txt -F: Documentation/networking/ixgb.txt -F: Documentation/networking/ixgbe.txt -F: Documentation/networking/ixgbevf.txt -F: Documentation/networking/i40e.txt -F: Documentation/networking/i40evf.txt -F: Documentation/networking/ice.txt +F: Documentation/networking/e1000e.rst +F: Documentation/networking/fm10k.rst +F: Documentation/networking/igb.rst +F: Documentation/networking/igbvf.rst +F: Documentation/networking/ixgb.rst +F: Documentation/networking/ixgbe.rst +F: Documentation/networking/ixgbevf.rst +F: Documentation/networking/i40e.rst +F: Documentation/networking/iavf.rst +F: Documentation/networking/ice.rst F: drivers/net/ethernet/intel/ F: drivers/net/ethernet/intel/*/ F: include/linux/avf/virtchnl.h @@@ -7416,12 -7371,6 +7416,12 @@@ T: git https://github.com/intel/gvt-lin S: Supported F: drivers/gpu/drm/i915/gvt/ +INTEL PMIC GPIO DRIVER +R: Andy Shevchenko +S: Maintained +F: drivers/gpio/gpio-*cove.c +F: drivers/gpio/gpio-msic.c + INTEL HID EVENT DRIVER M: Alex Hung L: platform-driver-x86@vger.kernel.org @@@ -7548,14 -7497,6 +7548,14 @@@ F: drivers/platform/x86/intel_punit_ipc F: arch/x86/include/asm/intel_pmc_ipc.h F: arch/x86/include/asm/intel_punit_ipc.h +INTEL MULTIFUNCTION PMIC DEVICE DRIVERS +R: Andy Shevchenko +S: Maintained +F: drivers/mfd/intel_msic.c +F: drivers/mfd/intel_soc_pmic* +F: include/linux/mfd/intel_msic.h +F: include/linux/mfd/intel_soc_pmic* + INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT M: Stanislav Yakovlev L: linux-wireless@vger.kernel.org @@@ -7579,6 -7520,14 +7579,6 @@@ S: Supporte F: drivers/infiniband/hw/i40iw/ F: include/uapi/rdma/i40iw-abi.h -INTEL SHA MULTIBUFFER DRIVER -M: Megha Dey -R: Tim Chen -L: linux-crypto@vger.kernel.org -S: Supported -F: arch/x86/crypto/sha*-mb/ -F: crypto/mcryptd.c - INTEL TELEMETRY DRIVER M: Souvik Kumar Chakravarty L: platform-driver-x86@vger.kernel.org @@@ -7686,7 -7635,6 +7686,7 @@@ M: Corey Minyard +M: Jarkko Sakkinen M: Mimi Zohar L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org @@@ -8236,25 -8183,6 +8236,25 @@@ S: Maintaine F: net/l3mdev F: include/net/l3mdev.h +L7 BPF FRAMEWORK +M: John Fastabend +M: Daniel Borkmann +L: netdev@vger.kernel.org +S: Maintained +F: include/linux/skmsg.h +F: net/core/skmsg.c +F: net/core/sock_map.c +F: net/ipv4/tcp_bpf.c + +LANTIQ / INTEL Ethernet drivers +M: Hauke Mehrtens +L: netdev@vger.kernel.org +S: Maintained 
+F: net/dsa/tag_gswip.c +F: drivers/net/ethernet/lantiq_xrx200.c +F: drivers/net/dsa/lantiq_pce.h +F: drivers/net/dsa/lantiq_gswip.c + LANTIQ MIPS ARCHITECTURE M: John Crispin L: linux-mips@linux-mips.org @@@ -8670,6 -8598,7 +8670,6 @@@ F: include/linux/spinlock*. F: arch/*/include/asm/spinlock*.h F: include/linux/rwlock*.h F: include/linux/mutex*.h -F: arch/*/include/asm/mutex*.h F: include/linux/rwsem*.h F: arch/*/include/asm/rwsem.h F: include/linux/seqlock.h @@@ -8697,7 -8626,7 +8697,7 @@@ F: drivers/message/fusion F: drivers/scsi/mpt3sas/ LSILOGIC/SYMBIOS/NCR 53C8XX and 53C1010 PCI-SCSI drivers - M: Matthew Wilcox + M: Matthew Wilcox L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/sym53c8xx_2/ @@@ -8815,7 -8744,7 +8815,7 @@@ M: Vivien Didelot +M: Linu Cherian +M: Geetha sowjanya +M: Jerin Jacob +L: netdev@vger.kernel.org +S: Supported +F: drivers/net/ethernet/marvell/octeontx2/af/ + MATROX FRAMEBUFFER DRIVER L: linux-fbdev@vger.kernel.org S: Orphan @@@ -8927,6 -8847,13 +8927,6 @@@ S: Maintaine F: Documentation/hwmon/max16065 F: drivers/hwmon/max16065.c -MAX20751 HARDWARE MONITOR DRIVER -M: Guenter Roeck -L: linux-hwmon@vger.kernel.org -S: Maintained -F: Documentation/hwmon/max20751 -F: drivers/hwmon/max20751.c - MAX2175 SDR TUNER DRIVER M: Ramesh Shanmugasundaram L: linux-media@vger.kernel.org @@@ -9568,7 -9495,7 +9568,7 @@@ MEN Z069 WATCHDOG DRIVE M: Johannes Thumshirn L: linux-watchdog@vger.kernel.org S: Maintained -F: drivers/watchdog/menz069_wdt.c +F: drivers/watchdog/menz69_wdt.c MESON AO CEC DRIVER FOR AMLOGIC SOCS M: Neil Armstrong @@@ -9592,7 -9519,6 +9592,7 @@@ M: Richard Genoud @@@ -9624,21 -9550,6 +9624,21 @@@ S: Supporte F: drivers/mtd/nand/raw/atmel/* F: Documentation/devicetree/bindings/mtd/atmel-nand.txt +MICROCHIP AT91 USART MFD DRIVER +M: Radu Pirea +L: linux-kernel@vger.kernel.org +S: Supported +F: drivers/mfd/at91-usart.c +F: include/dt-bindings/mfd/at91-usart.h +F: Documentation/devicetree/bindings/mfd/atmel-usart.txt + +MICROCHIP AT91 USART SPI DRIVER +M: Radu Pirea +L: linux-spi@vger.kernel.org +S: Supported +F: drivers/spi/spi-at91-usart.c +F: Documentation/devicetree/bindings/mfd/atmel-usart.txt + MICROCHIP KSZ SERIES ETHERNET SWITCH DRIVER M: Woojung Huh M: Microchip Linux Driver Support @@@ -9747,8 -9658,7 +9747,8 @@@ MIPS/LOONGSON2 ARCHITECTUR M: Jiaxun Yang L: linux-mips@linux-mips.org S: Maintained -F: arch/mips/loongson64/*{2e/2f}* +F: arch/mips/loongson64/fuloong-2e/ +F: arch/mips/loongson64/lemote-2f/ F: arch/mips/include/asm/mach-loongson64/ F: drivers/*/*loongson2* F: drivers/*/*/*loongson2* @@@ -9788,19 -9698,6 +9788,19 @@@ S: Maintaine F: arch/arm/boot/dts/mmp* F: arch/arm/mach-mmp/ +MMU GATHER AND TLB INVALIDATION +M: Will Deacon +M: "Aneesh Kumar K.V" +M: Andrew Morton +M: Nick Piggin +M: Peter Zijlstra +L: linux-arch@vger.kernel.org +L: linux-mm@kvack.org +S: Maintained +F: arch/*/include/asm/tlb.h +F: include/asm-generic/tlb.h +F: mm/mmu_gather.c + MN88472 MEDIA DRIVER M: Antti Palosaari L: linux-media@vger.kernel.org @@@ -9968,7 -9865,7 +9968,7 @@@ M: Peter Rosin +L: linux-scsi@vger.kernel.org +S: Supported +F: drivers/scsi/myrb.* +F: drivers/scsi/myrs.* + MYRICOM MYRI-10G 10GbE DRIVER (MYRI10GE) M: Chris Lee L: netdev@vger.kernel.org @@@ -10232,6 -10122,7 +10232,6 @@@ L: netdev@vger.kernel.or T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git S: Maintained -F: net/core/flow.c F: net/xfrm/ F: net/key/ F: net/ipv4/xfrm* @@@ -10294,8 
-10185,6 +10294,8 @@@ NETWORKING [TLS M: Boris Pismenny M: Aviad Yehezkel M: Dave Watson +M: John Fastabend +M: Daniel Borkmann L: netdev@vger.kernel.org S: Maintained F: net/tls/* @@@ -11053,7 -10942,7 +11053,7 @@@ M: Willy Tarreau S: Odd Fixes F: Documentation/auxdisplay/lcd-panel-cgram.txt -F: drivers/misc/panel.c +F: drivers/auxdisplay/panel.c PARALLEL PORT SUBSYSTEM M: Sudip Mukherjee @@@ -11300,7 -11189,7 +11300,7 @@@ M: Murali Karicheri @@@ -11600,12 -11489,15 +11600,12 @@@ S: Maintaine F: drivers/pinctrl/intel/ PIN CONTROLLER - MEDIATEK -M: Sean Wang +M: Sean Wang L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) S: Maintained F: Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt F: Documentation/devicetree/bindings/pinctrl/pinctrl-mt7622.txt -F: drivers/pinctrl/mediatek/mtk-eint.* -F: drivers/pinctrl/mediatek/pinctrl-mtk-common.* -F: drivers/pinctrl/mediatek/pinctrl-mt2701.c -F: drivers/pinctrl/mediatek/pinctrl-mt7622.c +F: drivers/pinctrl/mediatek/ PIN CONTROLLER - QUALCOMM M: Bjorn Andersson @@@ -11683,26 -11575,7 +11683,26 @@@ W: http://hwmon.wiki.kernel.org W: http://www.roeck-us.net/linux/drivers/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git S: Maintained +F: Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt +F: Documentation/devicetree/bindings/hwmon/max31785.txt +F: Documentation/devicetree/bindings/hwmon/ltc2978.txt +F: Documentation/hwmon/adm1275 +F: Documentation/hwmon/ibm-cffps +F: Documentation/hwmon/ir35221 +F: Documentation/hwmon/lm25066 +F: Documentation/hwmon/ltc2978 +F: Documentation/hwmon/ltc3815 +F: Documentation/hwmon/max16064 +F: Documentation/hwmon/max20751 +F: Documentation/hwmon/max31785 +F: Documentation/hwmon/max34440 +F: Documentation/hwmon/max8688 F: Documentation/hwmon/pmbus +F: Documentation/hwmon/pmbus-core +F: Documentation/hwmon/tps40422 +F: Documentation/hwmon/ucd9000 +F: Documentation/hwmon/ucd9200 +F: Documentation/hwmon/zl6100 F: drivers/hwmon/pmbus/ F: include/linux/pmbus.h @@@ -12106,7 -11979,7 +12106,7 @@@ F: Documentation/scsi/LICENSE.qla4xx F: drivers/scsi/qla4xxx/ QLOGIC QLCNIC (1/10)Gb ETHERNET DRIVER -M: Harish Patil +M: Shahed Shaikh M: Manish Chopra M: Dept-GELinuxNICDev@cavium.com L: netdev@vger.kernel.org @@@ -12114,6 -11987,7 +12114,6 @@@ S: Supporte F: drivers/net/ethernet/qlogic/qlcnic/ QLOGIC QLGE 10Gb ETHERNET DRIVER -M: Harish Patil M: Manish Chopra M: Dept-GELinuxNICDev@cavium.com L: netdev@vger.kernel.org @@@ -12801,18 -12675,6 +12801,18 @@@ W: http://www.ibm.com/developerworks/li S: Supported F: drivers/s390/crypto/ +S390 VFIO AP DRIVER +M: Tony Krowiak +M: Pierre Morel +M: Halil Pasic +L: linux-s390@vger.kernel.org +W: http://www.ibm.com/developerworks/linux/linux390/ +S: Supported +F: drivers/s390/crypto/vfio_ap_drv.c +F: drivers/s390/crypto/vfio_ap_private.h +F: drivers/s390/crypto/vfio_ap_ops.c +F: Documentation/s390/vfio-ap.txt + S390 ZFCP DRIVER M: Steffen Maier M: Benjamin Block @@@ -13201,7 -13063,7 +13201,7 @@@ SELINUX SECURITY MODUL M: Paul Moore M: Stephen Smalley M: Eric Paris -L: selinux@tycho.nsa.gov (moderated for non-subscribers) +L: selinux@vger.kernel.org W: https://selinuxproject.org W: https://github.com/SELinuxProject T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git @@@ -13445,7 -13307,6 +13445,7 @@@ M: Uwe Kleine-König S: Supported F: drivers/siox/* +F: drivers/gpio/gpio-siox.c F: include/trace/events/siox.h SIS 190 ETHERNET DRIVER @@@ -13629,8 -13490,8 +13629,8 @@@ L: linux-arm-kernel@lists.infradead.or S: 
Maintained F: Documentation/devicetree/bindings/arm/firmware/sdei.txt F: drivers/firmware/arm_sdei.c -F: include/linux/sdei.h -F: include/uapi/linux/sdei.h +F: include/linux/arm_sdei.h +F: include/uapi/linux/arm_sdei.h SOFTWARE RAID (Multiple Disks) SUPPORT M: Shaohua Li @@@ -13758,7 -13619,7 +13758,7 @@@ F: sound/soc F: include/sound/soc* SOUNDWIRE SUBSYSTEM -M: Vinod Koul +M: Vinod Koul M: Sanyog Kale R: Pierre-Louis Bossart L: alsa-devel@alsa-project.org (moderated for non-subscribers) @@@ -14173,12 -14034,6 +14173,12 @@@ S: Supporte F: drivers/reset/reset-axs10x.c F: Documentation/devicetree/bindings/reset/snps,axs10x-reset.txt +SYNOPSYS CREG GPIO DRIVER +M: Eugeniy Paltsev +S: Maintained +F: drivers/gpio/gpio-creg-snps.c +F: Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt + SYNOPSYS DESIGNWARE 8250 UART DRIVER R: Andy Shevchenko S: Maintained @@@ -14765,13 -14620,6 +14765,13 @@@ L: netdev@vger.kernel.or S: Maintained F: drivers/net/ethernet/ti/netcp* +TI PCM3060 ASoC CODEC DRIVER +M: Kirill Marinushkin +L: alsa-devel@alsa-project.org (moderated for non-subscribers) +S: Maintained +F: Documentation/devicetree/bindings/sound/pcm3060.txt +F: sound/soc/codecs/pcm3060* + TI TAS571X FAMILY ASoC CODEC DRIVER M: Kevin Cernekee L: alsa-devel@alsa-project.org (moderated for non-subscribers) @@@ -15444,12 -15292,6 +15444,12 @@@ F: Documentation/driver-api/usb/typec_b F: drivers/usb/typec/altmodes/ F: include/linux/usb/typec_altmode.h +USB TYPEC PORT CONTROLLER DRIVERS +M: Guenter Roeck +L: linux-usb@vger.kernel.org +S: Maintained +F: drivers/usb/typec/tcpm/ + USB UHCI DRIVER M: Alan Stern L: linux-usb@vger.kernel.org @@@ -15524,19 -15366,13 +15524,19 @@@ F: arch/x86/um F: fs/hostfs/ F: fs/hppfs/ +USERSPACE COPYIN/COPYOUT (UIOVEC) +M: Alexander Viro +S: Maintained +F: lib/iov_iter.c +F: include/linux/uio.h + USERSPACE I/O (UIO) M: Greg Kroah-Hartman S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git F: Documentation/driver-api/uio-howto.rst F: drivers/uio/ -F: include/linux/uio*.h +F: include/linux/uio_driver.h UTIL-LINUX PACKAGE M: Karel Zak @@@ -15559,7 -15395,7 +15559,7 @@@ S: Maintaine UVESAFB DRIVER M: Michal Januszewski L: linux-fbdev@vger.kernel.org -W: http://dev.gentoo.org/~spock/projects/uvesafb/ +W: https://github.com/mjanusz/v86d S: Maintained F: Documentation/fb/uvesafb.txt F: drivers/video/fbdev/uvesafb.* @@@ -15872,7 -15708,7 +15872,7 @@@ F: include/linux/regulator VRF M: David Ahern -M: Shrijeet Mukherjee +M: Shrijeet Mukherjee L: netdev@vger.kernel.org S: Maintained F: drivers/net/vrf.c @@@ -16137,6 -15973,17 +16137,17 @@@ T: git git://git.kernel.org/pub/scm/lin S: Maintained F: arch/x86/entry/vdso/ + XARRAY + M: Matthew Wilcox + L: linux-fsdevel@vger.kernel.org + S: Supported + F: Documentation/core-api/xarray.rst + F: lib/idr.c + F: lib/xarray.c + F: include/linux/idr.h + F: include/linux/xarray.h + F: tools/testing/radix-tree + XC2028/3028 TUNER DRIVER M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org diff --combined arch/parisc/kernel/syscall.S index f5f22ea9b97e,a9bc90dc4ae7..9505c317818d --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@@ -2,7 -2,7 +2,7 @@@ * Linux/PA-RISC Project (http://www.parisc-linux.org/) * * System call entry code / Linux gateway page - * Copyright (c) Matthew Wilcox 1999 + * Copyright (c) Matthew Wilcox 1999 * Licensed under the GNU GPL. * thanks to Philipp Rumpf, Mike Shaver and various others * sorry about the wall, puffin.. 
@@@ -640,7 -640,8 +640,7 @@@ cas_action sub,<> %r28, %r25, %r0 2: stw %r24, 0(%r26) /* Free lock */ - sync - stw %r20, 0(%sr2,%r20) + stw,ma %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG /* Clear thread register indicator */ stw %r0, 4(%sr2,%r20) @@@ -654,7 -655,8 +654,7 @@@ 3: /* Error occurred on load or store */ /* Free lock */ - sync - stw %r20, 0(%sr2,%r20) + stw,ma %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG stw %r0, 4(%sr2,%r20) #endif @@@ -855,7 -857,8 +855,7 @@@ cas2_action cas2_end: /* Free lock */ - sync - stw %r20, 0(%sr2,%r20) + stw,ma %r20, 0(%sr2,%r20) /* Enable interrupts */ ssm PSW_SM_I, %r0 /* Return to userspace, set no error */ @@@ -865,7 -868,8 +865,7 @@@ 22: /* Error occurred on load or store */ /* Free lock */ - sync - stw %r20, 0(%sr2,%r20) + stw,ma %r20, 0(%sr2,%r20) ssm PSW_SM_I, %r0 ldo 1(%r0),%r28 b lws_exit diff --combined arch/powerpc/include/asm/book3s/64/pgtable.h index c4a726c10af5,62039e557ac0..6c99e846a8c9 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@@ -14,6 -14,10 +14,6 @@@ */ #define _PAGE_BIT_SWAP_TYPE 0 -#define _PAGE_NA 0 -#define _PAGE_RO 0 -#define _PAGE_USER 0 - #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ #define _PAGE_READ 0x00004 /* read access allowed */ @@@ -110,7 -114,7 +110,7 @@@ */ #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) /* * user access blocked by key */ @@@ -118,23 -122,34 +118,23 @@@ #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ _PAGE_RW | _PAGE_EXEC) -/* - * No page size encoding in the linux PTE - */ -#define _PAGE_PSIZE 0 /* * _PAGE_CHG_MASK masks of bits that are to be preserved across * pgprot changes */ #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) #define H_PTE_PKEY (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \ H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4) -/* - * Mask of bits returned by pte_pgprot() - */ -#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ - H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ - _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ - _PAGE_SOFT_DIRTY | H_PTE_PKEY) /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable * pages. We always set _PAGE_COHERENT when SMP is enabled or * the processor might need it for DMA coherency. */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC) /* Permission masks used to generate the __P and __S table, @@@ -144,6 -159,8 +144,6 @@@ * Write permissions imply read permissions for now (we could make write-only * pages on BookE but we don't bother for now). 
Execute permission control is * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR */ #define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) @@@ -153,6 -170,24 +153,6 @@@ #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ @@@ -426,7 -461,6 +426,7 @@@ static inline void ptep_set_wrprotect(s pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@@ -485,11 -519,7 +485,11 @@@ static inline int pte_special(pte_t pte return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SPECIAL)); } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } +static inline bool pte_exec(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_EXEC)); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline bool pte_soft_dirty(pte_t pte) @@@ -499,12 -529,12 +499,12 @@@ static inline pte_t pte_mksoft_dirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SOFT_DIRTY)); } static inline pte_t pte_clear_soft_dirty(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SOFT_DIRTY)); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ @@@ -525,7 -555,7 +525,7 @@@ static inline pte_t pte_mk_savedwrite(p */ VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) != cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED)); - return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); } #define pte_clear_savedwrite pte_clear_savedwrite @@@ -535,14 -565,14 +535,14 @@@ static inline pte_t pte_clear_savedwrit * Used by KSM subsystem to make a protnone pte readonly. 
*/ VM_BUG_ON(!pte_protnone(pte)); - return __pte(pte_val(pte) | _PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); } #else #define pte_clear_savedwrite pte_clear_savedwrite static inline pte_t pte_clear_savedwrite(pte_t pte) { VM_WARN_ON(1); - return __pte(pte_val(pte) & ~_PAGE_WRITE); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); } #endif /* CONFIG_NUMA_BALANCING */ @@@ -557,11 -587,6 +557,11 @@@ static inline int pte_present(pte_t pte return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)); } +static inline bool pte_hw_valid(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT)); +} + #ifdef CONFIG_PPC_MEM_KEYS extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute); #else @@@ -571,22 -596,25 +571,22 @@@ static inline bool arch_pte_access_perm } #endif /* CONFIG_PPC_MEM_KEYS */ +static inline bool pte_user(pte_t pte) +{ + return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED)); +} + #define pte_access_permitted pte_access_permitted static inline bool pte_access_permitted(pte_t pte, bool write) { - unsigned long pteval = pte_val(pte); - /* Also check for pte_user */ - unsigned long clear_pte_bits = _PAGE_PRIVILEGED; /* * _PAGE_READ is needed for any access and will be * cleared for PROT_NONE */ - unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_READ; - - if (write) - need_pte_bits |= _PAGE_WRITE; - - if ((pteval & need_pte_bits) != need_pte_bits) + if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) return false; - if ((pteval & clear_pte_bits) == clear_pte_bits) + if (write && !pte_write(pte)) return false; return arch_pte_access_permitted(pte_val(pte), write, 0); @@@ -615,32 -643,17 +615,32 @@@ static inline pte_t pte_wrprotect(pte_ { if (unlikely(pte_savedwrite(pte))) return pte_clear_savedwrite(pte); - return __pte(pte_val(pte) & ~_PAGE_WRITE); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); +} + +static inline pte_t pte_exprotect(pte_t pte) +{ + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_EXEC)); } static inline pte_t pte_mkclean(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_DIRTY)); } static inline pte_t pte_mkold(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_ACCESSED); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_ACCESSED)); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC)); +} + +static inline pte_t pte_mkpte(pte_t pte) +{ + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); } static inline pte_t pte_mkwrite(pte_t pte) @@@ -648,22 -661,22 +648,22 @@@ /* * write implies read, hence set both */ - return __pte(pte_val(pte) | _PAGE_RW); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_RW)); } static inline pte_t pte_mkdirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_DIRTY | _PAGE_SOFT_DIRTY)); } static inline pte_t pte_mkyoung(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_ACCESSED); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_ACCESSED)); } static inline pte_t pte_mkspecial(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SPECIAL); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL)); } static inline pte_t pte_mkhuge(pte_t pte) @@@ -673,17 -686,7 +673,17 @@@ static inline pte_t pte_mkdevmap(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); + return __pte_raw(pte_raw(pte) | 
cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP)); +} + +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); +} + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); } /* @@@ -702,8 -705,12 +702,8 @@@ static inline int pte_devmap(pte_t pte static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* FIXME!! check whether this need to be a conditional */ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); -} - -static inline bool pte_user(pte_t pte) -{ - return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED)); + return __pte_raw((pte_raw(pte) & cpu_to_be64(_PAGE_CHG_MASK)) | + cpu_to_be64(pgprot_val(newprot))); } /* Encode and de-code a swap entry */ @@@ -716,9 -723,7 +716,7 @@@ BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ } while (0) - /* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ + #define SWP_TYPE_BITS 5 #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ & ((1UL << SWP_TYPE_BITS) - 1)) @@@ -734,8 -739,6 +732,8 @@@ */ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) #define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) +#define __pmd_to_swp_entry(pmd) (__pte_to_swp_entry(pmd_pte(pmd))) +#define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) @@@ -746,7 -749,7 +744,7 @@@ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_SOFT_DIRTY)); } static inline bool pte_swp_soft_dirty(pte_t pte) @@@ -756,7 -759,7 +754,7 @@@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_SOFT_DIRTY)); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ @@@ -845,10 -848,10 +843,10 @@@ static inline pgprot_t pgprot_writecomb */ static inline bool pte_ci(pte_t pte) { - unsigned long pte_v = pte_val(pte); + __be64 pte_v = pte_raw(pte); - if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || - ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + if (((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_TOLERANT)) || + ((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_NON_IDEMPOTENT))) return true; return false; } @@@ -870,16 -873,8 +868,16 @@@ static inline int pmd_none(pmd_t pmd static inline int pmd_present(pmd_t pmd) { + /* + * A pmd is considerent present if _PAGE_PRESENT is set. + * We also need to consider the pmd present which is marked + * invalid during a split. Hence we look for _PAGE_INVALID + * if we find _PAGE_PRESENT cleared. 
+ */ + if (pmd_raw(pmd) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)) + return true; - return !pmd_none(pmd); + return false; } static inline int pmd_bad(pmd_t pmd) @@@ -906,7 -901,7 +904,7 @@@ static inline int pud_none(pud_t pud static inline int pud_present(pud_t pud) { - return !pud_none(pud); + return (pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT)); } extern struct page *pud_page(pud_t pud); @@@ -953,7 -948,7 +951,7 @@@ static inline int pgd_none(pgd_t pgd static inline int pgd_present(pgd_t pgd) { - return !pgd_none(pgd); + return (pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); } static inline pte_t pgd_pte(pgd_t pgd) @@@ -1023,16 -1018,17 +1021,16 @@@ extern struct page *pgd_page(pgd_t pgd) #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -static inline int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags) +static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { if (radix_enabled()) { #if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); #endif - return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); + return radix__map_kernel_page(ea, pa, prot, PAGE_SIZE); } - return hash__map_kernel_page(ea, pa, flags); + return hash__map_kernel_page(ea, pa, prot); } static inline int __meminit vmemmap_create_mapping(unsigned long start, @@@ -1084,12 -1080,6 +1082,12 @@@ static inline pte_t *pmdp_ptep(pmd_t *p #define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd)) #define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd))) #define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd))) + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define pmd_swp_mksoft_dirty(pmd) pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd))) +#define pmd_swp_soft_dirty(pmd) pte_swp_soft_dirty(pmd_pte(pmd)) +#define pmd_swp_clear_soft_dirty(pmd) pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd))) +#endif #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ #ifdef CONFIG_NUMA_BALANCING @@@ -1135,10 -1125,6 +1133,10 @@@ pmd_hugepage_update(struct mm_struct *m return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +/* + * returns true for pmd migration entries, THP, devmap, hugetlb + * But compile time dependent on THP config + */ static inline int pmd_large(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); @@@ -1173,22 -1159,8 +1171,22 @@@ static inline void pmdp_set_wrprotect(s pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED); } +/* + * Only returns true for a THP. False for pmd migration entry. + * We also need to return true when we come across a pte that + * in between a thp split. While splitting THP, we mark the pmd + * invalid (pmdp_invalidate()) before we set it with pte page + * address. A pmd_trans_huge() check against a pmd entry during that time + * should return true. + * We should not call this on a hugetlb entry. 
We should check for HugeTLB + * entry using vma->vm_flags + * The page table walk rule is explained in Documentation/vm/transhuge.rst + */ static inline int pmd_trans_huge(pmd_t pmd) { + if (!pmd_present(pmd)) + return false; + if (radix_enabled()) return radix__pmd_trans_huge(pmd); return hash__pmd_trans_huge(pmd); diff --combined arch/powerpc/include/asm/nohash/64/pgtable.h index 67421f74efcf,05765c2d2c1f..e77ed9761632 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@@ -89,47 -89,11 +89,47 @@@ * Include the PTE bits definitions */ #include -#include + +#define _PAGE_SAO 0 + +#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) + +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) + +#define H_PAGE_4K_PFN 0 #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} + #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) @@@ -275,7 -239,6 +275,7 @@@ static inline void ptep_set_wrprotect(s pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@@ -350,9 -313,7 +350,7 @@@ static inline void __ptep_set_access_fl #define MAX_SWAPFILES_CHECK() do { \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ } while (0) - /* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ + #define SWP_TYPE_BITS 5 #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ & ((1UL << SWP_TYPE_BITS) - 1)) @@@ -364,7 -325,8 +362,7 @@@ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) __pte((x).val) -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); extern int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); diff --combined drivers/pci/hotplug/acpiphp.h index cf3058404f41,14cef4cf592b..a2094c07af6a --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@@ -8,7 -8,7 +8,7 @@@ * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) * Copyright (C) 2002,2003 NEC Corporation - * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) + * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org) * Copyright (C) 2003-2005 Hewlett Packard * * All rights reserved. 
@@@ -33,19 -33,15 +33,19 @@@ struct acpiphp_slot * struct slot - slot information for each *physical* slot */ struct slot { - struct hotplug_slot *hotplug_slot; + struct hotplug_slot hotplug_slot; struct acpiphp_slot *acpi_slot; - struct hotplug_slot_info info; unsigned int sun; /* ACPI _SUN (Slot User Number) value */ }; static inline const char *slot_name(struct slot *slot) { - return hotplug_slot_name(slot->hotplug_slot); + return hotplug_slot_name(&slot->hotplug_slot); +} + +static inline struct slot *to_slot(struct hotplug_slot *hotplug_slot) +{ + return container_of(hotplug_slot, struct slot, hotplug_slot); } /* diff --combined drivers/pci/hotplug/acpiphp_core.c index c9e2bd40c038,0447b169e7ff..853e04ad272c --- a/drivers/pci/hotplug/acpiphp_core.c +++ b/drivers/pci/hotplug/acpiphp_core.c @@@ -8,7 -8,7 +8,7 @@@ * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) * Copyright (C) 2002,2003 NEC Corporation - * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) + * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org) * Copyright (C) 2003-2005 Hewlett Packard * * All rights reserved. @@@ -40,7 -40,7 +40,7 @@@ bool acpiphp_disabled static struct acpiphp_attention_info *attention_info; #define DRIVER_VERSION "0.5" - #define DRIVER_AUTHOR "Greg Kroah-Hartman , Takayoshi Kochi , Matthew Wilcox " + #define DRIVER_AUTHOR "Greg Kroah-Hartman , Takayoshi Kochi , Matthew Wilcox " #define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver" MODULE_AUTHOR(DRIVER_AUTHOR); @@@ -57,7 -57,7 +57,7 @@@ static int get_attention_status(struct static int get_latch_status(struct hotplug_slot *slot, u8 *value); static int get_adapter_status(struct hotplug_slot *slot, u8 *value); -static struct hotplug_slot_ops acpi_hotplug_slot_ops = { +static const struct hotplug_slot_ops acpi_hotplug_slot_ops = { .enable_slot = enable_slot, .disable_slot = disable_slot, .set_attention_status = set_attention_status, @@@ -118,7 -118,7 +118,7 @@@ EXPORT_SYMBOL_GPL(acpiphp_unregister_at */ static int enable_slot(struct hotplug_slot *hotplug_slot) { - struct slot *slot = hotplug_slot->private; + struct slot *slot = to_slot(hotplug_slot); pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot)); @@@ -135,7 -135,7 +135,7 @@@ */ static int disable_slot(struct hotplug_slot *hotplug_slot) { - struct slot *slot = hotplug_slot->private; + struct slot *slot = to_slot(hotplug_slot); pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot)); @@@ -179,7 -179,7 +179,7 @@@ static int set_attention_status(struct */ static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value) { - struct slot *slot = hotplug_slot->private; + struct slot *slot = to_slot(hotplug_slot); pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot)); @@@ -225,7 -225,7 +225,7 @@@ static int get_attention_status(struct */ static int get_latch_status(struct hotplug_slot *hotplug_slot, u8 *value) { - struct slot *slot = hotplug_slot->private; + struct slot *slot = to_slot(hotplug_slot); pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot)); @@@ -245,7 -245,7 +245,7 @@@ */ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value) { - struct slot *slot = hotplug_slot->private; + struct slot *slot = to_slot(hotplug_slot); pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot)); @@@ -266,26 -266,39 +266,26 @@@ int acpiphp_register_hotplug_slot(struc if (!slot) goto error; - slot->hotplug_slot = 
kzalloc(sizeof(*slot->hotplug_slot), GFP_KERNEL); - if (!slot->hotplug_slot) - goto error_slot; - - slot->hotplug_slot->info = &slot->info; - - slot->hotplug_slot->private = slot; - slot->hotplug_slot->ops = &acpi_hotplug_slot_ops; + slot->hotplug_slot.ops = &acpi_hotplug_slot_ops; slot->acpi_slot = acpiphp_slot; - slot->hotplug_slot->info->power_status = acpiphp_get_power_status(slot->acpi_slot); - slot->hotplug_slot->info->attention_status = 0; - slot->hotplug_slot->info->latch_status = acpiphp_get_latch_status(slot->acpi_slot); - slot->hotplug_slot->info->adapter_status = acpiphp_get_adapter_status(slot->acpi_slot); acpiphp_slot->slot = slot; slot->sun = sun; snprintf(name, SLOT_NAME_SIZE, "%u", sun); - retval = pci_hp_register(slot->hotplug_slot, acpiphp_slot->bus, + retval = pci_hp_register(&slot->hotplug_slot, acpiphp_slot->bus, acpiphp_slot->device, name); if (retval == -EBUSY) - goto error_hpslot; + goto error_slot; if (retval) { pr_err("pci_hp_register failed with error %d\n", retval); - goto error_hpslot; + goto error_slot; } pr_info("Slot [%s] registered\n", slot_name(slot)); return 0; -error_hpslot: - kfree(slot->hotplug_slot); error_slot: kfree(slot); error: @@@ -299,7 -312,8 +299,7 @@@ void acpiphp_unregister_hotplug_slot(st pr_info("Slot [%s] unregistered\n", slot_name(slot)); - pci_hp_deregister(slot->hotplug_slot); - kfree(slot->hotplug_slot); + pci_hp_deregister(&slot->hotplug_slot); kfree(slot); } diff --combined fs/btrfs/compression.c index 8703ce68fe9d,a65d144da00c..2955a4ea2fa8 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@@ -437,10 -437,8 +437,8 @@@ static noinline int add_ra_bio_pages(st if (pg_index > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, pg_index); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, pg_index); + if (page && !xa_is_value(page)) { misses++; if (misses > 4) break; @@@ -528,6 -526,7 +526,6 @@@ blk_status_t btrfs_submit_compressed_re int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree; struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned long compressed_len; @@@ -544,6 -543,7 +542,6 @@@ int faili = 0; u32 *sums; - tree = &BTRFS_I(inode)->io_tree; em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ diff --combined fs/btrfs/extent_io.c index 6877a74c7469,d4ad015e4485..d228f706ff3e --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@@ -1424,15 -1424,20 +1424,15 @@@ int find_first_extent_bit(struct extent struct extent_state **cached_state) { struct extent_state *state; - struct rb_node *n; int ret = 1; spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; if (state->end == start - 1 && extent_state_in_tree(state)) { - n = rb_next(&state->rb_node); - while (n) { - state = rb_entry(n, struct extent_state, - rb_node); + while ((state = next_state(state)) != NULL) { if (state->state & bits) goto got_it; - n = rb_next(n); } free_extent_state(*cached_state); *cached_state = NULL; @@@ -1563,7 -1568,7 +1563,7 @@@ static noinline int lock_delalloc_pages * * 1 is returned if we find something, 0 if nothing was in the tree */ -STATIC u64 find_lock_delalloc_range(struct inode *inode, +static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes) @@@ -1643,17 -1648,6 
+1643,17 @@@ out_failed return found; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +u64 btrfs_find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) +{ + return find_lock_delalloc_range(inode, tree, locked_page, start, end, + max_bytes); +} +#endif + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@@ -3784,7 -3778,7 +3784,7 @@@ int btree_write_cache_pages(struct addr pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@@ -3909,7 -3903,7 +3909,7 @@@ static int extent_write_cache_pages(str pgoff_t done_index; int range_whole = 0; int scanned = 0; - int tag; + xa_mark_t tag; /* * We have to hold onto the inode so that ordered extents can do their @@@ -5159,11 -5153,9 +5159,9 @@@ void clear_extent_buffer_dirty(struct e clear_page_dirty_for_io(page); xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - } + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); @@@ -5171,11 -5163,11 +5169,11 @@@ WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_buffer *eb) +bool set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; - int was_dirty = 0; + bool was_dirty; check_buffer_tree_ref(eb); @@@ -5185,15 -5177,8 +5183,15 @@@ WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + if (!was_dirty) + for (i = 0; i < num_pages; i++) + set_page_dirty(eb->pages[i]); + +#ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + ASSERT(PageDirty(eb->pages[i])); +#endif + return was_dirty; } diff --combined fs/buffer.c index 109f55196866,1286c2b95498..d60d61e8ed7d --- a/fs/buffer.c +++ b/fs/buffer.c @@@ -562,7 -562,7 +562,7 @@@ void mark_buffer_dirty_inode(struct buf EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * Mark the page dirty, and set it dirty in the page cache, and mark the inode * dirty. * * If warn is true, then emit a warning if the page is not uptodate and has @@@ -579,8 -579,8 +579,8 @@@ void __set_page_dirty(struct page *page if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@@ -1050,7 -1050,7 +1050,7 @@@ __getblk_slow(struct block_device *bdev * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. + * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. 
If the page has buffers, the page dirty bit is @@@ -1073,9 -1073,9 +1073,9 @@@ * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty + * mark_buffer_dirty() will set the dirty bit against the buffer, then set + * its backing page dirty, then tag the page as dirty in the page cache + * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, @@@ -3060,6 -3060,11 +3060,6 @@@ static int submit_bh_wbc(int op, int op */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@@ -3079,11 -3084,6 +3079,11 @@@ op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --combined fs/ext4/inode.c index c3d9a42c561e,57bad3edfbed..05f01fbd9c7f --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@@ -577,8 -577,8 +577,8 @@@ int ext4_map_blocks(handle_t *handle, s EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@@ -701,8 -701,8 +701,8 @@@ found EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@@ -1595,7 -1595,7 +1595,7 @@@ static int ext4_da_reserve_space(struc return 0; /* success */ } -static void ext4_da_release_space(struct inode *inode, int to_free) +void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@@ -1634,11 -1634,13 +1634,11 @@@ static void ext4_da_page_release_reserv unsigned int offset, unsigned int length) { - int to_release = 0, contiguous_blks = 0; + int contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int stop = offset + length; - int num_clusters; ext4_fsblk_t lblk; BUG_ON(stop > PAGE_SIZE || stop < length); @@@ -1652,6 -1654,7 +1652,6 @@@ break; if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; contiguous_blks++; clear_buffer_delay(bh); } else if (contiguous_blks) { @@@ -1659,7 -1662,7 +1659,7 @@@ (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); contiguous_blks = 
0; } curr_off = next_off; @@@ -1668,9 -1671,21 +1668,9 @@@ if (contiguous_blks) { lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); } - /* If we have released all the blocks belonging to a cluster, then we - * need to release the reserved space for that cluster. */ - num_clusters = EXT4_NUM_B2C(sbi, to_release); - while (num_clusters > 0) { - lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + - ((num_clusters - 1) << sbi->s_cluster_bits); - if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) - ext4_da_release_space(inode, 1); - - num_clusters--; - } } /* @@@ -1765,65 -1780,6 +1765,65 @@@ static int ext4_bh_delay_or_unwritten(h return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } +/* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block + * count or making a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * + * Returns 0 on success, negative error code on failure. + */ +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + bool allocated = false; + + /* + * If the cluster containing lblk is shared with a delayed, + * written, or unwritten extent in a bigalloc file system, it's + * already been accounted for and does not need to be reserved. + * A pending reservation must be made for the cluster if it's + * shared with a written or unwritten extent and doesn't already + * have one. Written and unwritten extents can be purged from the + * extents status tree if the system is under memory pressure, so + * it's necessary to examine the extent tree if a search of the + * extents status tree doesn't get a match. + */ + if (sbi->s_cluster_ratio == 1) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { /* bigalloc */ + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { + if (!ext4_es_scan_clu(inode, + &ext4_es_is_mapped, lblk)) { + ret = ext4_clu_mapped(inode, + EXT4_B2C(sbi, lblk)); + if (ret < 0) + goto errout; + if (ret == 0) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { + allocated = true; + } + } else { + allocated = true; + } + } + } + + ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + +errout: + return ret; +} + /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write @@@ -1903,14 -1859,28 +1903,14 @@@ static int ext4_da_map_blocks(struct in add_delayed: if (retval == 0) { int ret; + /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* - * If the block was allocated from previously allocated cluster, - * then we don't need to reserve it again. However we still need - * to reserve metadata for every block we're going to write. 
- */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } - } - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - ~0, EXTENT_STATUS_DELAYED); - if (ret) { + ret = ext4_insert_delayed_block(inode, map->m_lblk); + if (ret != 0) { retval = ret; goto out_unlock; } @@@ -2643,7 -2613,7 +2643,7 @@@ static int mpage_prepare_extent_to_map( long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; - int tag; + xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; @@@ -3480,8 -3450,7 +3480,8 @@@ static int ext4_iomap_begin(struct inod ext4_lblk_t end = map.m_lblk + map.m_len - 1; struct extent_status es; - ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map.m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) { /* entire range is a hole */ @@@ -6184,14 -6153,13 +6184,14 @@@ static int ext4_bh_unmapped(handle_t *h return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_fault *vmf) +vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; - int ret; + int err; + vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; @@@ -6204,8 -6172,8 +6204,8 @@@ down_read(&EXT4_I(inode)->i_mmap_sem); - ret = ext4_convert_inline_data(inode); - if (ret) + err = ext4_convert_inline_data(inode); + if (err) goto out_ret; /* Delalloc case is easy... */ @@@ -6213,9 -6181,9 +6213,9 @@@ !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { - ret = block_page_mkwrite(vma, vmf, + err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); - } while (ret == -ENOSPC && + } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } @@@ -6260,8 -6228,8 +6260,8 @@@ retry_alloc ret = VM_FAULT_SIGBUS; goto out; } - ret = block_page_mkwrite(vma, vmf, get_block); - if (!ret && ext4_should_journal_data(inode)) { + err = block_page_mkwrite(vma, vmf, get_block); + if (!err && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); @@@ -6272,24 -6240,24 +6272,24 @@@ ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(ret); + ret = block_page_mkwrite_return(err); out: up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } -int ext4_filemap_fault(struct vm_fault *vmf) +vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - int err; + vm_fault_t ret; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vmf); + ret = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); - return err; + return ret; } diff --combined fs/f2fs/data.c index 106f116466bf,6962491172a5..b293cb3e27a2 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@@ -1,9 -1,12 +1,9 @@@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
* http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include @@@ -46,29 -49,12 +46,29 @@@ static bool __is_cp_guaranteed(struct p inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && - is_inode_flag_set(inode, FI_ATOMIC_FILE)) || + (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || is_cold_data(page)) return true; return false; } +static enum count_type __read_io_type(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi)) + return F2FS_RD_META; + + if (inode->i_ino == F2FS_NODE_INO(sbi)) + return F2FS_RD_NODE; + } + return F2FS_RD_DATA; +} + /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, @@@ -94,12 -80,10 +94,12 @@@ static void __read_end_io(struct bio *b /* PG_error was set if any post_read step failed */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); - SetPageError(page); + /* will re-read again later */ + ClearPageError(page); } else { SetPageUptodate(page); } + dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } if (bio->bi_private) @@@ -142,9 -126,8 +142,9 @@@ static bool f2fs_bio_post_read_required static void f2fs_read_end_io(struct bio *bio) { - if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { - f2fs_show_injection_info(FAULT_IO); + if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), + FAULT_READ_IO)) { + f2fs_show_injection_info(FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } @@@ -165,11 -148,6 +165,11 @@@ static void f2fs_write_end_io(struct bi struct bio_vec *bvec; int i; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { + f2fs_show_injection_info(FAULT_WRITE_IO); + bio->bi_status = BLK_STS_IOERR; + } + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@@ -341,8 -319,8 +341,8 @@@ static void __submit_merged_bio(struct io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, - struct inode *inode, nid_t ino, pgoff_t idx) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) { struct bio_vec *bvec; struct page *target; @@@ -351,7 -329,7 +351,7 @@@ if (!io->bio) return false; - if (!inode && !ino) + if (!inode && !page && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@@ -361,10 -339,11 +361,10 @@@ else target = fscrypt_control_page(bvec->bv_page); - if (idx != target->index) - continue; - if (inode && inode == target->mapping->host) return true; + if (page && page == target) + return true; if (ino && ino == ino_of_node(target)) return true; } @@@ -373,8 -352,7 +373,8 @@@ } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - nid_t ino, pgoff_t idx, enum page_type type) + struct page *page, nid_t ino, + enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); enum temp_type temp; @@@ -385,7 -363,7 +385,7 @@@ io = sbi->write_io[btype] + temp; down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, ino, idx); + ret = __has_merged_page(io, inode, page, ino); up_read(&io->io_rwsem); /* TODO: use HOT temp only for meta pages now. 
*/ @@@ -416,12 -394,12 +416,12 @@@ static void __f2fs_submit_merged_write( } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, bool force) + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, bool force) { enum temp_type temp; - if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + if (!force && !has_merged_page(sbi, inode, page, ino, type)) return; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { @@@ -440,10 -418,10 +440,10 @@@ void f2fs_submit_merged_write(struct f2 } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type) + struct inode *inode, struct page *page, + nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, ino, idx, type, false); + __submit_merged_write_cond(sbi, inode, page, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@@ -478,16 -456,12 +478,16 @@@ int f2fs_submit_page_bio(struct f2fs_io bio_put(bio); return -EFAULT; } + + if (fio->io_wbc && !is_read_io(fio->op)) + wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, bio, fio->type); + inc_page_count(fio->sbi, is_read_io(fio->op) ? + __read_io_type(page): WB_DATA_TYPE(fio->page)); - if (!is_read_io(fio->op)) - inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@@ -559,9 -533,6 +559,9 @@@ skip if (fio->in_list) goto next; out: + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_is_checkpoint_ready(sbi)) + __submit_merged_bio(io); up_write(&io->io_rwsem); } @@@ -594,6 -565,9 +594,6 @@@ static struct bio *f2fs_grab_read_bio(s ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_block_writeback(sbi, blkaddr); } return bio; @@@ -608,15 -582,10 +608,15 @@@ static int f2fs_submit_page_read(struc if (IS_ERR(bio)) return PTR_ERR(bio); + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, blkaddr); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + ClearPageError(page); + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@@ -907,6 -876,7 +907,6 @@@ static int __allocate_data_block(struc struct f2fs_summary sum; struct node_info ni; block_t old_blkaddr; - pgoff_t fofs; blkcnt_t count = 1; int err; @@@ -919,7 -889,7 +919,7 @@@ dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); - if (dn->data_blkaddr == NEW_ADDR) + if (dn->data_blkaddr != NULL_ADDR) goto alloc; if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) @@@ -935,10 -905,12 +935,10 @@@ alloc old_blkaddr, old_blkaddr); f2fs_set_data_blkaddr(dn); - /* update i_size */ - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) - f2fs_i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_SHIFT)); + /* + * i_size will be updated by direct_IO. Otherwise, we'll get stale + * data from unwritten block via dio_read. + */ return 0; } @@@ -973,7 -945,7 +973,7 @@@ int f2fs_preallocate_blocks(struct kioc if (direct_io) { map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, WRITE) ? 
+ flag = f2fs_force_buffered_io(inode, iocb, from) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@@ -998,7 -970,7 +998,7 @@@ map_blocks return err; } -static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) @@@ -1053,11 -1025,6 +1053,11 @@@ int f2fs_map_blocks(struct inode *inode map->m_flags = F2FS_MAP_MAPPED; if (map->m_next_extent) *map->m_next_extent = pgofs + map->m_len; + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); goto out; } @@@ -1097,15 -1064,7 +1097,15 @@@ next_block goto sync_out; } - if (!is_valid_data_blkaddr(sbi, blkaddr)) { + if (is_valid_data_blkaddr(sbi, blkaddr)) { + /* use out-place-update for driect IO under LFS mode */ + if (test_opt(sbi, LFS) && create && + flag == F2FS_GET_BLOCK_DIO) { + err = __allocate_data_block(&dn, map->m_seg_type); + if (!err) + set_inode_flag(inode, FI_APPEND_WRITE); + } + } else { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@@ -1117,8 -1076,6 +1117,8 @@@ last_ofs_in_node = dn.ofs_in_node; } } else { + WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && + flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); if (!err) @@@ -1216,12 -1173,6 +1216,12 @@@ skip goto next_dnode; sync_out: + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; @@@ -1304,7 -1255,7 +1304,7 @@@ static int get_data_block_dio(struct in struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DEFAULT, NULL, + F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type( inode->i_write_hint)); } @@@ -1607,17 -1558,9 +1607,17 @@@ submit_and_realloc } } + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. 
+ */ + f2fs_wait_on_block_writeback(inode, block_nr); + if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + ClearPageError(page); last_block_in_bio = block_nr; goto next_page; set_error_page: @@@ -1682,7 -1625,7 +1682,7 @@@ static int encrypt_one_page(struct f2fs return 0; /* wait for GCed page writeback via META_MAPPING */ - f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@@ -1739,10 -1682,6 +1739,10 @@@ static inline bool check_inplace_update is_inode_flag_set(inode, FI_NEED_IPU)) return true; + if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + return false; } @@@ -1766,8 -1705,6 +1766,8 @@@ bool f2fs_should_update_outplace(struc return true; if (S_ISDIR(inode->i_mode)) return true; + if (IS_NOQUOTA(inode)) + return true; if (f2fs_is_atomic_file(inode)) return true; if (fio) { @@@ -1775,9 -1712,6 +1775,9 @@@ return true; if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; } return false; } @@@ -1829,7 -1763,6 +1829,7 @@@ int f2fs_do_write_data_page(struct f2fs /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); + clear_cold_data(page); goto out_writepage; } got_it: @@@ -2005,20 -1938,18 +2005,20 @@@ done out: inode_dec_dirty_pages(inode); - if (err) + if (err) { ClearPageUptodate(page); + clear_cold_data(page); + } if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; } unlock_page(page); - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode)) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@@ -2069,10 -2000,10 +2069,10 @@@ static int f2fs_write_cache_pages(struc pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; + int nwritten = 0; pagevec_init(&pvec); @@@ -2175,7 -2106,7 +2175,7 @@@ continue_unlock done = 1; break; } else if (submitted) { - last_idx = page->index; + nwritten++; } if (--wbc->nr_to_write <= 0 && @@@ -2197,9 -2128,9 +2197,9 @@@ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (last_idx != ULONG_MAX) + if (nwritten) f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, - 0, last_idx, DATA); + NULL, 0, DATA); return ret; } @@@ -2209,8 -2140,6 +2209,8 @@@ static inline bool __should_serialize_i { if (!S_ISREG(inode->i_mode)) return false; + if (IS_NOQUOTA(inode)) + return false; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) @@@ -2240,8 -2169,7 +2240,8 @@@ static int __f2fs_write_data_pages(stru if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && + wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto 
skip_write; @@@ -2306,7 -2234,7 +2306,7 @@@ static void f2fs_write_failed(struct ad down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@@ -2404,10 -2332,6 +2404,10 @@@ static int f2fs_write_begin(struct fil trace_f2fs_write_begin(inode, pos, len, flags); + err = f2fs_is_checkpoint_ready(sbi); + if (err) + goto fail; + if ((f2fs_is_atomic_file(inode) && !f2fs_available_free_memory(sbi, INMEM_PAGES)) || is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { @@@ -2445,8 -2369,7 +2445,8 @@@ repeat if (err) goto fail; - if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + if (need_balance && !IS_NOQUOTA(inode) && + has_not_enough_free_secs(sbi, 0, 0)) { unlock_page(page); f2fs_balance_fs(sbi, true); lock_page(page); @@@ -2459,6 -2382,10 +2459,6 @@@ f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed page writeback via META_MAPPING */ - if (f2fs_post_read_required(inode)) - f2fs_wait_on_block_writeback(sbi, blkaddr); - if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@@ -2553,53 -2480,36 +2553,53 @@@ static ssize_t f2fs_direct_IO(struct ki struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; int rw = iov_iter_rw(iter); int err; enum rw_hint hint = iocb->ki_hint; int whint_mode = F2FS_OPTION(sbi).whint_mode; + bool do_opu; err = check_direct_IO(inode, iter, offset); if (err) return err < 0 ? err : 0; - if (f2fs_force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, iocb, iter)) return 0; + do_opu = allow_outplace_dio(inode, iocb, iter); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { + up_read(&fi->i_gc_rwsem[rw]); + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + } else { + down_read(&fi->i_gc_rwsem[rw]); + if (do_opu) + down_read(&fi->i_gc_rwsem[READ]); } err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); - up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + + if (do_opu) + up_read(&fi->i_gc_rwsem[READ]); + + up_read(&fi->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) @@@ -2607,8 -2517,7 +2607,8 @@@ if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); - set_inode_flag(inode, FI_UPDATE_WRITE); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); } else if (err < 0) { f2fs_write_failed(mapping, offset + count); } @@@ -2641,8 -2550,6 +2641,8 @@@ void f2fs_invalidate_page(struct page * } } + clear_cold_data(page); + /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); @@@ -2661,7 -2568,6 +2661,7 @@@ int f2fs_release_page(struct page *page if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + clear_cold_data(page); set_page_private(page, 0); ClearPagePrivate(page); return 1; @@@ -2677,6 -2583,10 +2677,6 @@@ static int 
f2fs_set_data_page_dirty(str if (!PageUptodate(page)) SetPageUptodate(page); - /* don't remain PG_checked flag which was set during GC */ - if (is_cold_data(page)) - clear_cold_data(page); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); @@@ -2787,13 -2697,13 +2787,13 @@@ const struct address_space_operations f #endif }; - void f2fs_clear_radix_tree_dirty_tag(struct page *page) + void f2fs_clear_page_cache_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } diff --combined fs/f2fs/dir.c index 2ef84b4590ea,01006085904a..bacc667950b6 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@@ -1,9 -1,12 +1,9 @@@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include @@@ -655,9 -658,9 +655,9 @@@ int f2fs_do_tmpfile(struct inode *inode f2fs_put_page(page, 1); clear_inode_flag(inode, FI_NEW_INODE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: up_write(&F2FS_I(inode)->i_sem); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } @@@ -726,11 -729,10 +726,11 @@@ void f2fs_delete_entry(struct f2fs_dir_ if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); + clear_cold_data(page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); } @@@ -782,15 -784,9 +782,15 @@@ int f2fs_fill_dentries(struct dir_conte struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); + struct blk_plug plug; + bool readdir_ra = sbi->readdir_ra == 1; + int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); + if (readdir_ra) + blk_start_plug(&plug); + while (bit_pos < d->max) { bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); if (bit_pos >= d->max) @@@ -810,33 -806,29 +810,33 @@@ if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int err; err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return err; + goto out; de_name = *fstr; fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, - le32_to_cpu(de->ino), d_type)) - return 1; + le32_to_cpu(de->ino), d_type)) { + err = 1; + goto out; + } - if (sbi->readdir_ra == 1) + if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return 0; +out: + if (readdir_ra) + blk_finish_plug(&plug); + return err; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) diff --combined fs/f2fs/f2fs.h index 56204a8f8a12,3ccb8ed84fae..1e031971a466 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@@ -1,14 -1,16 +1,14 @@@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
* http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H +#include #include #include #include @@@ -51,10 -53,9 +51,10 @@@ enum FAULT_DIR_DEPTH, FAULT_EVICT_INODE, FAULT_TRUNCATE, - FAULT_IO, + FAULT_READ_IO, FAULT_CHECKPOINT, FAULT_DISCARD, + FAULT_WRITE_IO, FAULT_MAX, }; @@@ -99,7 -100,6 +99,7 @@@ extern char *f2fs_fault_name[FAULT_MAX] #define F2FS_MOUNT_QUOTA 0x00400000 #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 +#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@@ -150,7 -150,6 +150,7 @@@ struct f2fs_mount_info #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 #define F2FS_FEATURE_VERITY 0x0400 /* reserved */ +#define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@@ -179,7 -178,6 +179,7 @@@ enum #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 +#define CP_PAUSE 0x00000040 #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ @@@ -189,7 -187,6 +189,7 @@@ #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@@ -206,7 -203,6 +206,7 @@@ enum META_NAT, META_SIT, META_SSA, + META_MAX, META_POR, DATA_GENERIC, META_GENERIC, @@@ -328,7 -324,7 +328,7 @@@ struct discard_cmd_control atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ - struct rb_root root; /* root of discard rb-tree */ + struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ }; @@@ -531,9 -527,6 +531,9 @@@ enum #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +/* maximum retry quota flush count */ +#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@@ -573,13 -566,12 +573,13 @@@ struct extent_node struct extent_tree { nid_t ino; /* inode number */ - struct rb_root root; /* root of extent info rb-tree */ + struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ + bool largest_updated; /* largest extent updated */ }; /* @@@ -608,7 -600,6 +608,7 @@@ enum F2FS_GET_BLOCK_DEFAULT, F2FS_GET_BLOCK_FIEMAP, F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_DIO, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, F2FS_GET_BLOCK_PRECACHE, @@@ -763,12 -754,12 +763,12 @@@ static inline bool __is_front_mergeable } extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct inode *inode, - struct extent_tree *et, struct extent_node *en) +static inline void 
__try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode, true); + et->largest_updated = true; } } @@@ -953,9 -944,6 +953,9 @@@ enum count_type F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, + F2FS_RD_DATA, + F2FS_RD_NODE, + F2FS_RD_META, NR_COUNT_TYPE, }; @@@ -1100,19 -1088,11 +1100,19 @@@ enum SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ SBI_IS_SHUTDOWN, /* shutdown by ioctl */ + SBI_IS_RECOVERED, /* recovered orphan/data */ + SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ + SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ + SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ }; enum { CP_TIME, REQ_TIME, + DISCARD_TIME, + GC_TIME, + DISABLE_TIME, MAX_TIME, }; @@@ -1229,6 -1209,7 +1229,6 @@@ struct f2fs_sb_info unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ - unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ block_t user_block_count; /* # of user blocks */ @@@ -1238,9 -1219,6 +1238,9 @@@ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + /* Additional tracking for no checkpoint mode */ + block_t unusable_block_count; /* # of blocks saved by last cp */ + unsigned int nquota_files; /* # of quota sysfile */ u32 s_next_generation; /* for NFS support */ @@@ -1279,7 -1257,6 +1279,7 @@@ */ #ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ + atomic_t meta_count[META_MAX]; /* # of meta blocks */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ @@@ -1295,8 -1272,6 +1295,8 @@@ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ + unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ + unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif spinlock_t stat_lock; /* lock for stat operations */ @@@ -1331,9 -1306,9 +1331,9 @@@ }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(type) \ - printk("%sF2FS-fs : inject %s in %s of %pF\n", \ - KERN_INFO, f2fs_fault_name[type], \ +#define f2fs_show_injection_info(type) \ + printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { @@@ -1369,15 -1344,7 +1369,15 @@@ static inline bool time_to_inject(struc static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { - sbi->last_time[type] = jiffies; + unsigned long now = jiffies; + + sbi->last_time[type] = now; + + /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ + if (type == REQ_TIME) { + sbi->last_time[DISCARD_TIME] = now; + sbi->last_time[GC_TIME] = now; + } } static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) @@@ -1387,18 -1354,16 +1387,18 @@@ return time_after(jiffies, sbi->last_time[type] + interval); } -static inline bool is_idle(struct f2fs_sb_info *sbi) +static inline 
unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, + int type) { - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; + unsigned long interval = sbi->interval_time[type] * HZ; + unsigned int wait_ms = 0; + long delta; - if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) - return false; + delta = (sbi->last_time[type] + interval) - jiffies; + if (delta > 0) + wait_ms = jiffies_to_msecs(delta); - return f2fs_time_over(sbi, REQ_TIME); + return wait_ms; } /* @@@ -1739,8 -1704,7 +1739,8 @@@ static inline int inc_valid_block_count if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; - + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + avail_user_block_count -= sbi->unusable_block_count; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; if (diff > *count) @@@ -1791,9 -1755,7 +1791,9 @@@ static inline void inc_page_count(struc atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || + count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || + count_type == F2FS_RD_META) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@@ -1929,18 -1891,12 +1929,18 @@@ static inline int inc_valid_node_count( { block_t valid_block_count; unsigned int valid_node_count; - bool quota = inode && !is_inode; + int err; - if (quota) { - int ret = dquot_reserve_block(inode, 1); - if (ret) - return ret; + if (is_inode) { + if (inode) { + err = dquot_alloc_inode(inode); + if (err) + return err; + } + } else { + err = dquot_reserve_block(inode, 1); + if (err) + return err; } if (time_to_inject(sbi, FAULT_BLOCK)) { @@@ -1955,8 -1911,6 +1955,8 @@@ if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + valid_block_count += sbi->unusable_block_count; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); @@@ -1984,12 -1938,8 +1984,12 @@@ return 0; enospc: - if (quota) + if (is_inode) { + if (inode) + dquot_free_inode(inode); + } else { dquot_release_reservation_block(inode, 1); + } return -ENOSPC; } @@@ -2010,9 -1960,7 +2010,9 @@@ static inline void dec_valid_node_count spin_unlock(&sbi->stat_lock); - if (!is_inode) + if (is_inode) + dquot_free_inode(inode); + else f2fs_i_blocks_write(inode, 1, false, true); } @@@ -2142,15 -2090,6 +2142,15 @@@ static inline struct bio *f2fs_bio_allo return bio_alloc(GFP_KERNEL, npages); } +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || + get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || + get_pages(sbi, F2FS_WB_CP_DATA)) + return false; + return f2fs_time_over(sbi, type); +} + static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { @@@ -2800,8 -2739,7 +2800,8 @@@ static inline bool is_valid_data_blkadd */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, + bool 
buf_write); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@@ -2811,7 -2749,6 +2811,7 @@@ void f2fs_truncate_data_blocks_range(st int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); int f2fs_pin_file_control(struct inode *inode, bool inc); /* @@@ -2890,7 -2827,6 +2890,7 @@@ static inline int f2fs_add_link(struct int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); +int f2fs_quota_sync(struct super_block *sb, int type); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@@ -2933,7 -2869,7 +2933,7 @@@ struct page *f2fs_new_node_page(struct void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); -void f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_move_node_page(struct page *node_page, int gc_type); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); @@@ -2950,7 -2886,7 +2950,7 @@@ int f2fs_recover_xattr_data(struct inod int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_node_manager_caches(void); @@@ -2978,8 -2914,6 +2978,8 @@@ void f2fs_stop_discard_thread(struct f2 bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); @@@ -3008,9 -2942,7 +3008,9 @@@ void f2fs_allocate_data_block(struct f2 struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, @@@ -3070,8 -3002,8 +3070,8 @@@ int f2fs_init_post_read_processing(void void f2fs_destroy_post_read_processing(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void 
f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type); + struct inode *inode, struct page *page, + nid_t ino, enum page_type type); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); @@@ -3093,7 -3025,6 +3093,7 @@@ struct page *f2fs_get_lock_data_page(st struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); +void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@@ -3108,7 -3039,7 +3108,7 @@@ int f2fs_migrate_page(struct address_sp struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); - void f2fs_clear_radix_tree_dirty_tag(struct page *page); + void f2fs_clear_page_cache_dirty_tag(struct page *page); /* * gc.c @@@ -3146,8 -3077,6 +3146,8 @@@ struct f2fs_stat_info int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_rd_data, nr_rd_node, nr_rd_meta; + unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; int nr_discard_cmd; @@@ -3169,7 -3098,6 +3169,7 @@@ int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; + unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; @@@ -3185,8 -3113,6 +3185,8 @@@ static inline struct f2fs_stat_info *F2 #define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) +#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) @@@ -3223,17 -3149,6 +3223,17 @@@ if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) +#define stat_inc_meta_count(sbi, blkaddr) \ + do { \ + if (blkaddr < SIT_I(sbi)->sit_base_addr) \ + atomic_inc(&(sbi)->meta_count[META_CP]); \ + else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SIT]); \ + else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_NAT]); \ + else if (blkaddr < SM_I(sbi)->main_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SSA]); \ + } while (0) #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ @@@ -3303,8 -3218,6 +3303,8 @@@ void f2fs_destroy_root_stats(void) #define stat_inc_bg_cp_count(si) do { } while (0) #define stat_inc_call_count(si) do { } while (0) #define stat_inc_bggc_count(si) do { } while (0) +#define stat_io_skip_bggc_count(sbi) do { } while (0) +#define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) #define stat_inc_total_hit(sb) do { } while (0) @@@ -3323,7 -3236,6 +3323,7 @@@ #define stat_inc_volatile_write(inode) 
do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) #define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) @@@ -3393,19 -3305,18 +3393,19 @@@ void f2fs_leave_shrinker(struct f2fs_sb /* * extent_cache.c */ -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root *root, struct rb_node **parent, - unsigned int ofs); -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost); +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force); + bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root *root); + struct rb_root_cached *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); @@@ -3445,7 -3356,7 +3445,7 @@@ static inline void f2fs_set_encrypted_i { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); - inode->i_flags |= S_ENCRYPTED; + f2fs_set_inode_flags(inode); #endif } @@@ -3473,7 -3384,6 +3473,7 @@@ F2FS_FEATURE_FUNCS(flexible_inline_xatt F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, @@@ -3489,20 -3399,11 +3489,20 @@@ } #endif -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + return f2fs_sb_has_blkzoned(sbi->sb); +} - return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)); +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@@ -3531,50 -3432,11 +3531,50 @@@ static inline bool f2fs_may_encrypt(str #endif } -static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +static inline int block_unaligned_IO(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) { - return (f2fs_post_read_required(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); + unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned int blocksize_mask = (1 << i_blkbits) - 1; + loff_t offset = iocb->ki_pos; + unsigned long align = offset | iov_iter_alignment(iter); + + return align & blocksize_mask; +} + +static inline int allow_outplace_dio(struct inode *inode, + struct kiocb *iocb, struct iov_iter 
*iter) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int rw = iov_iter_rw(iter); + + return (test_opt(sbi, LFS) && (rw == WRITE) && + !block_unaligned_IO(inode, iocb, iter)); +} + +static inline bool f2fs_force_buffered_io(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int rw = iov_iter_rw(iter); + + if (f2fs_post_read_required(inode)) + return true; + if (sbi->s_ndevs) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi->sb)) + return true; + if (test_opt(sbi, LFS) && (rw == WRITE) && + block_unaligned_IO(inode, iocb, iter)) + return true; + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) + return true; + + return false; } #ifdef CONFIG_F2FS_FAULT_INJECTION @@@ -3585,16 -3447,3 +3585,16 @@@ extern void f2fs_build_fault_attr(struc #endif #endif + +static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sbi->sb)) + return true; + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + return true; +#endif + return false; +} diff --combined fs/f2fs/inline.c index cb31a719b048,3e0a630a0168..7b0cff7e6051 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@@ -1,9 -1,11 +1,9 @@@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/inline.c * Copyright (c) 2013, Intel Corporation * Authors: Huajun Li * Haicheng Li - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include @@@ -243,7 -245,7 +243,7 @@@ int f2fs_write_inline_data(struct inod kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@@ -298,7 -300,7 +298,7 @@@ process_inline clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false)) + if (f2fs_truncate_blocks(inode, 0, false, false)) return false; goto process_inline; } @@@ -470,7 -472,7 +470,7 @@@ static int f2fs_add_inline_entries(stru return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false); + f2fs_truncate_blocks(dir, 0, false, false); f2fs_remove_dirty_inode(dir); return err; } diff --combined fs/f2fs/node.c index 2b34206486d8,0bae5eda056a..d338740d0fda --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@@ -1,9 -1,12 +1,9 @@@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/node.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include #include @@@ -101,7 -104,7 +101,7 @@@ bool f2fs_available_free_memory(struct static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@@ -126,8 -129,6 +126,8 @@@ static struct page *get_next_nat_page(s /* get current nat block page with lock */ src_page = get_current_nat_page(sbi, nid); + if (IS_ERR(src_page)) + return src_page; dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); @@@ -1306,9 -1307,7 +1306,7 @@@ void f2fs_ra_node_page(struct f2fs_sb_i if (f2fs_check_nid_range(sbi, nid)) return; - rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); - rcu_read_unlock(); + apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); if (apage) return; @@@ -1541,10 -1540,8 +1539,10 @@@ static int __write_node_page(struct pag } if (__is_valid_data_blkaddr(ni.blk_addr) && - !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) { + up_read(&sbi->node_write); goto redirty_out; + } if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; @@@ -1565,7 -1562,8 +1563,7 @@@ up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, - page->index, NODE); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); submitted = NULL; } @@@ -1587,10 -1585,8 +1585,10 @@@ redirty_out return AOP_WRITEPAGE_ACTIVATE; } -void f2fs_move_node_page(struct page *node_page, int gc_type) +int f2fs_move_node_page(struct page *node_page, int gc_type) { + int err = 0; + if (gc_type == FG_GC) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@@ -1602,16 -1598,12 +1600,16 @@@ f2fs_wait_on_page_writeback(node_page, NODE, true); f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) + if (!clear_page_dirty_for_io(node_page)) { + err = -EAGAIN; goto out_page; + } if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO, NULL)) + &wbc, false, FS_GC_NODE_IO, NULL)) { + err = -EAGAIN; unlock_page(node_page); + } goto release_page; } else { /* set page dirty and write it */ @@@ -1622,7 -1614,6 +1620,7 @@@ out_page unlock_page(node_page); release_page: f2fs_put_page(node_page, 0); + return err; } static int f2fs_write_node_page(struct page *page, @@@ -1637,13 -1628,13 +1635,13 @@@ int f2fs_fsync_node_pages(struct f2fs_s unsigned int *seq_id) { pgoff_t index; - pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_pages; + int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@@ -1721,7 -1712,7 +1719,7 @@@ continue_unlock f2fs_put_page(last_page, 0); break; } else if (submitted) { - last_idx = page->index; + nwritten++; } if (page == last_page) { @@@ -1747,8 -1738,8 +1745,8 @@@ goto retry; } out: - if (last_idx != ULONG_MAX) - f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); + if (nwritten) + f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); return ret ? 
-EIO: 0; } @@@ -2275,19 -2266,15 +2273,19 @@@ static int __f2fs_build_free_nids(struc nm_i->nat_block_bitmap)) { struct page *page = get_current_nat_page(sbi, nid); - ret = scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + } else { + ret = scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } if (ret) { up_read(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, !mount); f2fs_msg(sbi->sb, KERN_ERR, "NAT is corrupt, run fsck to fix it"); - return -EINVAL; + return ret; } } @@@ -2364,9 -2351,8 +2362,9 @@@ retry spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - f2fs_build_free_nids(sbi, true, false); - goto retry; + if (!f2fs_build_free_nids(sbi, true, false)) + goto retry; + return false; } /* @@@ -2549,7 -2535,7 +2547,7 @@@ retry if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - set_cold_node(page, false); + set_cold_node(ipage, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); @@@ -2572,13 -2558,6 +2570,13 @@@ F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; + + if (f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_crtime_nsec)) { + dst->i_crtime = src->i_crtime; + dst->i_crtime_nsec = src->i_crtime_nsec; + } } new_ni = old_ni; @@@ -2722,7 -2701,7 +2720,7 @@@ static void __update_nat_bits(struct f2 __clear_bit_le(nat_index, nm_i->full_nat_bits); } -static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, +static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@@ -2746,9 -2725,6 +2744,9 @@@ down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); + if (IS_ERR(page)) + return PTR_ERR(page); + nat_blk = page_address(page); f2fs_bug_on(sbi, !nat_blk); } @@@ -2794,13 -2770,12 +2792,13 @@@ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); kmem_cache_free(nat_entry_set_slab, set); } + return 0; } /* * This function is called during the checkpointing process. 
*/ -void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@@ -2810,7 -2785,6 +2808,7 @@@ unsigned int found; nid_t set_idx = 0; LIST_HEAD(sets); + int err = 0; /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ if (enabled_nat_bits(sbi, cpc)) { @@@ -2820,7 -2794,7 +2818,7 @@@ } if (!nm_i->dirty_nat_cnt) - return; + return 0; down_write(&nm_i->nat_tree_lock); @@@ -2843,16 -2817,11 +2841,16 @@@ } /* flush dirty nats in nat entry set */ - list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set, cpc); + list_for_each_entry_safe(set, tmp, &sets, set_list) { + err = __flush_nat_entry_set(sbi, set, cpc); + if (err) + break; + } up_write(&nm_i->nat_tree_lock); /* Allow dirty nats by node block allocation in write_begin */ + + return err; } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) @@@ -2879,8 -2848,10 +2877,8 @@@ struct page *page; page = f2fs_get_meta_page(sbi, nat_bits_addr++); - if (IS_ERR(page)) { - disable_nat_bits(sbi, true); + if (IS_ERR(page)) return PTR_ERR(page); - } memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); diff --combined fs/proc/task_mmu.c index a027473561c6,669abb617321..47c3764c469b --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@@ -521,7 -521,7 +521,7 @@@ static void smaps_pte_entry(pte_t *pte if (!page) return; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) mss->swap += PAGE_SIZE; else put_page(page); @@@ -713,8 -713,6 +713,8 @@@ static void smap_gather_stats(struct vm smaps_walk.private = mss; #ifdef CONFIG_SHMEM + /* In case of smaps_rollup, reset the value from previous vma */ + mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@@ -730,7 -728,7 +730,7 @@@ if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss->swap = shmem_swapped; + mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; diff --combined include/linux/fs.h index 897eae8faee1,e10278e4db66..771341470bce --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -403,24 -403,40 +403,40 @@@ int pagecache_write_end(struct file *, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + /** + * struct address_space - Contents of a cacheable, mappable object. + * @host: Owner, either the inode or the block_device. + * @i_pages: Cached pages. + * @gfp_mask: Memory allocation flags to use for allocating pages. + * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap: Tree of private and shared mappings. + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. + * @nrpages: Number of page entries, protected by the i_pages lock. + * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. + * @writeback_index: Writeback starts here. + * @a_ops: Methods. + * @flags: Error bits and flags (AS_*). + * @wb_err: The most recent error which has occurred. + * @private_lock: For use by the owner of the address_space. + * @private_list: For use by the owner of the address_space. + * @private_data: For use by the owner of the address_space. 
+ */ struct address_space { - struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root i_pages; /* cached pages */ - atomic_t i_mmap_writable;/* count VM_SHARED mappings */ - struct rb_root_cached i_mmap; /* tree of private and shared mappings */ - struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by the i_pages lock */ - unsigned long nrpages; /* number of total pages */ - /* number of shadow or DAX exceptional entries */ + struct inode *host; + struct xarray i_pages; + gfp_t gfp_mask; + atomic_t i_mmap_writable; + struct rb_root_cached i_mmap; + struct rw_semaphore i_mmap_rwsem; + unsigned long nrpages; unsigned long nrexceptional; - pgoff_t writeback_index;/* writeback starts here */ - const struct address_space_operations *a_ops; /* methods */ - unsigned long flags; /* error bits */ - spinlock_t private_lock; /* for use by the address_space */ - gfp_t gfp_mask; /* implicit gfp mask for allocations */ - struct list_head private_list; /* for use by the address_space */ - void *private_data; /* ditto */ + pgoff_t writeback_index; + const struct address_space_operations *a_ops; + unsigned long flags; errseq_t wb_err; + spinlock_t private_lock; + struct list_head private_list; + void *private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but @@@ -467,15 -483,18 +483,18 @@@ struct block_device struct mutex bd_fsfreeze_mutex; } __randomize_layout; + /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ + #define PAGECACHE_TAG_DIRTY XA_MARK_0 + #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 + #define PAGECACHE_TAG_TOWRITE XA_MARK_2 + /* - * Radix-tree tags, for tagging dirty and writeback pages within the pagecache - * radix trees + * Returns true if any of the pages in the mapping are marked with the tag. */ - #define PAGECACHE_TAG_DIRTY 0 - #define PAGECACHE_TAG_WRITEBACK 1 - #define PAGECACHE_TAG_TOWRITE 2 - - int mapping_tagged(struct address_space *mapping, int tag); + static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) + { + return xa_marked(&mapping->i_pages, tag); + } static inline void i_mmap_lock_write(struct address_space *mapping) { @@@ -1828,10 -1847,8 +1847,10 @@@ extern ssize_t vfs_copy_file_range(stru extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, struct inode *inode_out, loff_t pos_out, u64 *len, bool is_dedupe); +extern int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); + struct file *file_out, loff_t pos_out, u64 len); extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same); @@@ -2774,6 -2791,19 +2793,6 @@@ static inline void file_end_write(struc return; __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE); } - -static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len) -{ - int ret; - - file_start_write(file_out); - ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); - file_end_write(file_out); - - return ret; -} /* * get_write_access() gets write permission for a file. 
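The include/linux/fs.h hunk above replaces the radix-tree page cache tags with XArray marks (PAGECACHE_TAG_DIRTY becomes XA_MARK_0 and so on) and reduces mapping_tagged() to a test of xa_marked() on mapping->i_pages. A minimal sketch of the resulting calling pattern, assuming only the post-conversion API shown in this diff; the example_* helpers are hypothetical and are not part of this series:

/* Illustrative only: query and clear a page-cache dirty mark via the XArray API. */
#include <linux/fs.h>
#include <linux/xarray.h>

static bool example_mapping_has_dirty(struct address_space *mapping)
{
	/* mapping_tagged() is now a thin wrapper around xa_marked(). */
	return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY);
}

static void example_clear_dirty_mark(struct address_space *mapping, pgoff_t index)
{
	unsigned long flags;

	/*
	 * Callers still take the i_pages lock; only the tag helper changes,
	 * as in the f2fs_clear_page_cache_dirty_tag() hunk earlier in this diff.
	 */
	xa_lock_irqsave(&mapping->i_pages, flags);
	__xa_clear_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
	xa_unlock_irqrestore(&mapping->i_pages, flags);
}
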
diff --combined include/linux/swap.h index 38195f5c96b1,cb479bf5842e..d8a07a4f171d --- a/include/linux/swap.h +++ b/include/linux/swap.h @@@ -167,14 -167,13 +167,14 @@@ enum SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ - SWP_FILE = (1 << 7), /* set after swap_activate success */ - SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ - SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ - SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ - SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ + SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ + SWP_FS = (1 << 8), /* swap file goes through fs */ + SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@@ -297,20 -296,15 +297,15 @@@ struct vma_swap_readahead /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); -bool workingset_refault(void *shadow); +void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); - /* Do not use directly, use workingset_lookup_update */ - void workingset_update_node(struct radix_tree_node *node); - - /* Returns workingset_update_node() if the mapping has shadow entries. */ - #define workingset_lookup_update(mapping) \ - ({ \ - radix_tree_update_node_t __helper = workingset_update_node; \ - if (dax_mapping(mapping) || shmem_mapping(mapping)) \ - __helper = NULL; \ - __helper; \ - }) + /* Only track the nodes of mappings with shadow entries */ + void workingset_update_node(struct xa_node *node); + #define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) \ + xas_set_update(xas, workingset_update_node); \ + } while (0) /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; @@@ -409,7 -403,7 +404,7 @@@ extern void show_swap_cache_info(void) extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); - extern void __delete_from_swap_cache(struct page *); + extern void __delete_from_swap_cache(struct page *, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); @@@ -563,7 -557,8 +558,8 @@@ static inline int add_to_swap_cache(str return -1; } - static inline void __delete_from_swap_cache(struct page *page) + static inline void __delete_from_swap_cache(struct page *page, + swp_entry_t entry) { } diff --combined kernel/memremap.c index 620fc4d2559a,e842fab9f184..9eced2cc9f94 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@@ -1,47 -1,21 +1,21 @@@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2015 Intel Corporation. All rights reserved. 
*/ - #include #include - #include - #include #include #include - #include #include + #include + #include #include #include + #include #include + #include - static DEFINE_MUTEX(pgmap_lock); - static RADIX_TREE(pgmap_radix, GFP_KERNEL); + static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) - static unsigned long order_at(struct resource *res, unsigned long pgoff) - { - unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; - unsigned long nr_pages, mask; - - nr_pages = PHYS_PFN(resource_size(res)); - if (nr_pages == pgoff) - return ULONG_MAX; - - /* - * What is the largest aligned power-of-2 range available from - * this resource pgoff to the end of the resource range, - * considering the alignment of the current pgoff? - */ - mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); - if (!mask) - return ULONG_MAX; - - return find_first_bit(&mask, BITS_PER_LONG); - } - - #define foreach_order_pgoff(res, order, pgoff) \ - for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ - pgoff += 1UL << order, order = order_at((res), pgoff)) - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@@ -70,18 -44,10 +44,10 @@@ EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ - static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) + static void pgmap_array_delete(struct resource *res) { - unsigned long pgoff, order; - - mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) { - if (pgoff >= end_pgoff) - break; - radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); - } - mutex_unlock(&pgmap_lock); - + xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + NULL, GFP_KERNEL); synchronize_rcu(); } @@@ -142,7 -108,7 +108,7 @@@ static void devm_memremap_pages_release mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res, -1); + pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@@ -175,10 -141,10 +141,9 @@@ void *devm_memremap_pages(struct devic struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; - unsigned long pgoff, order; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@@ -216,20 -182,10 +181,10 @@@ pgmap->dev = dev; - mutex_lock(&pgmap_lock); - error = 0; - - foreach_order_pgoff(res, order, pgoff) { - error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, pgmap); - if (error) { - dev_err(dev, "%s: failed: %d\n", __func__, error); - break; - } - } - mutex_unlock(&pgmap_lock); + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), + PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) - goto err_radix; + goto err_array; nid = dev_to_node(dev); if (nid < 0) @@@ -256,14 -212,19 +211,14 @@@ if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. 
- */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); @@@ -274,8 -235,8 +229,8 @@@ err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: - err_radix: - pgmap_radix_release(res, pgoff); + pgmap_array_delete(res); + err_array: return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@@ -315,7 -276,7 +270,7 @@@ struct dev_pagemap *get_dev_pagemap(uns /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); diff --combined lib/Kconfig index d82f20609939,a9965f4af4dd..d1573a16aa92 --- a/lib/Kconfig +++ b/lib/Kconfig @@@ -399,8 -399,11 +399,11 @@@ config INTERVAL_TRE for more information. - config RADIX_TREE_MULTIORDER + config XARRAY_MULTI bool + help + Support entries which occupy multiple consecutive indices in the + XArray. config ASSOCIATIVE_ARRAY bool @@@ -621,6 -624,3 +624,6 @@@ config GENERIC_LIB_CMPDI config GENERIC_LIB_UCMPDI2 bool + +config GENERIC_LIB_UMODDI3 + bool diff --combined lib/Kconfig.debug index 04adfc3b185e,091155e12422..e0ba05e6f6bd --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@@ -1179,7 -1179,7 +1179,7 @@@ config LOCKDE bool depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !X86 + select FRAME_POINTER if !MIPS && !PPC && !ARM && !S390 && !MICROBLAZE && !ARC && !X86 select KALLSYMS select KALLSYMS_ALL @@@ -1590,7 -1590,7 +1590,7 @@@ config FAULT_INJECTION_STACKTRACE_FILTE depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86 + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM && !ARC && !X86 help Provide stacktrace filter for fault-injection capabilities @@@ -1599,7 -1599,7 +1599,7 @@@ config LATENCYTO depends on DEBUG_KERNEL depends on STACKTRACE_SUPPORT depends on PROC_FS - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86 + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM && !ARC && !X86 select KALLSYMS select KALLSYMS_ALL select STACKTRACE @@@ -1813,6 -1813,9 +1813,9 @@@ config TEST_BITFIEL config TEST_UUID tristate "Test functions located in the uuid module at runtime" + config TEST_XARRAY + tristate "Test the XArray code at runtime" + config TEST_OVERFLOW tristate "Test check_*_overflow() functions at runtime" @@@ -1965,14 -1968,6 +1968,14 @@@ config TEST_DEBUG_VIRTUA If unsure, say N. +config TEST_MEMCAT_P + tristate "Test memcat_p() helper function" + help + Test the memcat_p() helper for correctly merging two + pointer arrays together. + + If unsure, say N. 
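/*
 * Illustrative sketch, not part of this commit: what the XARRAY_MULTI
 * option above enables. One entry can span a run of consecutive indices,
 * which is how the memremap conversion earlier in this diff maps every
 * PFN of a resource to its dev_pagemap with a single xa_store_range().
 * "demo_ranges", demo_claim() and demo_owner() are hypothetical; the
 * xa_store_range()/xa_err()/xa_load() calls are the documented API.
 */
#include <linux/gfp.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_ranges);

static int demo_claim(unsigned long first_pfn, unsigned long last_pfn,
		      void *owner)
{
	/* One store covers [first_pfn, last_pfn]; needs CONFIG_XARRAY_MULTI. */
	return xa_err(xa_store_range(&demo_ranges, first_pfn, last_pfn,
				     owner, GFP_KERNEL));
}

static void *demo_owner(unsigned long pfn)
{
	/* Any index inside the stored range returns the same entry. */
	return xa_load(&demo_ranges, pfn);
}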
+ endif # RUNTIME_TESTING_MENU config MEMTEST diff --combined lib/Makefile index fa3eb1b4c0e3,e6f809556af5..3d341f59f756 --- a/lib/Makefile +++ b/lib/Makefile @@@ -18,13 -18,13 +18,13 @@@ KCOV_INSTRUMENT_debugobjects.o := KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o timerqueue.o\ + rbtree.o radix-tree.o timerqueue.o xarray.o \ idr.o int_sqrt.o extable.o \ sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o nodemask.o win_minmax.o + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_MMU) += ioremap.o @@@ -68,10 -68,10 +68,11 @@@ obj-$(CONFIG_TEST_PRINTF) += test_print obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o obj-$(CONFIG_TEST_UUID) += test_uuid.o + obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o +obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@@ -120,6 -120,7 +121,6 @@@ obj-$(CONFIG_ZLIB_INFLATE) += zlib_infl obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o -CFLAGS_bch.o := $(call cc-option,-Wframe-larger-than=4500) obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ obj-$(CONFIG_LZ4_COMPRESS) += lz4/ @@@ -271,4 -272,3 +272,4 @@@ obj-$(CONFIG_GENERIC_LIB_LSHRDI3) += ls obj-$(CONFIG_GENERIC_LIB_MULDI3) += muldi3.o obj-$(CONFIG_GENERIC_LIB_CMPDI2) += cmpdi2.o obj-$(CONFIG_GENERIC_LIB_UCMPDI2) += ucmpdi2.o +obj-$(CONFIG_GENERIC_LIB_UMODDI3) += umoddi3.o udivmoddi4.o diff --combined mm/filemap.c index 3968da1f7f5a,6b36516bc31d..218d0b2ec82d --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -36,8 -36,6 +36,8 @@@ #include #include #include +#include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@@ -113,60 -111,26 +113,26 @@@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ - static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) - { - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->i_pages, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (shadowp) - *shadowp = p; - } - __radix_tree_replace(&mapping->i_pages, node, slot, page, - workingset_lookup_update(mapping)); - mapping->nrpages++; - return 0; - } - - static void page_cache_tree_delete(struct address_space *mapping, + static void page_cache_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + XA_STATE(xas, &mapping->i_pages, page->index); + unsigned int nr = 1; + + mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by one entry in the radix tree */ - nr = PageHuge(page) ? 
1 : hpage_nr_pages(page); + /* hugetlb pages are represented by a single entry in the xarray */ + if (!PageHuge(page)) { + xas_set_order(&xas, page->index, compound_order(page)); + nr = 1U << compound_order(page); + } VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->i_pages, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->i_pages, node, slot); - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, - workingset_lookup_update(mapping)); - } + xas_store(&xas, shadow); + xas_init_marks(&xas); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@@ -265,7 -229,7 +231,7 @@@ void __delete_from_page_cache(struct pa trace_mm_filemap_delete_from_page_cache(page); unaccount_page_cache_page(mapping, page); - page_cache_tree_delete(mapping, page, shadow); + page_cache_delete(mapping, page, shadow); } static void page_cache_free_page(struct address_space *mapping, @@@ -308,7 -272,7 +274,7 @@@ void delete_from_page_cache(struct pag EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_tree_delete_batch - delete several pages from page cache + * page_cache_delete_batch - delete several pages from page cache * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * @@@ -321,24 -285,19 +287,19 @@@ * * The function expects the i_pages lock to be held. */ - static void - page_cache_tree_delete_batch(struct address_space *mapping, + static void page_cache_delete_batch(struct address_space *mapping, struct pagevec *pvec) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; int i = 0, tail_pages = 0; struct page *page; - pgoff_t start; - start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + mapping_set_update(&xas, mapping); + xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec) && !tail_pages) break; - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!tail_pages) { /* @@@ -346,8 -305,11 +307,11 @@@ * have our pages locked so they are protected from * being removed. 
*/ - if (page != pvec->pages[i]) + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); continue; + } WARN_ON_ONCE(!PageLocked(page)); if (PageTransHuge(page) && !PageHuge(page)) tail_pages = HPAGE_PMD_NR - 1; @@@ -358,11 -320,11 +322,11 @@@ */ i++; } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); tail_pages--; } - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, - workingset_lookup_update(mapping)); + xas_store(&xas, NULL); total_pages++; } mapping->nrpages -= total_pages; @@@ -383,7 -345,7 +347,7 @@@ void delete_from_page_cache_batch(struc unaccount_page_cache_page(mapping, pvec->pages[i]); } - page_cache_tree_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, pvec); xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) @@@ -493,20 -455,31 +457,31 @@@ EXPORT_SYMBOL(filemap_flush) bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_SHIFT; - pgoff_t end = end_byte >> PAGE_SHIFT; struct page *page; + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; - if (mapping->nrpages == 0) - return false; + rcu_read_lock(); + for (;;) { + page = xas_find(&xas, max); + if (xas_retry(&xas, page)) + continue; + /* Shadow entries don't count */ + if (xa_is_value(page)) + continue; + /* + * We don't need to try to pin this page; we're about to + * release the RCU lock anyway. It is enough to know that + * there was a page here recently. + */ + break; + } + rcu_read_unlock(); - if (!find_get_pages_range(mapping, &index, end, 1, &page)) - return false; - put_page(page); - return true; + return page != NULL; } EXPORT_SYMBOL(filemap_range_has_page); @@@ -777,51 -750,44 +752,44 @@@ EXPORT_SYMBOL(file_write_and_wait_range * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + XA_STATE(xas, &mapping->i_pages, offset); + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; - - get_page(new); - new->mapping = mapping; - new->index = offset; + get_page(new); + new->mapping = mapping; + new->index = offset; - xa_lock_irqsave(&mapping->i_pages, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); + xas_lock_irqsave(&xas, flags); + xas_store(&xas, new); - /* - * hugetlb pages do not participate in page cache accounting. 
- */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - xa_unlock_irqrestore(&mapping->i_pages, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(old)) + __dec_node_page_state(new, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(new, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + xas_unlock_irqrestore(&xas, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@@ -830,12 -796,15 +798,15 @@@ static int __add_to_page_cache_locked(s pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); + mapping_set_update(&xas, mapping); if (!huge) { error = mem_cgroup_try_charge(page, current->mm, @@@ -844,39 -813,47 +815,47 @@@ return error; } - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; - xa_lock_irq(&mapping->i_pages); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + do { + xas_lock_irq(&xas); + old = xas_load(&xas); + if (old && !xa_is_value(old)) + xas_set_err(&xas, -EEXIST); + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; + + if (xa_is_value(old)) { + mapping->nrexceptional--; + if (shadowp) + *shadowp = old; + } + mapping->nrpages++; + + /* hugetlb pages do not participate in page cache accounting */ + if (!huge) + __inc_node_page_state(page, NR_FILE_PAGES); + unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + + if (xas_error(&xas)) + goto error; - /* hugetlb pages do not participate in page cache accounting. */ - if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; - err_insert: + error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return error; + return xas_error(&xas); } /** @@@ -917,9 -894,12 +896,9 @@@ int add_to_page_cache_lru(struct page * * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. 
*/ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; @@@ -1075,18 -1055,8 +1054,18 @@@ static inline int wait_on_page_bit_comm { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + unsigned long pflags; int ret = 0; + if (bit_nr == PG_locked && + !PageUptodate(page) && PageWorkingset(page)) { + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@@ -1125,12 -1095,6 +1104,12 @@@ finish_wait(q, wait); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), @@@ -1341,86 -1305,76 +1320,76 @@@ int __lock_page_or_retry(struct page *p } /** - * page_cache_next_hole - find the next hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. - * - * page_cache_next_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. 
*/ - pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) break; - index++; - if (index == 0) + if (xas.xa_index == 0) break; } - return index; + return xas.xa_index; } - EXPORT_SYMBOL(page_cache_next_hole); + EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_hole - find the prev hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. - * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * page_cache_prev_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. 
*/ - pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) break; - index--; - if (index == ULONG_MAX) + if (xas.xa_index == ULONG_MAX) break; } - return index; + return xas.xa_index; } - EXPORT_SYMBOL(page_cache_prev_hole); + EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry @@@ -1437,47 -1391,40 +1406,40 @@@ */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; + XA_STATE(xas, &mapping->i_pages, offset); struct page *head, *page; rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + xas_reset(&xas); + page = xas_load(&xas); + if (xas_retry(&xas, page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, or a swap entry from + * shmem/tmpfs. Return it without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(head); - goto repeat; - } + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != xas_reload(&xas))) { + put_page(head); + goto repeat; } out: rcu_read_unlock(); @@@ -1508,7 -1455,7 +1470,7 @@@ struct page *find_lock_entry(struct add repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? 
*/ if (unlikely(page_mapping(page) != mapping)) { @@@ -1554,7 -1501,7 +1516,7 @@@ struct page *pagecache_get_page(struct repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) page = NULL; if (!page) goto no_page; @@@ -1640,53 -1587,48 +1602,48 @@@ unsigned find_get_entries(struct addres pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - struct page *head, *page; - repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; + put_page: + put_page(head); + retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@@ -1717,64 -1659,50 +1674,50 @@@ unsigned find_get_pages_range(struct ad pgoff_t end, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *start); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { - struct page *head, *page; - - if (iter.index > end) - break; - repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, end) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? 
*/ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *start = pages[ret - 1]->index + 1; + *start = page->index + 1; goto out; } + continue; + put_page: + put_page(head); + retry: + xas_reset(&xas); } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is page at index -1 but that is + * breaks the iteration when there is a page at index -1 but that is * already broken anyway. */ if (end == (pgoff_t)-1) @@@ -1802,57 -1730,43 +1745,43 @@@ out unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); + struct page *page; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { - struct page *head, *page; - repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; + if (xas_retry(&xas, page)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. + */ + if (xa_is_value(page)) break; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; /* * must check mapping and index after taking the ref. * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { + if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { put_page(page); break; } @@@ -1860,6 -1774,11 +1789,11 @@@ pages[ret] = page; if (++ret == nr_pages) break; + continue; + put_page: + put_page(head); + retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@@ -1879,74 -1798,58 +1813,58 @@@ EXPORT_SYMBOL(find_get_pages_contig) * @tag. We update @index to index the next page for the traversal. 
*/ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *index); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { - struct page *head, *page; - - if (iter.index > end) - break; - repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, end, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. - */ + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *index = pages[ret - 1]->index + 1; + *index = page->index + 1; goto out; } + continue; + put_page: + put_page(head); + retry: + xas_reset(&xas); } /* - * We come here when we got at @end. We take care to not overflow the + * We come here when we got to @end. We take care to not overflow the * index @index as it confuses some of the callers. This breaks the - * iteration when there is page at index -1 but that is already broken - * anyway. + * iteration when there is a page at index -1 but that is already + * broken anyway. */ if (end == (pgoff_t)-1) *index = (pgoff_t)-1; @@@ -1972,57 -1875,51 +1890,51 @@@ EXPORT_SYMBOL(find_get_pages_range_tag) * @tag. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { - struct page *head, *page; - repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, ULONG_MAX, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. 
Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; + put_page: + put_page(head); + retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@@ -2596,7 -2493,9 +2508,7 @@@ no_cached_page * system is low on memory, or a problem occurs while trying * to schedule I/O. */ - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); page_not_uptodate: /* @@@ -2626,45 -2525,31 +2538,31 @@@ EXPORT_SYMBOL(filemap_fault) void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; + XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { - if (iter.index > end_pgoff) - break; - repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } + xas_for_each(&xas, page, end_pgoff) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) goto next; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto next; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto skip; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto skip; if (!PageUptodate(page) || PageReadahead(page) || @@@ -2683,10 -2568,10 +2581,10 @@@ if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@@ -2699,8 -2584,6 +2597,6 @@@ next /* Huge page is mapped? No need to proceed. 
*/ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } @@@ -2761,9 -2644,9 +2657,9 @@@ int generic_file_readonly_mmap(struct f return generic_file_mmap(file, vma); } #else -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { - return -ENOSYS; + return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { @@@ -2810,7 -2693,7 +2706,7 @@@ repeat put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } @@@ -3025,7 -2908,7 +2921,7 @@@ generic_file_direct_write(struct kiocb if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ if (filemap_range_has_page(inode->i_mapping, pos, - pos + iov_iter_count(from))) + pos + write_len)) return -EAGAIN; } else { written = filemap_write_and_wait_range(mapping, pos, diff --combined mm/huge_memory.c index 25ef59b7ee34,9eb79c384616..4e4ef8fa479d --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@@ -852,10 -852,11 +852,10 @@@ static void touch_pmd(struct vm_area_st } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@@ -885,11 -886,12 +885,11 @@@ return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@@ -998,10 -1000,11 +998,10 @@@ static void touch_pud(struct vm_area_st } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); @@@ -1025,11 -1028,12 +1025,11 @@@ return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@@ -1558,20 -1562,8 +1558,20 @@@ vm_fault_t do_huge_pmd_numa_page(struc * We are not sure a pending tlb flush here is for a huge page * mapping or not. Hence use the tlb range variant */ - if (mm_tlb_flush_pending(vma->vm_mm)) + if (mm_tlb_flush_pending(vma->vm_mm)) { flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* + * change_huge_pmd() released the pmd lock before + * invalidating the secondary MMUs sharing the primary + * MMU pagetables (with ->invalidate_range()). The + * mmu_notifier_invalidate_range_end() (which + * internally calls ->invalidate_range()) in + * change_pmd_range() will run after us, so we can't + * rely on it here and we need an explicit invalidate. 
+ */ + mmu_notifier_invalidate_range(vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + } /* * Migrate the THP to the requested node, returns with page unlocked @@@ -1788,7 -1780,7 +1788,7 @@@ static pmd_t move_soft_dirty_pmd(pmd_t bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) + pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; @@@ -1819,7 -1811,7 +1819,7 @@@ if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); - if (pmd_present(pmd) && pmd_dirty(pmd)) + if (pmd_present(pmd)) force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); @@@ -1830,10 -1822,12 +1830,10 @@@ } pmd = move_soft_dirty_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); - if (new_ptl != old_ptl) - spin_unlock(new_ptl); if (force_flush) flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); - else - *need_flush = true; + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } @@@ -2377,7 -2371,6 +2377,7 @@@ static void __split_huge_page_tail(stru (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); @@@ -2450,13 -2443,13 +2450,13 @@@ static void __split_huge_page(struct pa ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - /* Additional pin to radix tree of swap cache */ + /* Additional pin to swap cache */ if (PageSwapCache(head)) page_ref_add(head, 2); else page_ref_inc(head); } else { - /* Additional pin to radix tree */ + /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } @@@ -2568,7 -2561,7 +2568,7 @@@ bool can_split_huge_page(struct page *p { int extra_pins; - /* Additional pins from radix tree */ + /* Additional pins from page cache */ if (PageAnon(page)) extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; else @@@ -2664,17 -2657,14 +2664,14 @@@ int split_huge_page_to_list(struct pag spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); if (mapping) { - void **pslot; + XA_STATE(xas, &mapping->i_pages, page_index(head)); - xa_lock(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(head)); /* - * Check if the head page is present in radix tree. + * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. 
*/ - if (radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != head) + xa_lock(&mapping->i_pages); + if (xas_load(&xas) != head) goto fail; } @@@ -2892,6 -2882,9 +2889,6 @@@ void set_pmd_migration_entry(struct pag if (!(pvmw->pmd && !pvmw->pte)) return; - mmu_notifier_invalidate_range_start(mm, address, - address + HPAGE_PMD_SIZE); - flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = *pvmw->pmd; pmdp_invalidate(vma, address, pvmw->pmd); @@@ -2904,6 -2897,9 +2901,6 @@@ set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, true); put_page(page); - - mmu_notifier_invalidate_range_end(mm, address, - address + HPAGE_PMD_SIZE); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@@ -2932,7 -2928,7 +2929,7 @@@ else page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } diff --combined mm/madvise.c index 71d21df2a3f3,9d802566c494..6cb1ca93e290 --- a/mm/madvise.c +++ b/mm/madvise.c @@@ -96,7 -96,7 +96,7 @@@ static long madvise_behavior(struct vm_ new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (new_flags & VM_SPECIAL) { + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { error = -EINVAL; goto out; } @@@ -251,7 -251,7 +251,7 @@@ static void force_shm_swapin_readahead( index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = find_get_entry(mapping, index); - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { if (page) put_page(page); continue; diff --combined mm/memcontrol.c index 10a9b554d69f,29d9d1a69b36..54920cbc46bf --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -1669,8 -1669,6 +1669,8 @@@ static enum oom_status mem_cgroup_oom(s if (order > PAGE_ALLOC_COSTLY_ORDER) return OOM_SKIPPED; + memcg_memory_event(memcg, MEMCG_OOM); + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack @@@ -2252,6 -2250,8 +2252,6 @@@ retry if (fatal_signal_pending(current)) goto force; - memcg_memory_event(mem_over_limit, MEMCG_OOM); - /* * keep retrying as long as the memcg oom killer is able to make * a forward progress or bypass the charge if the oom killer @@@ -2460,7 -2460,7 +2460,7 @@@ static void memcg_kmem_cache_create_fun /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct memcg_kmem_cache_create_work *cw; @@@ -2478,6 -2478,25 +2478,6 @@@ queue_work(memcg_kmem_cache_wq, &cw->work); } -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - /* - * We need to stop accounting when we kmalloc, because if the - * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_kmem_cache_create will recurse. - * - * However, it is better to enclose the whole function. Depending on - * the debugging options enabled, INIT_WORK(), for instance, can - * trigger an allocation. This too, will make us recurse. Because at - * this point we can't allow ourselves back into memcg_kmem_get_cache, - * the safest choice is to do it like this, wrapping the whole function. 
- */ - current->memcg_kmem_skip_account = 1; - __memcg_schedule_kmem_cache_create(memcg, cachep); - current->memcg_kmem_skip_account = 0; -} - static inline bool memcg_kmem_bypass(void) { if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) @@@ -2512,6 -2531,9 +2512,6 @@@ struct kmem_cache *memcg_kmem_get_cache if (memcg_kmem_bypass()) return cachep; - if (current->memcg_kmem_skip_account) - return cachep; - memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) @@@ -4299,12 -4321,14 +4299,12 @@@ static void mem_cgroup_id_remove(struc static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); - atomic_add(n, &memcg->id.ref); + refcount_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) < n); - if (atomic_sub_and_test(n, &memcg->id.ref)) { + if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ @@@ -4521,7 -4545,7 +4521,7 @@@ static int mem_cgroup_css_online(struc } /* Online state pins memcg ID, memcg ID pins CSS */ - atomic_set(&memcg->id.ref, 1); + refcount_set(&memcg->id.ref, 1); css_get(css); return 0; } @@@ -4549,8 -4573,6 +4549,8 @@@ static void mem_cgroup_css_offline(stru memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + drain_all_stock(memcg); + mem_cgroup_id_put(memcg); } @@@ -4728,7 -4750,7 +4728,7 @@@ static struct page *mc_handle_file_pte( /* shmem/tmpfs may report page out on swap: account for that too. */ if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; @@@ -5573,13 -5595,6 +5573,13 @@@ static int memory_stat_show(struct seq_ seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "workingset_refault %lu\n", + acc.stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + acc.stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + acc.stat[WORKINGSET_NODERECLAIM]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + acc.events[PGSCAN_DIRECT]); @@@ -5590,6 -5605,13 +5590,6 @@@ seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); - seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); - seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); - seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - return 0; } @@@ -6355,7 -6377,7 +6355,7 @@@ subsys_initcall(mem_cgroup_init) #ifdef CONFIG_MEMCG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { - while (!atomic_inc_not_zero(&memcg->id.ref)) { + while (!refcount_inc_not_zero(&memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. 
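/*
 * Illustrative sketch, not part of this commit: the lookup pattern the
 * page-cache and memcg changes above rely on. Shadow and swap entries
 * are stored as value entries (xa_mk_value()), so an iteration can tell
 * them apart from real page pointers with xa_is_value(). "demo_pages"
 * and demo_count_present() are hypothetical; the XA_STATE/xas_for_each/
 * xas_retry/xa_is_value/xa_to_value calls are the documented API.
 */
#include <linux/rcupdate.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_pages);

/* Count non-value entries up to @max, skipping shadow-style value entries. */
static unsigned long demo_count_present(unsigned long max)
{
	XA_STATE(xas, &demo_pages, 0);
	unsigned long nr = 0;
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, max) {
		if (xas_retry(&xas, entry))	/* transient retry entry */
			continue;
		if (xa_is_value(entry)) {
			/* e.g. a swap entry: the integer payload is recoverable */
			unsigned long cookie = xa_to_value(entry);

			(void)cookie;
			continue;
		}
		nr++;
	}
	rcu_read_unlock();

	return nr;
}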
diff --combined mm/migrate.c index b6700f2962f3,b3cde3fd094a..f7e4bfdc13b7 --- a/mm/migrate.c +++ b/mm/migrate.c @@@ -275,9 -275,6 +275,9 @@@ static bool remove_migration_pte(struc if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); + if (PageTransHuge(page) && PageMlocked(page)) + clear_page_mlock(page); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } @@@ -326,7 -323,7 +326,7 @@@ void __migration_entry_wait(struct mm_s page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count + * Once page cache replacement of page migration started, page_count * *must* be zero. And, we don't want to call wait_on_page_locked() * against a page without get_page(). * So, we use get_page_unless_zero(), here. Even failed, page fault @@@ -441,10 -438,10 +441,10 @@@ int migrate_page_move_mapping(struct ad struct buffer_head *head, enum migrate_mode mode, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; int expected_count = 1 + extra_count; - void **pslot; /* * Device public or private pages have an extra refcount as they are @@@ -470,21 -467,16 +470,16 @@@ oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); + xas_lock_irq(&xas); expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@@ -498,7 -490,7 +493,7 @@@ if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@@ -526,16 -518,13 +521,13 @@@ SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + xas_next(&xas); + xas_store(&xas, newpage + i); } } @@@ -546,7 -535,7 +538,7 @@@ */ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@@ -586,22 -575,18 +578,18 @@@ EXPORT_SYMBOL(migrate_page_move_mapping int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, 
expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@@ -610,11 -595,11 +598,11 @@@ get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } @@@ -685,8 -670,6 +673,8 @@@ void migrate_page_states(struct page *n SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@@ -1416,7 -1399,7 +1404,7 @@@ retry * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page)) { + if (PageTransHuge(page) && !PageHuge(page)) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); @@@ -1860,6 -1843,46 +1848,6 @@@ static struct page *alloc_misplaced_dst return newpage; } -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if the node is migrate rate-limited after the update */ -static bool numamigrate_update_ratelimit(pg_data_t *pgdat, - unsigned long nr_pages) -{ - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { - spin_lock(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies + - msecs_to_jiffies(migrate_interval_millisecs); - spin_unlock(&pgdat->numabalancing_migrate_lock); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, - nr_pages); - return true; - } - - /* - * This is an unlocked non-atomic update so errors are possible. - * The consequences are failing to migrate when we potentiall should - * have which is not severe enough to warrant locking. If it is ever - * a problem, it can be converted to a per-cpu counter. - */ - pgdat->numabalancing_migrate_nr_pages += nr_pages; - return false; -} - static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; @@@ -1932,6 -1955,14 +1920,6 @@@ int migrate_misplaced_page(struct page if (page_is_file_cache(page) && PageDirty(page)) goto out; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, 1)) - goto out; - isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; @@@ -1975,7 -2006,16 +1963,7 @@@ int migrate_misplaced_transhuge_page(st int isolated = 0; struct page *new_page = NULL; int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - - /* - * Rate-limit the amount of data that is being migrated to a node. 
- * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; + unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@@ -1998,15 -2038,15 +1986,15 @@@ /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@@ -2030,26 -2070,16 +2018,26 @@@ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* - * Clear the old entry under pagetable lock and establish the new PTE. - * Any parallel GUP will either observe the old page blocking on the - * page lock, block on the page table lock or observe the new page. - * The SetPageUptodate on the new page and page_add_new_anon_rmap - * guarantee the copy is visible before the pagetable update. + * Overwrite the old entry under pagetable lock and establish + * the new PTE. Any parallel GUP will either observe the old + * page blocking on the page lock, block on the page table + * lock or observe the new page. The SetPageUptodate on the + * new page and page_add_new_anon_rmap guarantee the copy is + * visible before the pagetable update. + */ + page_add_anon_rmap(new_page, vma, start, true); + /* + * At this point the pmd is numa/protnone (i.e. non present) and the TLB + * has already been flushed globally. So no TLB can be currently + * caching this non present pmd mapping. There's no need to clear the + * pmd before doing set_pmd_at(), nor to flush the TLB after + * set_pmd_at(). Clearing the pmd here would introduce a race + * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the + * mmap_sem for reading. If the pmd is set to NULL at any given time, + * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this + * pmd. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); - pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@@ -2058,6 -2088,11 +2046,6 @@@ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. 
*/ get_page(new_page); @@@ -2078,10 -2113,11 +2066,10 @@@ out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); -out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); diff --combined mm/page-writeback.c index 439a304a6c92,fc6e5743b0bf..3f690bae6b78 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@@ -2097,34 -2097,25 +2097,25 @@@ void __init page_writeback_init(void * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ - /* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock - * latency. - */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { - #define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, - PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->i_pages, &iter, - PAGECACHE_TAG_TOWRITE); - tagged++; - if ((tagged % WRITEBACK_TAG_BATCH) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) continue; - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); @@@ -2149,13 -2140,6 +2140,13 @@@ * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. 
*/ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@@ -2169,24 -2153,31 +2160,24 @@@ pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@@ -2272,14 -2263,17 +2263,14 @@@ continue_unlock pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * back the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; @@@ -2442,7 -2436,7 +2433,7 @@@ void account_page_cleaned(struct page * /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * the xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" @@@ -2468,7 -2462,7 +2459,7 @@@ int __set_page_dirty_nobuffers(struct p BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, page_index(page), + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); @@@ -2631,13 -2625,13 +2622,13 @@@ EXPORT_SYMBOL(__cancel_dirty_page) * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. 
*/ int clear_page_dirty_for_io(struct page *page) @@@ -2718,7 -2712,7 +2709,7 @@@ int test_clear_page_writeback(struct pa xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@@ -2758,11 -2752,13 +2749,13 @@@ int __test_set_page_writeback(struct pa lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - xa_lock_irqsave(&mapping->i_pages, flags); + xas_lock_irqsave(&xas, flags); + xas_load(&xas); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@@ -2770,8 -2766,7 +2763,7 @@@ on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@@ -2784,12 -2779,10 +2776,10 @@@ sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_TOWRITE); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { ret = TestSetPageWriteback(page); } @@@ -2803,16 -2796,6 +2793,6 @@@ } EXPORT_SYMBOL(__test_set_page_writeback); - /* - * Return true if any of the pages in the mapping are marked with the - * passed tag. - */ - int mapping_tagged(struct address_space *mapping, int tag) - { - return radix_tree_tagged(&mapping->i_pages, tag); - } - EXPORT_SYMBOL(mapping_tagged); - /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. 
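The mm/page-writeback.c conversion above is a good example of the new iteration style: radix_tree_for_each_tagged() with a hand-rolled WRITEBACK_TAG_BATCH counter becomes xas_for_each_marked() with xas_pause(), and radix-tree tag set/clear calls become XArray mark operations. A minimal sketch of that loop shape, using a hypothetical walk_marked() helper over an arbitrary xarray rather than the page cache itself:

#include <linux/xarray.h>
#include <linux/sched.h>

/*
 * Set XA_MARK_1 on every entry in [start, end] that carries XA_MARK_0,
 * dropping the lock every XA_CHECK_SCHED entries to stay preemptible.
 */
static void walk_marked(struct xarray *xa, unsigned long start, unsigned long end)
{
        XA_STATE(xas, xa, start);
        unsigned int tagged = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, entry, end, XA_MARK_0) {
                xas_set_mark(&xas, XA_MARK_1);
                if (++tagged % XA_CHECK_SCHED)
                        continue;

                xas_pause(&xas);                /* remember the position across the unlock */
                xas_unlock_irq(&xas);
                cond_resched();
                xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);
}

Unlike the radix-tree version, no slot pointer has to be revalidated after re-taking the lock; xas_pause() arranges for the next iteration step to resume the walk safely from the saved index.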
diff --combined mm/swap.c index 87a54c8dee34,6861f3140a13..aa483719922e --- a/mm/swap.c +++ b/mm/swap.c @@@ -29,6 -29,7 +29,6 @@@ #include #include #include -#include #include #include #include @@@ -964,7 -965,7 +964,7 @@@ void pagevec_remove_exceptionals(struc for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) + if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; @@@ -1001,7 -1002,7 +1001,7 @@@ EXPORT_SYMBOL(pagevec_lookup_range) unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag) + xa_mark_t tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, PAGEVEC_SIZE, pvec->pages); @@@ -1011,7 -1012,7 +1011,7 @@@ EXPORT_SYMBOL(pagevec_lookup_range_tag) unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages) + xa_mark_t tag, unsigned max_pages) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); diff --combined mm/swap_state.c index 0d6a7f268d2e,31c45a25b2d3..fd2f21e1c60a --- a/mm/swap_state.c +++ b/mm/swap_state.c @@@ -107,14 -107,15 +107,15 @@@ void show_swap_cache_info(void } /* - * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ - int __add_to_swap_cache(struct page *page, swp_entry_t entry) + int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) { - int error, i, nr = hpage_nr_pages(page); - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); + unsigned long i, nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@@ -123,73 -124,52 +124,52 @@@ page_ref_add(page, nr); SetPageSwapCache(page); - address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - for (i = 0; i < nr; i++) { - set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->i_pages, - idx + i, page + i); - if (unlikely(error)) - break; - } - if (likely(!error)) { + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < nr; i++) { + VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + set_page_private(page + i, entry.val + i); + xas_store(&xas, page + i); + xas_next(&xas); + } address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); ADD_CACHE_INFO(add_total, nr); - } else { - /* - * Only the context which have set SWAP_HAS_CACHE flag - * would call add_to_swap_cache(). - * So add_to_swap_cache() doesn't returns -EEXIST. 
- */ - VM_BUG_ON(error == -EEXIST); - set_page_private(page + i, 0UL); - while (i--) { - radix_tree_delete(&address_space->i_pages, idx + i); - set_page_private(page + i, 0UL); - } - ClearPageSwapCache(page); - page_ref_sub(page, nr); - } - xa_unlock_irq(&address_space->i_pages); + unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); - return error; - } - - - int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) - { - int error; + if (!xas_error(&xas)) + return 0; - error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); - if (!error) { - error = __add_to_swap_cache(page, entry); - radix_tree_preload_end(); - } - return error; + ClearPageSwapCache(page); + page_ref_sub(page, nr); + return xas_error(&xas); } /* * This must be called only on pages that have * been verified to be in the swap cache. */ - void __delete_from_swap_cache(struct page *page) + void __delete_from_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); int i, nr = hpage_nr_pages(page); - swp_entry_t entry; - pgoff_t idx; + pgoff_t idx = swp_offset(entry); + XA_STATE(xas, &address_space->i_pages, idx); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(PageWriteback(page), page); - entry.val = page_private(page); - address_space = swap_address_space(entry); - idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->i_pages, idx + i); + void *entry = xas_store(&xas, NULL); + VM_BUG_ON_PAGE(entry != page + i, entry); set_page_private(page + i, 0); + xas_next(&xas); } ClearPageSwapCache(page); address_space->nrpages -= nr; @@@ -217,7 -197,7 +197,7 @@@ int add_to_swap(struct page *page return 0; /* - * Radix-tree node allocations from PF_MEMALLOC contexts could + * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * @@@ -229,7 -209,6 +209,6 @@@ */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - /* -ENOMEM radix-tree allocation failure */ if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@@ -263,14 -242,11 +242,11 @@@ fail */ void delete_from_swap_cache(struct page *page) { - swp_entry_t entry; - struct address_space *address_space; + swp_entry_t entry = { .val = page_private(page) }; + struct address_space *address_space = swap_address_space(entry); - entry.val = page_private(page); - - address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, entry); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); @@@ -413,19 -389,11 +389,11 @@@ struct page *__read_swap_cache_async(sw break; /* Out of memory */ } - /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); - if (err) - break; - /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { - radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page @@@ -433,27 -401,19 +401,20 @@@ */ cond_resched(); continue; - } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); + } else if (err) /* swp entry is obsolete ? 
*/ break; - } - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + /* May fail (-ENOMEM) if XArray node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { - radix_tree_preload_end(); - /* - * Initiate read into locked page and return. - */ + /* Initiate read into locked page */ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } - radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@@ -626,7 -586,7 +587,7 @@@ int init_swap_address_space(unsigned in return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ diff --combined mm/vmscan.c index 28c9ae5633b9,f9cc86e91812..62ac0c488624 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@@ -49,7 -49,6 +49,7 @@@ #include #include #include +#include #include #include @@@ -474,18 -473,9 +474,18 @@@ static unsigned long do_shrink_slab(str nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } /* * Make sure we apply some minimal pressure on default priority @@@ -590,8 -580,8 +590,8 @@@ static unsigned long shrink_slab_memcg( struct mem_cgroup *memcg, int priority) { struct memcg_shrinker_map *map; - unsigned long freed = 0; - int ret, i; + unsigned long ret, freed = 0; + int i; if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) return 0; @@@ -687,8 -677,9 +687,8 @@@ static unsigned long shrink_slab(gfp_t struct mem_cgroup *memcg, int priority) { + unsigned long ret, freed = 0; struct shrinker *shrinker; - unsigned long freed = 0; - int ret; if (!mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); @@@ -751,12 -742,12 +751,12 @@@ static inline int is_page_cache_freeabl { /* * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache radix tree and - * optional buffer heads at page->private. + * that isolated the page, the page cache and optional buffer + * heads at page->private. */ - int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? 
HPAGE_PMD_NR : 1; - return page_count(page) - page_has_private(page) == 1 + radix_pins; + return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@@ -932,7 -923,7 +932,7 @@@ static int __remove_mapping(struct addr if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { @@@ -2155,7 -2146,6 +2155,7 @@@ static void shrink_active_list(unsigne } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } @@@ -2467,11 -2457,9 +2467,11 @@@ out /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. + * Make sure we don't miss the last page + * because of a round-off error. */ - scan = div64_u64(scan * fraction[file], - denominator); + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: @@@ -3315,7 -3303,6 +3315,7 @@@ unsigned long try_to_free_mem_cgroup_pa { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@@ -3344,13 -3331,9 +3344,13 @@@ sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@@ -3515,7 -3498,6 +3515,7 @@@ static int balance_pgdat(pg_data_t *pgd int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@@ -3526,7 -3508,6 +3526,7 @@@ .may_swap = 1, }; + psi_memstall_enter(&pflags); __fs_reclaim_acquire(); count_vm_event(PAGEOUTRUN); @@@ -3628,7 -3609,6 +3628,7 @@@ out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller diff --combined mm/workingset.c index cbc13d4dfa79,5cfb29ec3fd9..d46f8c92aa2f --- a/mm/workingset.c +++ b/mm/workingset.c @@@ -121,7 -121,7 +121,7 @@@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@@ -134,10 -134,6 +134,10 @@@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@@ -145,14 -141,6 +145,14 @@@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. 
The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@@ -160,20 -148,21 +160,20 @@@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ - #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ + #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@@ -181,28 -170,22 +181,27 @@@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; + eviction = (eviction << 1) | workingset; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; + bool workingset; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@@ -211,7 -194,6 +210,7 @@@ *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@@ -224,8 -206,8 +223,8 @@@ */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@@ -237,30 -219,30 +236,30 @@@ lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. 
*/ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@@ -280,51 -262,41 +279,51 @@@ * configurations instead. */ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** @@@ -367,7 -339,7 +366,7 @@@ out static struct list_lru shadow_nodes; - void workingset_update_node(struct radix_tree_node *node) + void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@@ -377,20 -349,12 +376,20 @@@ * already where they should be. 
The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + - if (node->count && node->count == node->exceptional) { + if (node->count && node->count == node->nr_values) { - if (list_empty(&node->private_list)) + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@@ -399,12 -363,12 +398,12 @@@ static unsigned long count_shadow_nodes { unsigned long max_nodes; unsigned long nodes; - unsigned long cache; + unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@@ -419,26 -383,20 +418,26 @@@ * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ +#ifdef CONFIG_MEMCG if (sc->memcg) { - cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - max_nodes = cache >> (XA_CHUNK_SHIFT - 3); + struct lruvec *lruvec; + + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL); + lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + } else +#endif + pages = node_present_pages(sc->nid); + - max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3); ++ max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@@ -451,11 -409,11 +450,11 @@@ static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@@ -463,15 -421,14 +462,14 @@@ * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. 
*/ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@@ -481,8 -438,6 +479,8 @@@ } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* @@@ -490,29 -445,21 +488,21 @@@ * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) - goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->exceptional--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; + mapping->nrexceptional -= node->nr_values; + xas.xa_node = xa_parent_locked(&mapping->i_pages, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, workingset_update_node); + /* + * We could store a shadow entry here which was the minimum of the + * shadow entries we were tracking ... + */ + xas_store(&xas, NULL); - inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->i_pages, node, - workingset_lookup_update(mapping)); out_invalid: xa_unlock_irq(&mapping->i_pages); @@@ -534,7 -481,7 +524,7 @@@ static unsigned long scan_shadow_nodes( static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, - .seeks = DEFAULT_SEEKS, + .seeks = 0, /* ->count reports only fully expendable nodes */ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, };
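One recurring idiom worth calling out from the mm/workingset.c hunks: the shadow-entry cookie (memcg ID, node ID, eviction timestamp, and the new PageWorkingset bit) is now packed into an XArray value entry with xa_mk_value(), and lookups elsewhere in this series (mc_handle_file_pte(), pagevec_remove_exceptionals()) test for such entries with xa_is_value() instead of radix_tree_exceptional_entry(). A rough sketch of the pack/unpack pattern follows; the field widths and names are invented for illustration and are not the actual shadow-entry layout:

#include <linux/xarray.h>

/* Hypothetical layout: bit 0 = flag, bits 1-10 = id, remaining bits = counter. */
static void *pack_cookie(unsigned int id, unsigned long counter, bool flag)
{
        unsigned long v = (counter << 11) | ((unsigned long)id << 1) | flag;

        return xa_mk_value(v);                  /* stored in the slot, never dereferenced */
}

static void unpack_cookie(void *entry, unsigned int *id,
                          unsigned long *counter, bool *flag)
{
        unsigned long v = xa_to_value(entry);

        *flag = v & 1;
        *id = (v >> 1) & ((1UL << 10) - 1);
        *counter = v >> 11;
}

Because a value entry only has BITS_PER_LONG - 1 bits to play with, the real code derives EVICTION_SHIFT from BITS_PER_XA_VALUE and, as the comment in the hunk above explains, trades timestamp granularity (bucket_order) for range when the identifier bits leave too little room.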