local_ops
workqueue
genericirq
+ xarray
flexible-arrays
librs
genalloc
errseq
printk-formats
circular-buffers
+ memory-allocation
mm-api
gfp_mask-from-fs-io
timekeeping
boot-time-mm
+ memory-hotplug
+
Interfaces for kernel debugging
===============================
F: Documentation/ABI/testing/configfs-acpi
F: drivers/pci/*acpi*
F: drivers/pci/*/*acpi*
-F: drivers/pci/*/*/*acpi*
F: tools/power/acpi/
ACPI APEI
F: drivers/hwmon/adt7475.c
ADVANSYS SCSI DRIVER
S: Maintained
S: Supported
F: drivers/mux/adgs1408.c
-F: Documentation/devicetree/bindings/mux/adgs1408.txt
+F: Documentation/devicetree/bindings/mux/adi,adgs1408.txt
ANALOG DEVICES INC ADP5061 DRIVER
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
S: Supported
F: arch/arm/mach-actions/
F: arch/arm/boot/dts/owl-*
F: arch/arm64/boot/dts/actions/
-F: drivers/clocksource/owl-*
+F: drivers/clocksource/timer-owl*
F: drivers/pinctrl/actions/*
F: drivers/soc/actions/
F: include/dt-bindings/power/owl-*
ARM/Annapurna Labs ALPINE ARCHITECTURE
-M: Antoine Tenart <antoine.tenart@free-electrons.com>
+M: Antoine Tenart <antoine.tenart@bootlin.com>
S: Maintained
F: arch/arm/mach-alpine/
S: Maintained
F: arch/arm/boot/dts/lpc43*
F: drivers/clk/nxp/clk-lpc18xx*
-F: drivers/clocksource/time-lpc32xx.c
+F: drivers/clocksource/timer-lpc32xx.c
F: drivers/i2c/busses/i2c-lpc2k.c
F: drivers/memory/pl172.c
F: drivers/mtd/spi-nor/nxp-spifi.c
F: drivers/gpio/gpio-uniphier.c
F: drivers/i2c/busses/i2c-uniphier*
F: drivers/irqchip/irq-uniphier-aidet.c
+F: drivers/mmc/host/uniphier-sd.c
F: drivers/pinctrl/uniphier/
F: drivers/reset/reset-uniphier.c
F: drivers/tty/serial/8250/8250_uniphier.c
F: */*/vexpress*
F: */*/*/vexpress*
F: drivers/clk/versatile/clk-vexpress-osc.c
-F: drivers/clocksource/versatile.c
+F: drivers/clocksource/timer-versatile.c
N: mps2
ARM/VFP SUPPORT
S: Maintained
F: arch/arm/mach-vt8500/
-F: drivers/clocksource/vt8500_timer.c
+F: drivers/clocksource/timer-vt8500.c
F: drivers/i2c/busses/i2c-wmt.c
F: drivers/mmc/host/wmt-sdmmc.c
F: drivers/pwm/pwm-vt8500.c
F: drivers/block/xsysace.c
N: zynq
N: xilinx
-F: drivers/clocksource/cadence_ttc_timer.c
+F: drivers/clocksource/timer-cadence-ttc.c
F: drivers/i2c/busses/i2c-cadence.c
F: drivers/mmc/host/sdhci-of-arasan.c
F: drivers/edac/synopsys_edac.c
BROADCOM BNX2 GIGABIT ETHERNET DRIVER
S: Supported
BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER
S: Supported
F: drivers/gpio/gpio-brcmstb.c
F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
+BROADCOM BRCMSTB I2C DRIVER
+S: Supported
+F: drivers/i2c/busses/i2c-brcmstb.c
+F: Documentation/devicetree/bindings/i2c/i2c-brcmstb.txt
+
BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER
F: Documentation/devicetree/bindings/memory-controllers/brcm,dpfe-cpu.txt
F: drivers/memory/brcmstb_dpfe.c
+BROADCOM SPI DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/spi/brcm,spi-bcm-qspi.txt
+F: drivers/spi/spi-bcm-qspi.*
+F: drivers/spi/spi-brcmstb-qspi.c
+F: drivers/spi/spi-iproc-qspi.c
+
BROADCOM SYSTEMPORT ETHERNET DRIVER
F: Documentation/devicetree/bindings/media/coda.txt
F: drivers/media/platform/coda/
+CODE OF CONDUCT
+S: Supported
+F: Documentation/process/code-of-conduct.rst
+F: Documentation/process/code-of-conduct-interpretation.rst
+
COMMON CLK FRAMEWORK
S: Supported
F: drivers/scsi/cxlflash/
-F: include/uapi/scsi/cxlflash_ioctls.h
+F: include/uapi/scsi/cxlflash_ioctl.h
F: Documentation/powerpc/cxlflash.txt
CYBERPRO FB DRIVER
S: Supported
-F: drivers/input/dlink-dir685-touchkeys.c
+F: drivers/input/keyboard/dlink-dir685-touchkeys.c
DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK
F: drivers/platform/x86/dell-smbios-wmi.c
F: tools/wmi/dell-smbios-example.c
+DEFZA FDDI NETWORK DRIVER
+S: Maintained
+F: drivers/net/fddi/defza.*
+
DELL LAPTOP DRIVER
F: drivers/i2c/busses/i2c-diolan-u2c.c
FILESYSTEM DIRECT ACCESS (DAX)
F: Documentation/
F: scripts/kernel-doc
X: Documentation/ABI/
+X: Documentation/acpi/
X: Documentation/devicetree/
-X: Documentation/acpi
-X: Documentation/power
-X: Documentation/spi
-X: Documentation/media
+X: Documentation/i2c/
+X: Documentation/media/
+X: Documentation/power/
+X: Documentation/spi/
T: git git://git.lwn.net/linux.git docs-next
DOCUMENTATION/ITALIAN
DPAA2 ETHERNET DRIVER
-L: linux-kernel@vger.kernel.org
+L: netdev@vger.kernel.org
S: Maintained
-F: drivers/staging/fsl-dpaa2/ethernet
+F: drivers/net/ethernet/freescale/dpaa2/dpaa2-eth*
+F: drivers/net/ethernet/freescale/dpaa2/dpni*
+F: drivers/net/ethernet/freescale/dpaa2/dpkg.h
+F: drivers/net/ethernet/freescale/dpaa2/Makefile
+F: drivers/net/ethernet/freescale/dpaa2/Kconfig
DPAA2 ETHERNET SWITCH DRIVER
DPAA2 PTP CLOCK DRIVER
-L: linux-kernel@vger.kernel.org
+L: netdev@vger.kernel.org
S: Maintained
-F: drivers/staging/fsl-dpaa2/rtc
+F: drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp*
+F: drivers/net/ethernet/freescale/dpaa2/dprtc*
DPT_I2O SCSI RAID DRIVER
F: drivers/edac/r82600_edac.c
EDAC-SBRIDGE
S: Maintained
F: drivers/edac/sb_edac.c
F: drivers/net/ethernet/agere/
ETHERNET BRIDGE
W: http://www.linuxfoundation.org/en/Net:Bridge
Q: http://patchwork.ozlabs.org/project/linux-ext4/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git
S: Maintained
-F: Documentation/filesystems/ext4.txt
+F: Documentation/filesystems/ext4/ext4.rst
F: fs/ext4/
Extended Verification Module (EVM)
F: Documentation/hwmon/
F: drivers/hwmon/
F: include/linux/hwmon*.h
+F: include/trace/events/hwmon*.h
HARDWARE RANDOM NUMBER GENERATOR CORE
F: mm/memory-failure.c
F: mm/hwpoison-inject.c
+HYGON PROCESSOR SUPPORT
+S: Maintained
+F: arch/x86/kernel/cpu/hygon.c
+
Hyper-V CORE AND DRIVERS
S: Supported
F: Documentation/networking/e100.rst
F: Documentation/networking/e1000.rst
-F: Documentation/networking/e1000e.txt
-F: Documentation/networking/igb.txt
-F: Documentation/networking/igbvf.txt
-F: Documentation/networking/ixgb.txt
-F: Documentation/networking/ixgbe.txt
-F: Documentation/networking/ixgbevf.txt
-F: Documentation/networking/i40e.txt
-F: Documentation/networking/i40evf.txt
-F: Documentation/networking/ice.txt
+F: Documentation/networking/e1000e.rst
+F: Documentation/networking/fm10k.rst
+F: Documentation/networking/igb.rst
+F: Documentation/networking/igbvf.rst
+F: Documentation/networking/ixgb.rst
+F: Documentation/networking/ixgbe.rst
+F: Documentation/networking/ixgbevf.rst
+F: Documentation/networking/i40e.rst
+F: Documentation/networking/iavf.rst
+F: Documentation/networking/ice.rst
F: drivers/net/ethernet/intel/
F: drivers/net/ethernet/intel/*/
F: include/linux/avf/virtchnl.h
S: Supported
F: drivers/gpu/drm/i915/gvt/
+INTEL PMIC GPIO DRIVER
+S: Maintained
+F: drivers/gpio/gpio-*cove.c
+F: drivers/gpio/gpio-msic.c
+
INTEL HID EVENT DRIVER
F: arch/x86/include/asm/intel_pmc_ipc.h
F: arch/x86/include/asm/intel_punit_ipc.h
+INTEL MULTIFUNCTION PMIC DEVICE DRIVERS
+S: Maintained
+F: drivers/mfd/intel_msic.c
+F: drivers/mfd/intel_soc_pmic*
+F: include/linux/mfd/intel_msic.h
+F: include/linux/mfd/intel_soc_pmic*
+
INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
F: drivers/infiniband/hw/i40iw/
F: include/uapi/rdma/i40iw-abi.h
-INTEL SHA MULTIBUFFER DRIVER
-S: Supported
-F: arch/x86/crypto/sha*-mb/
-F: crypto/mcryptd.c
-
INTEL TELEMETRY DRIVER
W: http://openipmi.sourceforge.net/
S: Supported
+F: Documentation/devicetree/bindings/ipmi/
F: Documentation/IPMI.txt
F: drivers/char/ipmi/
F: include/linux/ipmi*
KEYS-TRUSTED
F: net/l3mdev
F: include/net/l3mdev.h
+L7 BPF FRAMEWORK
+S: Maintained
+F: include/linux/skmsg.h
+F: net/core/skmsg.c
+F: net/core/sock_map.c
+F: net/ipv4/tcp_bpf.c
+
+LANTIQ / INTEL Ethernet drivers
+S: Maintained
+F: net/dsa/tag_gswip.c
+F: drivers/net/ethernet/lantiq_xrx200.c
+F: drivers/net/dsa/lantiq_pce.h
+F: drivers/net/dsa/lantiq_gswip.c
+
LANTIQ MIPS ARCHITECTURE
F: arch/*/include/asm/spinlock*.h
F: include/linux/rwlock*.h
F: include/linux/mutex*.h
-F: arch/*/include/asm/mutex*.h
F: include/linux/rwsem*.h
F: arch/*/include/asm/rwsem.h
F: include/linux/seqlock.h
F: drivers/scsi/mpt3sas/
LSILOGIC/SYMBIOS/NCR 53C8XX and 53C1010 PCI-SCSI drivers
S: Maintained
F: drivers/scsi/sym53c8xx_2/
S: Maintained
F: drivers/net/dsa/mv88e6xxx/
-F: linux/platform_data/mv88e6xxx.h
+F: include/linux/platform_data/mv88e6xxx.h
F: Documentation/devicetree/bindings/net/dsa/marvell.txt
MARVELL ARMADA DRM SUPPORT
F: drivers/mmc/host/sdhci-xenon*
F: Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
+MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER
+S: Supported
+F: drivers/net/ethernet/marvell/octeontx2/af/
+
MATROX FRAMEBUFFER DRIVER
S: Orphan
F: Documentation/hwmon/max16065
F: drivers/hwmon/max16065.c
-MAX20751 HARDWARE MONITOR DRIVER
-S: Maintained
-F: Documentation/hwmon/max20751
-F: drivers/hwmon/max20751.c
-
MAX2175 SDR TUNER DRIVER
S: Maintained
-F: drivers/watchdog/menz069_wdt.c
+F: drivers/watchdog/menz69_wdt.c
MESON AO CEC DRIVER FOR AMLOGIC SOCS
S: Maintained
F: drivers/tty/serial/atmel_serial.c
F: drivers/tty/serial/atmel_serial.h
+F: Documentation/devicetree/bindings/mfd/atmel-usart.txt
MICROCHIP / ATMEL DMA DRIVER
F: drivers/mtd/nand/raw/atmel/*
F: Documentation/devicetree/bindings/mtd/atmel-nand.txt
+MICROCHIP AT91 USART MFD DRIVER
+S: Supported
+F: drivers/mfd/at91-usart.c
+F: include/dt-bindings/mfd/at91-usart.h
+F: Documentation/devicetree/bindings/mfd/atmel-usart.txt
+
+MICROCHIP AT91 USART SPI DRIVER
+S: Supported
+F: drivers/spi/spi-at91-usart.c
+F: Documentation/devicetree/bindings/mfd/atmel-usart.txt
+
MICROCHIP KSZ SERIES ETHERNET SWITCH DRIVER
S: Maintained
-F: arch/mips/loongson64/*{2e/2f}*
+F: arch/mips/loongson64/fuloong-2e/
+F: arch/mips/loongson64/lemote-2f/
F: arch/mips/include/asm/mach-loongson64/
F: drivers/*/*loongson2*
F: drivers/*/*/*loongson2*
F: arch/arm/boot/dts/mmp*
F: arch/arm/mach-mmp/
+MMU GATHER AND TLB INVALIDATION
+S: Maintained
+F: arch/*/include/asm/tlb.h
+F: include/asm-generic/tlb.h
+F: mm/mmu_gather.c
+
MN88472 MEDIA DRIVER
S: Maintained
F: Documentation/ABI/testing/sysfs-class-mux*
F: Documentation/devicetree/bindings/mux/
-F: include/linux/dt-bindings/mux/
+F: include/dt-bindings/mux/
F: include/linux/mux/
F: drivers/mux/
F: drivers/gpu/drm/mxsfb/
F: Documentation/devicetree/bindings/display/mxsfb.txt
+MYLEX DAC960 PCI RAID Controller
+S: Supported
+F: drivers/scsi/myrb.*
+F: drivers/scsi/myrs.*
+
MYRICOM MYRI-10G 10GbE DRIVER (MYRI10GE)
T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git
T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git
S: Maintained
-F: net/core/flow.c
F: net/xfrm/
F: net/key/
F: net/ipv4/xfrm*
S: Maintained
F: net/tls/*
S: Odd Fixes
F: Documentation/auxdisplay/lcd-panel-cgram.txt
-F: drivers/misc/panel.c
+F: drivers/auxdisplay/panel.c
PARALLEL PORT SUBSYSTEM
S: Maintained
-F: drivers/pci/controller/dwc/*keystone*
+F: drivers/pci/controller/dwc/pci-keystone.c
PCI ENDPOINT SUBSYSTEM
F: drivers/pinctrl/intel/
PIN CONTROLLER - MEDIATEK
-M: Sean Wang <sean.wang@mediatek.com>
+M: Sean Wang <sean.wang@kernel.org>
S: Maintained
F: Documentation/devicetree/bindings/pinctrl/pinctrl-mt65xx.txt
F: Documentation/devicetree/bindings/pinctrl/pinctrl-mt7622.txt
-F: drivers/pinctrl/mediatek/mtk-eint.*
-F: drivers/pinctrl/mediatek/pinctrl-mtk-common.*
-F: drivers/pinctrl/mediatek/pinctrl-mt2701.c
-F: drivers/pinctrl/mediatek/pinctrl-mt7622.c
+F: drivers/pinctrl/mediatek/
PIN CONTROLLER - QUALCOMM
W: http://www.roeck-us.net/linux/drivers/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git
S: Maintained
+F: Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt
+F: Documentation/devicetree/bindings/hwmon/max31785.txt
+F: Documentation/devicetree/bindings/hwmon/ltc2978.txt
+F: Documentation/hwmon/adm1275
+F: Documentation/hwmon/ibm-cffps
+F: Documentation/hwmon/ir35221
+F: Documentation/hwmon/lm25066
+F: Documentation/hwmon/ltc2978
+F: Documentation/hwmon/ltc3815
+F: Documentation/hwmon/max16064
+F: Documentation/hwmon/max20751
+F: Documentation/hwmon/max31785
+F: Documentation/hwmon/max34440
+F: Documentation/hwmon/max8688
F: Documentation/hwmon/pmbus
+F: Documentation/hwmon/pmbus-core
+F: Documentation/hwmon/tps40422
+F: Documentation/hwmon/ucd9000
+F: Documentation/hwmon/ucd9200
+F: Documentation/hwmon/zl6100
F: drivers/hwmon/pmbus/
F: include/linux/pmbus.h
F: drivers/scsi/qla4xxx/
QLOGIC QLCNIC (1/10)Gb ETHERNET DRIVER
-M: Harish Patil <harish.patil@cavium.com>
+M: Shahed Shaikh <Shahed.Shaikh@cavium.com>
F: drivers/net/ethernet/qlogic/qlcnic/
QLOGIC QLGE 10Gb ETHERNET DRIVER
S: Supported
F: drivers/s390/crypto/
+S390 VFIO AP DRIVER
+W: http://www.ibm.com/developerworks/linux/linux390/
+S: Supported
+F: drivers/s390/crypto/vfio_ap_drv.c
+F: drivers/s390/crypto/vfio_ap_private.h
+F: drivers/s390/crypto/vfio_ap_ops.c
+F: Documentation/s390/vfio-ap.txt
+
S390 ZFCP DRIVER
-L: selinux@tycho.nsa.gov (moderated for non-subscribers)
+L: selinux@vger.kernel.org
W: https://selinuxproject.org
W: https://github.com/SELinuxProject
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git
S: Supported
F: drivers/siox/*
+F: drivers/gpio/gpio-siox.c
F: include/trace/events/siox.h
SIS 190 ETHERNET DRIVER
S: Maintained
F: Documentation/devicetree/bindings/arm/firmware/sdei.txt
F: drivers/firmware/arm_sdei.c
-F: include/linux/sdei.h
-F: include/uapi/linux/sdei.h
+F: include/linux/arm_sdei.h
+F: include/uapi/linux/arm_sdei.h
SOFTWARE RAID (Multiple Disks) SUPPORT
F: include/sound/soc*
SOUNDWIRE SUBSYSTEM
F: drivers/reset/reset-axs10x.c
F: Documentation/devicetree/bindings/reset/snps,axs10x-reset.txt
+SYNOPSYS CREG GPIO DRIVER
+S: Maintained
+F: drivers/gpio/gpio-creg-snps.c
+F: Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt
+
SYNOPSYS DESIGNWARE 8250 UART DRIVER
S: Maintained
S: Maintained
F: drivers/net/ethernet/ti/netcp*
+TI PCM3060 ASoC CODEC DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/sound/pcm3060.txt
+F: sound/soc/codecs/pcm3060*
+
TI TAS571X FAMILY ASoC CODEC DRIVER
F: drivers/usb/typec/altmodes/
F: include/linux/usb/typec_altmode.h
+USB TYPEC PORT CONTROLLER DRIVERS
+S: Maintained
+F: drivers/usb/typec/tcpm/
+
USB UHCI DRIVER
F: fs/hostfs/
F: fs/hppfs/
+USERSPACE COPYIN/COPYOUT (UIOVEC)
+S: Maintained
+F: lib/iov_iter.c
+F: include/linux/uio.h
+
USERSPACE I/O (UIO)
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
F: Documentation/driver-api/uio-howto.rst
F: drivers/uio/
-F: include/linux/uio*.h
+F: include/linux/uio_driver.h
UTIL-LINUX PACKAGE
UVESAFB DRIVER
-W: http://dev.gentoo.org/~spock/projects/uvesafb/
+W: https://github.com/mjanusz/v86d
S: Maintained
F: Documentation/fb/uvesafb.txt
F: drivers/video/fbdev/uvesafb.*
VRF
-M: Shrijeet Mukherjee <shm@cumulusnetworks.com>
+M: Shrijeet Mukherjee <shrijeet@gmail.com>
S: Maintained
F: drivers/net/vrf.c
S: Maintained
F: arch/x86/entry/vdso/
+XARRAY
+S: Supported
+F: Documentation/core-api/xarray.rst
+F: lib/idr.c
+F: lib/xarray.c
+F: include/linux/idr.h
+F: include/linux/xarray.h
+F: tools/testing/radix-tree
+
XC2028/3028 TUNER DRIVER
* Linux/PA-RISC Project (http://www.parisc-linux.org/)
*
* System call entry code / Linux gateway page
- * Copyright (c) Matthew Wilcox 1999 <willy@bofh.ai>
+ * Copyright (c) Matthew Wilcox 1999 <willy@infradead.org>
* Licensed under the GNU GPL.
* thanks to Philipp Rumpf, Mike Shaver and various others
* sorry about the wall, puffin..
sub,<> %r28, %r25, %r0
2: stw %r24, 0(%r26)
/* Free lock */
- sync
- stw %r20, 0(%sr2,%r20)
+ stw,ma %r20, 0(%sr2,%r20)
#if ENABLE_LWS_DEBUG
/* Clear thread register indicator */
stw %r0, 4(%sr2,%r20)
3:
/* Error occurred on load or store */
/* Free lock */
- sync
- stw %r20, 0(%sr2,%r20)
+ stw,ma %r20, 0(%sr2,%r20)
#if ENABLE_LWS_DEBUG
stw %r0, 4(%sr2,%r20)
#endif
cas2_end:
/* Free lock */
- sync
- stw %r20, 0(%sr2,%r20)
+ stw,ma %r20, 0(%sr2,%r20)
/* Enable interrupts */
ssm PSW_SM_I, %r0
/* Return to userspace, set no error */
22:
/* Error occurred on load or store */
/* Free lock */
- sync
- stw %r20, 0(%sr2,%r20)
+ stw,ma %r20, 0(%sr2,%r20)
ssm PSW_SM_I, %r0
ldo 1(%r0),%r28
b lws_exit
*/
#define _PAGE_BIT_SWAP_TYPE 0
-#define _PAGE_NA 0
-#define _PAGE_RO 0
-#define _PAGE_USER 0
-
#define _PAGE_EXEC 0x00001 /* execute permission */
#define _PAGE_WRITE 0x00002 /* write access allowed */
#define _PAGE_READ 0x00004 /* read access allowed */
*/
#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
_PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \
- _PAGE_SOFT_DIRTY)
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
/*
* user access blocked by key
*/
#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ)
#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \
_PAGE_RW | _PAGE_EXEC)
-/*
- * No page size encoding in the linux PTE
- */
-#define _PAGE_PSIZE 0
/*
* _PAGE_CHG_MASK masks of bits that are to be preserved across
* pgprot changes
*/
#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
_PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
- _PAGE_SOFT_DIRTY)
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
#define H_PTE_PKEY (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \
H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4)
-/*
- * Mask of bits returned by pte_pgprot()
- */
-#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
- H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
- _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \
- _PAGE_SOFT_DIRTY | H_PTE_PKEY)
/*
* We define 2 sets of base prot bits, one for basic pages (ie,
* cacheable kernel and user pages) and one for non cacheable
* pages. We always set _PAGE_COHERENT when SMP is enabled or
* the processor might need it for DMA coherency.
*/
-#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED)
#define _PAGE_BASE (_PAGE_BASE_NC)
/* Permission masks used to generate the __P and __S table,
* Write permissions imply read permissions for now (we could make write-only
* pages on BookE but we don't bother for now). Execute permission control is
* possible on platforms that define _PAGE_EXEC
- *
- * Note due to the way vm flags are laid out, the bits are XWR
*/
#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED)
#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW)
#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ)
#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_X
-#define __P101 PAGE_READONLY_X
-#define __P110 PAGE_COPY_X
-#define __P111 PAGE_COPY_X
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_X
-#define __S101 PAGE_READONLY_X
-#define __S110 PAGE_SHARED_X
-#define __S111 PAGE_SHARED_X
-
/* Permission masks used for kernel mappings */
#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0);
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SPECIAL));
}
-static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+static inline bool pte_exec(pte_t pte)
+{
+ return !!(pte_raw(pte) & cpu_to_be64(_PAGE_EXEC));
+}
+
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline bool pte_soft_dirty(pte_t pte)
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SOFT_DIRTY));
}
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SOFT_DIRTY));
}
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
*/
VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) !=
cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED));
- return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED));
}
#define pte_clear_savedwrite pte_clear_savedwrite
* Used by KSM subsystem to make a protnone pte readonly.
*/
VM_BUG_ON(!pte_protnone(pte));
- return __pte(pte_val(pte) | _PAGE_PRIVILEGED);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED));
}
#else
#define pte_clear_savedwrite pte_clear_savedwrite
static inline pte_t pte_clear_savedwrite(pte_t pte)
{
VM_WARN_ON(1);
- return __pte(pte_val(pte) & ~_PAGE_WRITE);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
}
#endif /* CONFIG_NUMA_BALANCING */
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
}
+static inline bool pte_hw_valid(pte_t pte)
+{
+ return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+}
+
#ifdef CONFIG_PPC_MEM_KEYS
extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute);
#else
}
#endif /* CONFIG_PPC_MEM_KEYS */
+static inline bool pte_user(pte_t pte)
+{
+ return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED));
+}
+
#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
- unsigned long pteval = pte_val(pte);
- /* Also check for pte_user */
- unsigned long clear_pte_bits = _PAGE_PRIVILEGED;
/*
* _PAGE_READ is needed for any access and will be
* cleared for PROT_NONE
*/
- unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_READ;
-
- if (write)
- need_pte_bits |= _PAGE_WRITE;
-
- if ((pteval & need_pte_bits) != need_pte_bits)
+ if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
return false;
- if ((pteval & clear_pte_bits) == clear_pte_bits)
+ if (write && !pte_write(pte))
return false;
return arch_pte_access_permitted(pte_val(pte), write, 0);
{
if (unlikely(pte_savedwrite(pte)))
return pte_clear_savedwrite(pte);
- return __pte(pte_val(pte) & ~_PAGE_WRITE);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
+}
+
+static inline pte_t pte_exprotect(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_EXEC));
}
static inline pte_t pte_mkclean(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_DIRTY));
}
static inline pte_t pte_mkold(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_ACCESSED));
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC));
+}
+
+static inline pte_t pte_mkpte(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
}
static inline pte_t pte_mkwrite(pte_t pte)
/*
* write implies read, hence set both
*/
- return __pte(pte_val(pte) | _PAGE_RW);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_RW));
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}
static inline pte_t pte_mkyoung(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_ACCESSED);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_ACCESSED));
}
static inline pte_t pte_mkspecial(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_SPECIAL);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL));
}
static inline pte_t pte_mkhuge(pte_t pte)
static inline pte_t pte_mkdevmap(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP));
+}
+
+static inline pte_t pte_mkprivileged(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED));
+}
+
+static inline pte_t pte_mkuser(pte_t pte)
+{
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED));
}
/*
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
/* FIXME!! check whether this need to be a conditional */
- return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
-}
-
-static inline bool pte_user(pte_t pte)
-{
- return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED));
+ return __pte_raw((pte_raw(pte) & cpu_to_be64(_PAGE_CHG_MASK)) |
+ cpu_to_be64(pgprot_val(newprot)));
}
/* Encode and de-code a swap entry */
BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
} while (0)
- /*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
#define SWP_TYPE_BITS 5
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
*/
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE)
+#define __pmd_to_swp_entry(pmd) (__pte_to_swp_entry(pmd_pte(pmd)))
+#define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x)))
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
- return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+ return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_SOFT_DIRTY));
}
static inline bool pte_swp_soft_dirty(pte_t pte)
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
- return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+ return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_SOFT_DIRTY));
}
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
*/
static inline bool pte_ci(pte_t pte)
{
- unsigned long pte_v = pte_val(pte);
+ __be64 pte_v = pte_raw(pte);
- if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) ||
- ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT))
+ if (((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_TOLERANT)) ||
+ ((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_NON_IDEMPOTENT)))
return true;
return false;
}
static inline int pmd_present(pmd_t pmd)
{
+ /*
+ * A pmd is considered present if _PAGE_PRESENT is set.
+ * We also need to treat a pmd that is marked invalid during
+ * a THP split as present. Hence we look for _PAGE_INVALID
+ * if we find _PAGE_PRESENT cleared.
+ */
+ if (pmd_raw(pmd) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID))
+ return true;
- return !pmd_none(pmd);
+ return false;
}
static inline int pmd_bad(pmd_t pmd)
static inline int pud_present(pud_t pud)
{
- return !pud_none(pud);
+ return (pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT));
}
extern struct page *pud_page(pud_t pud);
static inline int pgd_present(pgd_t pgd)
{
- return !pgd_none(pgd);
+ return (pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
}
static inline pte_t pgd_pte(pgd_t pgd)
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
-static inline int map_kernel_page(unsigned long ea, unsigned long pa,
- unsigned long flags)
+static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
if (radix_enabled()) {
#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM)
unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift;
WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE");
#endif
- return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE);
+ return radix__map_kernel_page(ea, pa, prot, PAGE_SIZE);
}
- return hash__map_kernel_page(ea, pa, flags);
+ return hash__map_kernel_page(ea, pa, prot);
}
static inline int __meminit vmemmap_create_mapping(unsigned long start,
#define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd))
#define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
#define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)))
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+#define pmd_swp_mksoft_dirty(pmd) pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)))
+#define pmd_swp_soft_dirty(pmd) pte_swp_soft_dirty(pmd_pte(pmd))
+#define pmd_swp_clear_soft_dirty(pmd) pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)))
+#endif
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
#ifdef CONFIG_NUMA_BALANCING
return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
}
+/*
+ * Returns true for pmd migration entries, THP, devmap and hugetlb,
+ * but is compile-time dependent on the THP config.
+ */
static inline int pmd_large(pmd_t pmd)
{
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED);
}
+/*
+ * Only returns true for a THP; false for a pmd migration entry.
+ * We also need to return true when we come across a pmd that is
+ * in the middle of a THP split. While splitting a THP, we mark the
+ * pmd invalid (pmdp_invalidate()) before we set it to the pte page
+ * address. A pmd_trans_huge() check against a pmd entry during that
+ * time should return true.
+ * We should not call this on a hugetlb entry; check for a HugeTLB
+ * entry using vma->vm_flags instead.
+ * The page table walk rule is explained in Documentation/vm/transhuge.rst
+ */
static inline int pmd_trans_huge(pmd_t pmd)
{
+ if (!pmd_present(pmd))
+ return false;
+
if (radix_enabled())
return radix__pmd_trans_huge(pmd);
return hash__pmd_trans_huge(pmd);
* Include the PTE bits definitions
*/
#include <asm/nohash/pte-book3e.h>
-#include <asm/pte-common.h>
+
+#define _PAGE_SAO 0
+
+#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1))
+
+/*
+ * _PAGE_CHG_MASK masks of bits that are to be preserved across
+ * pgprot changes.
+ */
+#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL)
+
+#define H_PAGE_4K_PFN 0
#ifndef __ASSEMBLY__
/* pte_clear moved to later in this file */
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_EXEC);
+}
+
#define PMD_BAD_BITS (PTE_TABLE_SIZE-1)
#define PUD_BAD_BITS (PMD_TABLE_SIZE-1)
pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
}
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
#define MAX_SWAPFILES_CHECK() do { \
BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
} while (0)
- /*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
#define SWP_TYPE_BITS 5
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __swp_entry_to_pte(x) __pte((x).val)
-extern int map_kernel_page(unsigned long ea, unsigned long pa,
- unsigned long flags);
+int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot);
extern int __meminit vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys);
* Copyright (C) 2002,2003 NEC Corporation
* Copyright (C) 2003-2005 Hewlett Packard
*
* All rights reserved.
* struct slot - slot information for each *physical* slot
*/
struct slot {
- struct hotplug_slot *hotplug_slot;
+ struct hotplug_slot hotplug_slot;
struct acpiphp_slot *acpi_slot;
- struct hotplug_slot_info info;
unsigned int sun; /* ACPI _SUN (Slot User Number) value */
};
static inline const char *slot_name(struct slot *slot)
{
- return hotplug_slot_name(slot->hotplug_slot);
+ return hotplug_slot_name(&slot->hotplug_slot);
+}
+
+static inline struct slot *to_slot(struct hotplug_slot *hotplug_slot)
+{
+ return container_of(hotplug_slot, struct slot, hotplug_slot);
}
/*
* Copyright (C) 2002,2003 NEC Corporation
* Copyright (C) 2003-2005 Hewlett Packard
*
* All rights reserved.
static struct acpiphp_attention_info *attention_info;
#define DRIVER_VERSION "0.5"
#define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver"
MODULE_AUTHOR(DRIVER_AUTHOR);
static int get_latch_status(struct hotplug_slot *slot, u8 *value);
static int get_adapter_status(struct hotplug_slot *slot, u8 *value);
-static struct hotplug_slot_ops acpi_hotplug_slot_ops = {
+static const struct hotplug_slot_ops acpi_hotplug_slot_ops = {
.enable_slot = enable_slot,
.disable_slot = disable_slot,
.set_attention_status = set_attention_status,
*/
static int enable_slot(struct hotplug_slot *hotplug_slot)
{
- struct slot *slot = hotplug_slot->private;
+ struct slot *slot = to_slot(hotplug_slot);
pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot));
*/
static int disable_slot(struct hotplug_slot *hotplug_slot)
{
- struct slot *slot = hotplug_slot->private;
+ struct slot *slot = to_slot(hotplug_slot);
pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot));
*/
static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value)
{
- struct slot *slot = hotplug_slot->private;
+ struct slot *slot = to_slot(hotplug_slot);
pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot));
*/
static int get_latch_status(struct hotplug_slot *hotplug_slot, u8 *value)
{
- struct slot *slot = hotplug_slot->private;
+ struct slot *slot = to_slot(hotplug_slot);
pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot));
*/
static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value)
{
- struct slot *slot = hotplug_slot->private;
+ struct slot *slot = to_slot(hotplug_slot);
pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot));
if (!slot)
goto error;
- slot->hotplug_slot = kzalloc(sizeof(*slot->hotplug_slot), GFP_KERNEL);
- if (!slot->hotplug_slot)
- goto error_slot;
-
- slot->hotplug_slot->info = &slot->info;
-
- slot->hotplug_slot->private = slot;
- slot->hotplug_slot->ops = &acpi_hotplug_slot_ops;
+ slot->hotplug_slot.ops = &acpi_hotplug_slot_ops;
slot->acpi_slot = acpiphp_slot;
- slot->hotplug_slot->info->power_status = acpiphp_get_power_status(slot->acpi_slot);
- slot->hotplug_slot->info->attention_status = 0;
- slot->hotplug_slot->info->latch_status = acpiphp_get_latch_status(slot->acpi_slot);
- slot->hotplug_slot->info->adapter_status = acpiphp_get_adapter_status(slot->acpi_slot);
acpiphp_slot->slot = slot;
slot->sun = sun;
snprintf(name, SLOT_NAME_SIZE, "%u", sun);
- retval = pci_hp_register(slot->hotplug_slot, acpiphp_slot->bus,
+ retval = pci_hp_register(&slot->hotplug_slot, acpiphp_slot->bus,
acpiphp_slot->device, name);
if (retval == -EBUSY)
- goto error_hpslot;
+ goto error_slot;
if (retval) {
pr_err("pci_hp_register failed with error %d\n", retval);
- goto error_hpslot;
+ goto error_slot;
}
pr_info("Slot [%s] registered\n", slot_name(slot));
return 0;
-error_hpslot:
- kfree(slot->hotplug_slot);
error_slot:
kfree(slot);
error:
pr_info("Slot [%s] unregistered\n", slot_name(slot));
- pci_hp_deregister(slot->hotplug_slot);
- kfree(slot->hotplug_slot);
+ pci_hp_deregister(&slot->hotplug_slot);
kfree(slot);
}
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree;
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned long compressed_len;
int faili = 0;
u32 *sums;
- tree = &BTRFS_I(inode)->io_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
struct extent_state **cached_state)
{
struct extent_state *state;
- struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->end == start - 1 && extent_state_in_tree(state)) {
- n = rb_next(&state->rb_node);
- while (n) {
- state = rb_entry(n, struct extent_state,
- rb_node);
+ while ((state = next_state(state)) != NULL) {
if (state->state & bits)
goto got_it;
- n = rb_next(n);
}
free_extent_state(*cached_state);
*cached_state = NULL;
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-STATIC u64 find_lock_delalloc_range(struct inode *inode,
+static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes)
return found;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
+{
+ return find_lock_delalloc_range(inode, tree, locked_page, start, end,
+ max_bytes);
+}
+#endif
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_buffer *eb)
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
- int was_dirty = 0;
+ bool was_dirty;
check_buffer_tree_ref(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ if (!was_dirty)
+ for (i = 0; i < num_pages; i++)
+ set_page_dirty(eb->pages[i]);
+
+#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ ASSERT(PageDirty(eb->pages[i]));
+#endif
+
return was_dirty;
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
*/
bio = bio_alloc(GFP_NOIO, 1);
- if (wbc) {
- wbc_init_bio(wbc, bio);
- wbc_account_io(wbc, bh->b_page, bh->b_size);
- }
-
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
submit_bio(bio);
return 0;
}
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
- ext4_find_delalloc_range(inode, map->m_lblk,
- map->m_lblk + map->m_len - 1))
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk,
map->m_len, map->m_pblk, status);
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
- ext4_find_delalloc_range(inode, map->m_lblk,
- map->m_lblk + map->m_len - 1))
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
return 0; /* success */
}
-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int offset,
unsigned int length)
{
- int to_release = 0, contiguous_blks = 0;
+ int contiguous_blks = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int stop = offset + length;
- int num_clusters;
ext4_fsblk_t lblk;
BUG_ON(stop > PAGE_SIZE || stop < length);
break;
if ((offset <= curr_off) && (buffer_delay(bh))) {
- to_release++;
contiguous_blks++;
clear_buffer_delay(bh);
} else if (contiguous_blks) {
(PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) -
contiguous_blks;
- ext4_es_remove_extent(inode, lblk, contiguous_blks);
+ ext4_es_remove_blks(inode, lblk, contiguous_blks);
contiguous_blks = 0;
}
curr_off = next_off;
if (contiguous_blks) {
lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
- ext4_es_remove_extent(inode, lblk, contiguous_blks);
+ ext4_es_remove_blks(inode, lblk, contiguous_blks);
}
- /* If we have released all the blocks belonging to a cluster, then we
- * need to release the reserved space for that cluster. */
- num_clusters = EXT4_NUM_B2C(sbi, to_release);
- while (num_clusters > 0) {
- lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
- ((num_clusters - 1) << sbi->s_cluster_bits);
- if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk))
- ext4_da_release_space(inode, 1);
-
- num_clusters--;
- }
}
/*
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}
+/*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ * tree, incrementing the reserved cluster/block
+ * count or making a pending reservation
+ * where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+ bool allocated = false;
+
+ /*
+ * If the cluster containing lblk is shared with a delayed,
+ * written, or unwritten extent in a bigalloc file system, it's
+ * already been accounted for and does not need to be reserved.
+ * A pending reservation must be made for the cluster if it's
+ * shared with a written or unwritten extent and doesn't already
+ * have one. Written and unwritten extents can be purged from the
+ * extents status tree if the system is under memory pressure, so
+ * it's necessary to examine the extent tree if a search of the
+ * extents status tree doesn't get a match.
+ */
+ if (sbi->s_cluster_ratio == 1) {
+ ret = ext4_da_reserve_space(inode);
+ if (ret != 0) /* ENOSPC */
+ goto errout;
+ } else { /* bigalloc */
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+ if (!ext4_es_scan_clu(inode,
+ &ext4_es_is_mapped, lblk)) {
+ ret = ext4_clu_mapped(inode,
+ EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ goto errout;
+ if (ret == 0) {
+ ret = ext4_da_reserve_space(inode);
+ if (ret != 0) /* ENOSPC */
+ goto errout;
+ } else {
+ allocated = true;
+ }
+ } else {
+ allocated = true;
+ }
+ }
+ }
+
+ ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+
+errout:
+ return ret;
+}
+
/*
* This function grabs code from the very beginning of
* ext4_map_blocks, but assumes that the caller is from delayed write
add_delayed:
if (retval == 0) {
int ret;
+
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
- /*
- * If the block was allocated from previously allocated cluster,
- * then we don't need to reserve it again. However we still need
- * to reserve metadata for every block we're going to write.
- */
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
- ret = ext4_da_reserve_space(inode);
- if (ret) {
- /* not enough space to reserve */
- retval = ret;
- goto out_unlock;
- }
- }
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- ~0, EXTENT_STATUS_DELAYED);
- if (ret) {
+ ret = ext4_insert_delayed_block(inode, map->m_lblk);
+ if (ret != 0) {
retval = ret;
goto out_unlock;
}
long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
- int tag;
+ xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
struct extent_status es;
- ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map.m_lblk, end, &es);
if (!es.es_len || es.es_lblk > end) {
/* entire range is a hole */
return !buffer_mapped(bh);
}
-int ext4_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
loff_t size;
unsigned long len;
- int ret;
+ int err;
+ vm_fault_t ret;
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = ext4_convert_inline_data(inode);
- if (ret)
+ err = ext4_convert_inline_data(inode);
+ if (err)
goto out_ret;
/* Delalloc case is easy... */
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = block_page_mkwrite(vma, vmf,
+ err = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
- } while (ret == -ENOSPC &&
+ } while (err == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
goto out_ret;
}
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = block_page_mkwrite(vma, vmf, get_block);
- if (!ret && ext4_should_journal_data(inode)) {
+ err = block_page_mkwrite(vma, vmf, get_block);
+ if (!err && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
out_ret:
- ret = block_page_mkwrite_return(ret);
+ ret = block_page_mkwrite_return(err);
out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
-int ext4_filemap_fault(struct vm_fault *vmf)
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
- int err;
+ vm_fault_t ret;
down_read(&EXT4_I(inode)->i_mmap_sem);
- err = filemap_fault(vmf);
+ ret = filemap_fault(vmf);
up_read(&EXT4_I(inode)->i_mmap_sem);
- return err;
+ return ret;
}
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/data.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
inode->i_ino == F2FS_NODE_INO(sbi) ||
S_ISDIR(inode->i_mode) ||
(S_ISREG(inode->i_mode) &&
- is_inode_flag_set(inode, FI_ATOMIC_FILE)) ||
+ (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
is_cold_data(page))
return true;
return false;
}
+static enum count_type __read_io_type(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi))
+ return F2FS_RD_META;
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi))
+ return F2FS_RD_NODE;
+ }
+ return F2FS_RD_DATA;
+}
+
/* postprocessing steps for read bios */
enum bio_post_read_step {
STEP_INITIAL = 0,
/* PG_error was set if any post_read step failed */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
- SetPageError(page);
+ /* will re-read again later */
+ ClearPageError(page);
} else {
SetPageUptodate(page);
}
+ dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
if (bio->bi_private)
static void f2fs_read_end_io(struct bio *bio)
{
- if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) {
- f2fs_show_injection_info(FAULT_IO);
+ if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)),
+ FAULT_READ_IO)) {
+ f2fs_show_injection_info(FAULT_READ_IO);
bio->bi_status = BLK_STS_IOERR;
}
struct bio_vec *bvec;
int i;
+ if (time_to_inject(sbi, FAULT_WRITE_IO)) {
+ f2fs_show_injection_info(FAULT_WRITE_IO);
+ bio->bi_status = BLK_STS_IOERR;
+ }
+
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
io->bio = NULL;
}
-static bool __has_merged_page(struct f2fs_bio_info *io,
- struct inode *inode, nid_t ino, pgoff_t idx)
+static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
+ struct page *page, nid_t ino)
{
struct bio_vec *bvec;
struct page *target;
if (!io->bio)
return false;
- if (!inode && !ino)
+ if (!inode && !page && !ino)
return true;
bio_for_each_segment_all(bvec, io->bio, i) {
else
target = fscrypt_control_page(bvec->bv_page);
- if (idx != target->index)
- continue;
-
if (inode && inode == target->mapping->host)
return true;
+ if (page && page == target)
+ return true;
if (ino && ino == ino_of_node(target))
return true;
}
}
static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
- nid_t ino, pgoff_t idx, enum page_type type)
+ struct page *page, nid_t ino,
+ enum page_type type)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
enum temp_type temp;
io = sbi->write_io[btype] + temp;
down_read(&io->io_rwsem);
- ret = __has_merged_page(io, inode, ino, idx);
+ ret = __has_merged_page(io, inode, page, ino);
up_read(&io->io_rwsem);
/* TODO: use HOT temp only for meta pages now. */
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, nid_t ino, pgoff_t idx,
- enum page_type type, bool force)
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, bool force)
{
enum temp_type temp;
- if (!force && !has_merged_page(sbi, inode, ino, idx, type))
+ if (!force && !has_merged_page(sbi, inode, page, ino, type))
return;
for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
}
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, nid_t ino, pgoff_t idx,
- enum page_type type)
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type)
{
- __submit_merged_write_cond(sbi, inode, ino, idx, type, false);
+ __submit_merged_write_cond(sbi, inode, page, ino, type, false);
}
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
bio_put(bio);
return -EFAULT;
}
+
+ if (fio->io_wbc && !is_read_io(fio->op))
+ wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+
bio_set_op_attrs(bio, fio->op, fio->op_flags);
- __submit_bio(fio->sbi, bio, fio->type);
+ inc_page_count(fio->sbi, is_read_io(fio->op) ?
+ __read_io_type(page): WB_DATA_TYPE(fio->page));
- if (!is_read_io(fio->op))
- inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
+ __submit_bio(fio->sbi, bio, fio->type);
return 0;
}
if (fio->in_list)
goto next;
out:
+ if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
+ f2fs_is_checkpoint_ready(sbi))
+ __submit_merged_bio(io);
up_write(&io->io_rwsem);
}
ctx->bio = bio;
ctx->enabled_steps = post_read_steps;
bio->bi_private = ctx;
-
- /* wait the page to be moved by cleaning */
- f2fs_wait_on_block_writeback(sbi, blkaddr);
}
return bio;
if (IS_ERR(bio))
return PTR_ERR(bio);
+ /* wait for GCed page writeback via META_MAPPING */
+ f2fs_wait_on_block_writeback(inode, blkaddr);
+
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
}
+ ClearPageError(page);
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
__submit_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
struct f2fs_summary sum;
struct node_info ni;
block_t old_blkaddr;
- pgoff_t fofs;
blkcnt_t count = 1;
int err;
dn->data_blkaddr = datablock_addr(dn->inode,
dn->node_page, dn->ofs_in_node);
- if (dn->data_blkaddr == NEW_ADDR)
+ if (dn->data_blkaddr != NULL_ADDR)
goto alloc;
if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
old_blkaddr, old_blkaddr);
f2fs_set_data_blkaddr(dn);
- /* update i_size */
- fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
- dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
- f2fs_i_size_write(dn->inode,
- ((loff_t)(fofs + 1) << PAGE_SHIFT));
+ /*
+ * i_size will be updated by direct_IO. Otherwise, we would read
+ * stale data from the unwritten block via dio_read.
+ */
return 0;
}
if (direct_io) {
map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
- flag = f2fs_force_buffered_io(inode, WRITE) ?
+ flag = f2fs_force_buffered_io(inode, iocb, from) ?
F2FS_GET_BLOCK_PRE_AIO :
F2FS_GET_BLOCK_PRE_DIO;
goto map_blocks;
return err;
}
-static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
map->m_flags = F2FS_MAP_MAPPED;
if (map->m_next_extent)
*map->m_next_extent = pgofs + map->m_len;
+
+ /* for hardware encryption, but to avoid potential issue in future */
+ if (flag == F2FS_GET_BLOCK_DIO)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
goto out;
}
goto sync_out;
}
- if (!is_valid_data_blkaddr(sbi, blkaddr)) {
+ if (is_valid_data_blkaddr(sbi, blkaddr)) {
+ /* use out-place-update for direct IO under LFS mode */
+ if (test_opt(sbi, LFS) && create &&
+ flag == F2FS_GET_BLOCK_DIO) {
+ err = __allocate_data_block(&dn, map->m_seg_type);
+ if (!err)
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ }
+ } else {
if (create) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
last_ofs_in_node = dn.ofs_in_node;
}
} else {
+ WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
+ flag != F2FS_GET_BLOCK_DIO);
err = __allocate_data_block(&dn,
map->m_seg_type);
if (!err)
goto next_dnode;
sync_out:
+
+ /* for hardware encryption, but to avoid potential issue in future */
+ if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
+
if (flag == F2FS_GET_BLOCK_PRECACHE) {
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DEFAULT, NULL,
+ F2FS_GET_BLOCK_DIO, NULL,
f2fs_rw_hint_to_seg_type(
inode->i_write_hint));
}
}
}
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ ClearPageError(page);
last_block_in_bio = block_nr;
goto next_page;
set_error_page:
return 0;
/* wait for GCed page writeback via META_MAPPING */
- f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr);
+ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
is_inode_flag_set(inode, FI_NEED_IPU))
return true;
+ if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+ return true;
+
return false;
}
return true;
if (S_ISDIR(inode->i_mode))
return true;
+ if (IS_NOQUOTA(inode))
+ return true;
if (f2fs_is_atomic_file(inode))
return true;
if (fio) {
return true;
if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
return true;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+ return true;
}
return false;
}
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
+ clear_cold_data(page);
goto out_writepage;
}
got_it:
out:
inode_dec_dirty_pages(inode);
- if (err)
+ if (err) {
ClearPageUptodate(page);
+ clear_cold_data(page);
+ }
if (wbc->for_reclaim) {
- f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA);
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
clear_inode_flag(inode, FI_HOT_DATA);
f2fs_remove_dirty_inode(inode);
submitted = NULL;
}
unlock_page(page);
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode))
f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- pgoff_t last_idx = ULONG_MAX;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
+ int nwritten = 0;
pagevec_init(&pvec);
done = 1;
break;
} else if (submitted) {
- last_idx = page->index;
+ nwritten++;
}
if (--wbc->nr_to_write <= 0 &&
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
- if (last_idx != ULONG_MAX)
+ if (nwritten)
f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host,
- 0, last_idx, DATA);
+ NULL, 0, DATA);
return ret;
}
{
if (!S_ISREG(inode->i_mode))
return false;
+ if (IS_NOQUOTA(inode))
+ return false;
if (wbc->sync_mode != WB_SYNC_ALL)
return true;
if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
- if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+ if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) &&
+ wbc->sync_mode == WB_SYNC_NONE &&
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
- f2fs_truncate_blocks(inode, i_size, true);
+ f2fs_truncate_blocks(inode, i_size, true, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
trace_f2fs_write_begin(inode, pos, len, flags);
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ goto fail;
+
if ((f2fs_is_atomic_file(inode) &&
!f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
if (err)
goto fail;
- if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
+ if (need_balance && !IS_NOQUOTA(inode) &&
+ has_not_enough_free_secs(sbi, 0, 0)) {
unlock_page(page);
f2fs_balance_fs(sbi, true);
lock_page(page);
f2fs_wait_on_page_writeback(page, DATA, false);
- /* wait for GCed page writeback via META_MAPPING */
- if (f2fs_post_read_required(inode))
- f2fs_wait_on_block_writeback(sbi, blkaddr);
-
if (len == PAGE_SIZE || PageUptodate(page))
return 0;
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
int rw = iov_iter_rw(iter);
int err;
enum rw_hint hint = iocb->ki_hint;
int whint_mode = F2FS_OPTION(sbi).whint_mode;
+ bool do_opu;
err = check_direct_IO(inode, iter, offset);
if (err)
return err < 0 ? err : 0;
- if (f2fs_force_buffered_io(inode, rw))
+ if (f2fs_force_buffered_io(inode, iocb, iter))
return 0;
+ do_opu = allow_outplace_dio(inode, iocb, iter);
+
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
iocb->ki_hint = WRITE_LIFE_NOT_SET;
- if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!down_read_trylock(&fi->i_gc_rwsem[rw])) {
iocb->ki_hint = hint;
err = -EAGAIN;
goto out;
}
- down_read(&F2FS_I(inode)->i_gc_rwsem[rw]);
+ if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ up_read(&fi->i_gc_rwsem[rw]);
+ iocb->ki_hint = hint;
+ err = -EAGAIN;
+ goto out;
+ }
+ } else {
+ down_read(&fi->i_gc_rwsem[rw]);
+ if (do_opu)
+ down_read(&fi->i_gc_rwsem[READ]);
}
err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
- up_read(&F2FS_I(inode)->i_gc_rwsem[rw]);
+
+ if (do_opu)
+ up_read(&fi->i_gc_rwsem[READ]);
+
+ up_read(&fi->i_gc_rwsem[rw]);
if (rw == WRITE) {
if (whint_mode == WHINT_MODE_OFF)
if (err > 0) {
f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
err);
- set_inode_flag(inode, FI_UPDATE_WRITE);
+ if (!do_opu)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
} else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
}
}
}
+ clear_cold_data(page);
+
/* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
return f2fs_drop_inmem_page(inode, page);
if (IS_ATOMIC_WRITTEN_PAGE(page))
return 0;
+ clear_cold_data(page);
set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
if (!PageUptodate(page))
SetPageUptodate(page);
- /* don't remain PG_checked flag which was set during GC */
- if (is_cold_data(page))
- clear_cold_data(page);
-
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
f2fs_register_inmem_page(inode, page);
#endif
};
- void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+ void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/dir.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
f2fs_put_page(page, 1);
clear_inode_flag(inode, FI_NEW_INODE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
up_write(&F2FS_I(inode)->i_sem);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
+ clear_cold_data(page);
inode_dec_dirty_pages(dir);
f2fs_remove_dirty_inode(dir);
}
struct f2fs_dir_entry *de = NULL;
struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
+ struct blk_plug plug;
+ bool readdir_ra = sbi->readdir_ra == 1;
+ int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
+ if (readdir_ra)
+ blk_start_plug(&plug);
+
while (bit_pos < d->max) {
bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
if (bit_pos >= d->max)
if (f2fs_encrypted_inode(d->inode)) {
int save_len = fstr->len;
- int err;
err = fscrypt_fname_disk_to_usr(d->inode,
(u32)de->hash_code, 0,
&de_name, fstr);
if (err)
- return err;
+ goto out;
de_name = *fstr;
fstr->len = save_len;
}
if (!dir_emit(ctx, de_name.name, de_name.len,
- le32_to_cpu(de->ino), d_type))
- return 1;
+ le32_to_cpu(de->ino), d_type)) {
+ err = 1;
+ goto out;
+ }
- if (sbi->readdir_ra == 1)
+ if (readdir_ra)
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
ctx->pos = start_pos + bit_pos;
}
- return 0;
+out:
+ if (readdir_ra)
+ blk_finish_plug(&plug);
+ return err;
}
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/f2fs.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef _LINUX_F2FS_H
#define _LINUX_F2FS_H
+#include <linux/uio.h>
#include <linux/types.h>
#include <linux/page-flags.h>
#include <linux/buffer_head.h>
FAULT_DIR_DEPTH,
FAULT_EVICT_INODE,
FAULT_TRUNCATE,
- FAULT_IO,
+ FAULT_READ_IO,
FAULT_CHECKPOINT,
FAULT_DISCARD,
+ FAULT_WRITE_IO,
FAULT_MAX,
};
#define F2FS_MOUNT_QUOTA 0x00400000
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
+#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
#define F2FS_FEATURE_INODE_CRTIME 0x0100
#define F2FS_FEATURE_LOST_FOUND 0x0200
#define F2FS_FEATURE_VERITY 0x0400 /* reserved */
+#define F2FS_FEATURE_SB_CHKSUM 0x0800
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
#define CP_RECOVERY 0x00000008
#define CP_DISCARD 0x00000010
#define CP_TRIMMED 0x00000020
+#define CP_PAUSE 0x00000040
#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */
#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
+#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
struct cp_control {
int reason;
META_NAT,
META_SIT,
META_SSA,
+ META_MAX,
META_POR,
DATA_GENERIC,
META_GENERIC,
atomic_t issued_discard; /* # of issued discard */
atomic_t issing_discard; /* # of issuing discards */
atomic_t discard_cmd_cnt; /* # of cached cmd count */
- struct rb_root root; /* root of discard rb-tree */
+ struct rb_root_cached root; /* root of discard rb-tree */
bool rbtree_check; /* config for consistency check */
};
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+/* maximum retry quota flush count */
+#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
struct extent_tree {
nid_t ino; /* inode number */
- struct rb_root root; /* root of extent info rb-tree */
+ struct rb_root_cached root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
struct extent_info largest; /* largest extent info */
struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
atomic_t node_cnt; /* # of extent node in rb-tree*/
+ bool largest_updated; /* largest extent updated */
};
/*
F2FS_GET_BLOCK_DEFAULT,
F2FS_GET_BLOCK_FIEMAP,
F2FS_GET_BLOCK_BMAP,
+ F2FS_GET_BLOCK_DIO,
F2FS_GET_BLOCK_PRE_DIO,
F2FS_GET_BLOCK_PRE_AIO,
F2FS_GET_BLOCK_PRECACHE,
}
extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
-static inline void __try_update_largest_extent(struct inode *inode,
- struct extent_tree *et, struct extent_node *en)
+static inline void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
{
if (en->ei.len > et->largest.len) {
et->largest = en->ei;
- f2fs_mark_inode_dirty_sync(inode, true);
+ et->largest_updated = true;
}
}
F2FS_DIRTY_IMETA,
F2FS_WB_CP_DATA,
F2FS_WB_DATA,
+ F2FS_RD_DATA,
+ F2FS_RD_NODE,
+ F2FS_RD_META,
NR_COUNT_TYPE,
};
SBI_NEED_SB_WRITE, /* need to recover superblock */
SBI_NEED_CP, /* need to checkpoint */
SBI_IS_SHUTDOWN, /* shutdown by ioctl */
+ SBI_IS_RECOVERED, /* recovered orphan/data */
+ SBI_CP_DISABLED, /* CP was disabled last mount */
+ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */
+ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
+ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
};
enum {
CP_TIME,
REQ_TIME,
+ DISCARD_TIME,
+ GC_TIME,
+ DISABLE_TIME,
MAX_TIME,
};
unsigned int total_valid_node_count; /* valid node block count */
loff_t max_file_blocks; /* max block index of file */
int dir_level; /* directory level */
- unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */
int readdir_ra; /* readahead inode in readdir */
block_t user_block_count; /* # of user blocks */
block_t reserved_blocks; /* configurable reserved blocks */
block_t current_reserved_blocks; /* current reserved blocks */
+ /* Additional tracking for no checkpoint mode */
+ block_t unusable_block_count; /* # of blocks saved by last cp */
+
unsigned int nquota_files; /* # of quota sysfile */
u32 s_next_generation; /* for NFS support */
*/
#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
+ atomic_t meta_count[META_MAX]; /* # of meta blocks */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
atomic_t max_aw_cnt; /* max # of atomic writes */
atomic_t max_vw_cnt; /* max # of volatile writes */
int bg_gc; /* background gc calls */
+ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
+ unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
spinlock_t stat_lock; /* lock for stat operations */
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(type) \
- printk("%sF2FS-fs : inject %s in %s of %pF\n", \
- KERN_INFO, f2fs_fault_name[type], \
+#define f2fs_show_injection_info(type) \
+ printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \
+ KERN_INFO, f2fs_fault_name[type], \
__func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
- sbi->last_time[type] = jiffies;
+ unsigned long now = jiffies;
+
+ sbi->last_time[type] = now;
+
+ /* DISCARD_TIME and GC_TIME are based on REQ_TIME */
+ if (type == REQ_TIME) {
+ sbi->last_time[DISCARD_TIME] = now;
+ sbi->last_time[GC_TIME] = now;
+ }
}
static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
return time_after(jiffies, sbi->last_time[type] + interval);
}
-static inline bool is_idle(struct f2fs_sb_info *sbi)
+static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi,
+ int type)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
+ unsigned long interval = sbi->interval_time[type] * HZ;
+ unsigned int wait_ms = 0;
+ long delta;
- if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
- return false;
+ delta = (sbi->last_time[type] + interval) - jiffies;
+ if (delta > 0)
+ wait_ms = jiffies_to_msecs(delta);
- return f2fs_time_over(sbi, REQ_TIME);
+ return wait_ms;
}
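
The helper above only reports how long a caller should still wait; a hypothetical background worker (not part of this patch) could turn that into a bounded sleep. The arithmetic is simple enough to show as a standalone userspace sketch, with made-up names (time_to_wait_ms, last_time_ms) standing in for the jiffies-based fields:

/*
 * Userspace analogue of the f2fs_time_to_wait() arithmetic above:
 * given the last time an event ran and its minimum interval, return
 * how many milliseconds a worker should still wait before running it.
 */
#include <stdio.h>

static unsigned int time_to_wait_ms(long now_ms, long last_time_ms,
                                    long interval_ms)
{
        long delta = (last_time_ms + interval_ms) - now_ms;

        return delta > 0 ? (unsigned int)delta : 0;
}

int main(void)
{
        /* Last discard ran at t=1000ms; policy allows one every 5000ms. */
        printf("wait %u ms\n", time_to_wait_ms(3000, 1000, 5000)); /* 3000 */
        printf("wait %u ms\n", time_to_wait_ms(9000, 1000, 5000)); /* 0 */
        return 0;
}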
/*
if (!__allow_reserved_blocks(sbi, inode, true))
avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
-
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ avail_user_block_count -= sbi->unusable_block_count;
if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
diff = sbi->total_valid_block_count - avail_user_block_count;
if (diff > *count)
atomic_inc(&sbi->nr_pages[count_type]);
if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
- count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
+ count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA ||
+ count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE ||
+ count_type == F2FS_RD_META)
return;
set_sbi_flag(sbi, SBI_IS_DIRTY);
{
block_t valid_block_count;
unsigned int valid_node_count;
- bool quota = inode && !is_inode;
+ int err;
- if (quota) {
- int ret = dquot_reserve_block(inode, 1);
- if (ret)
- return ret;
+ if (is_inode) {
+ if (inode) {
+ err = dquot_alloc_inode(inode);
+ if (err)
+ return err;
+ }
+ } else {
+ err = dquot_reserve_block(inode, 1);
+ if (err)
+ return err;
}
if (time_to_inject(sbi, FAULT_BLOCK)) {
if (!__allow_reserved_blocks(sbi, inode, false))
valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ valid_block_count += sbi->unusable_block_count;
if (unlikely(valid_block_count > sbi->user_block_count)) {
spin_unlock(&sbi->stat_lock);
return 0;
enospc:
- if (quota)
+ if (is_inode) {
+ if (inode)
+ dquot_free_inode(inode);
+ } else {
dquot_release_reservation_block(inode, 1);
+ }
return -ENOSPC;
}
spin_unlock(&sbi->stat_lock);
- if (!is_inode)
+ if (is_inode)
+ dquot_free_inode(inode);
+ else
f2fs_i_blocks_write(inode, 1, false, true);
}
return bio_alloc(GFP_KERNEL, npages);
}
+static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+{
+ if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
+ get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
+ get_pages(sbi, F2FS_WB_CP_DATA))
+ return false;
+ return f2fs_time_over(sbi, type);
+}
+
static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
unsigned long index, void *item)
{
*/
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
+ bool buf_write);
int f2fs_truncate(struct inode *inode);
int f2fs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
int f2fs_precache_extents(struct inode *inode);
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
int f2fs_pin_file_control(struct inode *inode, bool inc);
/*
int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
+int f2fs_quota_sync(struct super_block *sb, int type);
void f2fs_quota_off_umount(struct super_block *sb);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
struct page *f2fs_get_node_page_ra(struct page *parent, int start);
-void f2fs_move_node_page(struct page *node_page, int gc_type);
+int f2fs_move_node_page(struct page *node_page, int gc_type);
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
unsigned int *seq_id);
int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
-void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_node_manager_caches(void);
bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
+void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
+int f2fs_disable_cp_again(struct f2fs_sb_info *sbi);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
struct f2fs_io_info *fio, bool add_list);
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered);
-void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
+ block_t len);
void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
void f2fs_destroy_post_read_processing(void);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, nid_t ino, pgoff_t idx,
- enum page_type type);
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type);
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
int f2fs_submit_page_bio(struct f2fs_io_info *fio);
void f2fs_submit_page_write(struct f2fs_io_info *fio);
struct page *f2fs_get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size);
int f2fs_do_write_data_page(struct f2fs_io_info *fio);
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct page *page, enum migrate_mode mode);
#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
- void f2fs_clear_radix_tree_dirty_tag(struct page *page);
+ void f2fs_clear_page_cache_dirty_tag(struct page *page);
/*
* gc.c
int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_rd_data, nr_rd_node, nr_rd_meta;
+ unsigned int io_skip_bggc, other_skip_bggc;
int nr_flushing, nr_flushed, flush_list_empty;
int nr_discarding, nr_discarded;
int nr_discard_cmd;
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
+ unsigned int meta_count[META_MAX];
unsigned int segment_count[2];
unsigned int block_count[2];
unsigned int inplace_count;
#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
+#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++)
+#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
if (f2fs_has_inline_dentry(inode)) \
(atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \
} while (0)
+#define stat_inc_meta_count(sbi, blkaddr) \
+ do { \
+ if (blkaddr < SIT_I(sbi)->sit_base_addr) \
+ atomic_inc(&(sbi)->meta_count[META_CP]); \
+ else if (blkaddr < NM_I(sbi)->nat_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_SIT]); \
+ else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_NAT]); \
+ else if (blkaddr < SM_I(sbi)->main_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_SSA]); \
+ } while (0)
#define stat_inc_seg_type(sbi, curseg) \
((sbi)->segment_count[(curseg)->alloc_type]++)
#define stat_inc_block_count(sbi, curseg) \
#define stat_inc_bg_cp_count(si) do { } while (0)
#define stat_inc_call_count(si) do { } while (0)
#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_io_skip_bggc_count(sbi) do { } while (0)
+#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
#define stat_dec_dirty_inode(sbi, type) do { } while (0)
#define stat_inc_total_hit(sb) do { } while (0)
#define stat_inc_volatile_write(inode) do { } while (0)
#define stat_dec_volatile_write(inode) do { } while (0)
#define stat_update_max_volatile_write(inode) do { } while (0)
+#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
#define stat_inc_block_count(sbi, curseg) do { } while (0)
#define stat_inc_inplace_blocks(sbi) do { } while (0)
/*
* extent_cache.c
*/
-struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs);
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
- struct rb_root *root, struct rb_node **parent,
- unsigned int ofs);
-struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_root_cached *root,
+ struct rb_node **parent,
+ unsigned int ofs, bool *leftmost);
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs,
struct rb_entry **prev_entry, struct rb_entry **next_entry,
struct rb_node ***insert_p, struct rb_node **insert_parent,
- bool force);
+ bool force, bool *leftmost);
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
- struct rb_root *root);
+ struct rb_root_cached *root);
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
void f2fs_drop_extent_tree(struct inode *inode);
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
file_set_encrypt(inode);
- inode->i_flags |= S_ENCRYPTED;
+ f2fs_set_inode_flags(inode);
#endif
}
F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
+F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
#ifdef CONFIG_BLK_DEV_ZONED
static inline int get_blkz_type(struct f2fs_sb_info *sbi,
}
#endif
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
{
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+ return f2fs_sb_has_blkzoned(sbi->sb);
+}
- return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb);
+static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
+{
+ return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev));
+}
+
+static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi)
+{
+ return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) ||
+ f2fs_hw_should_discard(sbi);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
#endif
}
-static inline bool f2fs_force_buffered_io(struct inode *inode, int rw)
+static inline int block_unaligned_IO(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
{
- return (f2fs_post_read_required(inode) ||
- (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
- F2FS_I_SB(inode)->s_ndevs);
+ unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
+ unsigned int blocksize_mask = (1 << i_blkbits) - 1;
+ loff_t offset = iocb->ki_pos;
+ unsigned long align = offset | iov_iter_alignment(iter);
+
+ return align & blocksize_mask;
+}
+
+static inline int allow_outplace_dio(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int rw = iov_iter_rw(iter);
+
+ return (test_opt(sbi, LFS) && (rw == WRITE) &&
+ !block_unaligned_IO(inode, iocb, iter));
+}
+
+static inline bool f2fs_force_buffered_io(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int rw = iov_iter_rw(iter);
+
+ if (f2fs_post_read_required(inode))
+ return true;
+ if (sbi->s_ndevs)
+ return true;
+ /*
+ * for zoned block devices, fall back from direct IO to buffered IO,
+ * so that all IO can be serialized by the log-structured write path.
+ */
+ if (f2fs_sb_has_blkzoned(sbi->sb))
+ return true;
+ if (test_opt(sbi, LFS) && (rw == WRITE) &&
+ block_unaligned_IO(inode, iocb, iter))
+ return true;
+ if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
+ return true;
+
+ return false;
}
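
The unaligned-IO test in block_unaligned_IO() above relies on a small bit trick: OR the file offset with the iterator's address/length alignment and mask with (blocksize - 1); the result is non-zero if any of them is not a multiple of the block size. A minimal, runnable userspace sketch with invented sample values (the 4KB block size and the helper name are illustrative only):

/*
 * Standalone illustration of the alignment test used by
 * block_unaligned_IO() above. Returns 1 when the IO must be treated
 * as unaligned, 0 when offset and buffer are both block-aligned.
 */
#include <stdio.h>

static int unaligned(unsigned long offset, unsigned long buf_align,
                     unsigned int blkbits)
{
        unsigned long blocksize_mask = (1UL << blkbits) - 1;

        return (offset | buf_align) & blocksize_mask ? 1 : 0;
}

int main(void)
{
        /* 4KB blocks (blkbits = 12) */
        printf("%d\n", unaligned(8192, 4096, 12)); /* 0: fully aligned */
        printf("%d\n", unaligned(8192, 512, 12));  /* 1: buffer unaligned */
        printf("%d\n", unaligned(100, 4096, 12));  /* 1: offset unaligned */
        return 0;
}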
#ifdef CONFIG_F2FS_FAULT_INJECTION
#endif
#endif
+
+static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
+{
+#ifdef CONFIG_QUOTA
+ if (f2fs_sb_has_quota_ino(sbi->sb))
+ return true;
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+ return true;
+#endif
+ return false;
+}
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/inline.c
* Copyright (c) 2013, Intel Corporation
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- if (f2fs_truncate_blocks(inode, 0, false))
+ if (f2fs_truncate_blocks(inode, 0, false, false))
return false;
goto process_inline;
}
return 0;
punch_dentry_pages:
truncate_inode_pages(&dir->i_data, 0);
- f2fs_truncate_blocks(dir, 0, false);
+ f2fs_truncate_blocks(dir, 0, false, false);
f2fs_remove_dirty_inode(dir);
return err;
}
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/node.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
static void clear_node_page_dirty(struct page *page)
{
if (PageDirty(page)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
}
/* get current nat block page with lock */
src_page = get_current_nat_page(sbi, nid);
+ if (IS_ERR(src_page))
+ return src_page;
dst_page = f2fs_grab_meta_page(sbi, dst_off);
f2fs_bug_on(sbi, PageDirty(src_page));
if (f2fs_check_nid_range(sbi, nid))
return;
- rcu_read_lock();
- apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
- rcu_read_unlock();
+ apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
if (apage)
return;
}
if (__is_valid_data_blkaddr(ni.blk_addr) &&
- !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC))
+ !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) {
+ up_read(&sbi->node_write);
goto redirty_out;
+ }
if (atomic && !test_opt(sbi, NOBARRIER))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
up_read(&sbi->node_write);
if (wbc->for_reclaim) {
- f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0,
- page->index, NODE);
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
submitted = NULL;
}
return AOP_WRITEPAGE_ACTIVATE;
}
-void f2fs_move_node_page(struct page *node_page, int gc_type)
+int f2fs_move_node_page(struct page *node_page, int gc_type)
{
+ int err = 0;
+
if (gc_type == FG_GC) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
f2fs_wait_on_page_writeback(node_page, NODE, true);
f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
- if (!clear_page_dirty_for_io(node_page))
+ if (!clear_page_dirty_for_io(node_page)) {
+ err = -EAGAIN;
goto out_page;
+ }
if (__write_node_page(node_page, false, NULL,
- &wbc, false, FS_GC_NODE_IO, NULL))
+ &wbc, false, FS_GC_NODE_IO, NULL)) {
+ err = -EAGAIN;
unlock_page(node_page);
+ }
goto release_page;
} else {
/* set page dirty and write it */
unlock_page(node_page);
release_page:
f2fs_put_page(node_page, 0);
+ return err;
}
static int f2fs_write_node_page(struct page *page,
unsigned int *seq_id)
{
pgoff_t index;
- pgoff_t last_idx = ULONG_MAX;
struct pagevec pvec;
int ret = 0;
struct page *last_page = NULL;
bool marked = false;
nid_t ino = inode->i_ino;
int nr_pages;
+ int nwritten = 0;
if (atomic) {
last_page = last_fsync_dnode(sbi, ino);
f2fs_put_page(last_page, 0);
break;
} else if (submitted) {
- last_idx = page->index;
+ nwritten++;
}
if (page == last_page) {
goto retry;
}
out:
- if (last_idx != ULONG_MAX)
- f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE);
+ if (nwritten)
+ f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
return ret ? -EIO: 0;
}
nm_i->nat_block_bitmap)) {
struct page *page = get_current_nat_page(sbi, nid);
- ret = scan_nat_page(sbi, page, nid);
- f2fs_put_page(page, 1);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ } else {
+ ret = scan_nat_page(sbi, page, nid);
+ f2fs_put_page(page, 1);
+ }
if (ret) {
up_read(&nm_i->nat_tree_lock);
f2fs_bug_on(sbi, !mount);
f2fs_msg(sbi->sb, KERN_ERR,
"NAT is corrupt, run fsck to fix it");
- return -EINVAL;
+ return ret;
}
}
spin_unlock(&nm_i->nid_list_lock);
/* Let's scan nat pages and its caches to get free nids */
- f2fs_build_free_nids(sbi, true, false);
- goto retry;
+ if (!f2fs_build_free_nids(sbi, true, false))
+ goto retry;
+ return false;
}
/*
if (!PageUptodate(ipage))
SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
- set_cold_node(page, false);
+ set_cold_node(ipage, false);
src = F2FS_INODE(page);
dst = F2FS_INODE(ipage);
F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
i_projid))
dst->i_projid = src->i_projid;
+
+ if (f2fs_sb_has_inode_crtime(sbi->sb) &&
+ F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+ i_crtime_nsec)) {
+ dst->i_crtime = src->i_crtime;
+ dst->i_crtime_nsec = src->i_crtime_nsec;
+ }
}
new_ni = old_ni;
__clear_bit_le(nat_index, nm_i->full_nat_bits);
}
-static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
struct nat_entry_set *set, struct cp_control *cpc)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
down_write(&curseg->journal_rwsem);
} else {
page = get_next_nat_page(sbi, start_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
nat_blk = page_address(page);
f2fs_bug_on(sbi, !nat_blk);
}
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
kmem_cache_free(nat_entry_set_slab, set);
}
+ return 0;
}
/*
* This function is called during the checkpointing process.
*/
-void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
unsigned int found;
nid_t set_idx = 0;
LIST_HEAD(sets);
+ int err = 0;
/* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
if (enabled_nat_bits(sbi, cpc)) {
}
if (!nm_i->dirty_nat_cnt)
- return;
+ return 0;
down_write(&nm_i->nat_tree_lock);
}
/* flush dirty nats in nat entry set */
- list_for_each_entry_safe(set, tmp, &sets, set_list)
- __flush_nat_entry_set(sbi, set, cpc);
+ list_for_each_entry_safe(set, tmp, &sets, set_list) {
+ err = __flush_nat_entry_set(sbi, set, cpc);
+ if (err)
+ break;
+ }
up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
+
+ return err;
}
static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
struct page *page;
page = f2fs_get_meta_page(sbi, nat_bits_addr++);
- if (IS_ERR(page)) {
- disable_nat_bits(sbi, true);
+ if (IS_ERR(page))
return PTR_ERR(page);
- }
memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
page_address(page), F2FS_BLKSIZE);
if (!page)
return;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
mss->swap += PAGE_SIZE;
else
put_page(page);
smaps_walk.private = mss;
#ifdef CONFIG_SHMEM
+ /* In case of smaps_rollup, reset the value from the previous vma */
+ mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
!(vma->vm_flags & VM_WRITE)) {
- mss->swap = shmem_swapped;
+ mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
smaps_walk.pte_hole = smaps_pte_hole;
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+ /**
+ * struct address_space - Contents of a cacheable, mappable object.
+ * @host: Owner, either the inode or the block_device.
+ * @i_pages: Cached pages.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+ * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+ * @nrpages: Number of page entries, protected by the i_pages lock.
+ * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
+ * @writeback_index: Writeback starts here.
+ * @a_ops: Methods.
+ * @flags: Error bits and flags (AS_*).
+ * @wb_err: The most recent error which has occurred.
+ * @private_lock: For use by the owner of the address_space.
+ * @private_list: For use by the owner of the address_space.
+ * @private_data: For use by the owner of the address_space.
+ */
struct address_space {
- struct inode *host; /* owner: inode, block_device */
- struct radix_tree_root i_pages; /* cached pages */
- atomic_t i_mmap_writable;/* count VM_SHARED mappings */
- struct rb_root_cached i_mmap; /* tree of private and shared mappings */
- struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
- /* Protected by the i_pages lock */
- unsigned long nrpages; /* number of total pages */
- /* number of shadow or DAX exceptional entries */
+ struct inode *host;
+ struct xarray i_pages;
+ gfp_t gfp_mask;
+ atomic_t i_mmap_writable;
+ struct rb_root_cached i_mmap;
+ struct rw_semaphore i_mmap_rwsem;
+ unsigned long nrpages;
unsigned long nrexceptional;
- pgoff_t writeback_index;/* writeback starts here */
- const struct address_space_operations *a_ops; /* methods */
- unsigned long flags; /* error bits */
- spinlock_t private_lock; /* for use by the address_space */
- gfp_t gfp_mask; /* implicit gfp mask for allocations */
- struct list_head private_list; /* for use by the address_space */
- void *private_data; /* ditto */
+ pgoff_t writeback_index;
+ const struct address_space_operations *a_ops;
+ unsigned long flags;
errseq_t wb_err;
+ spinlock_t private_lock;
+ struct list_head private_list;
+ void *private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
struct mutex bd_fsfreeze_mutex;
} __randomize_layout;
+ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */
+ #define PAGECACHE_TAG_DIRTY XA_MARK_0
+ #define PAGECACHE_TAG_WRITEBACK XA_MARK_1
+ #define PAGECACHE_TAG_TOWRITE XA_MARK_2
+
/*
- * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
- * radix trees
+ * Returns true if any of the pages in the mapping are marked with the tag.
*/
- #define PAGECACHE_TAG_DIRTY 0
- #define PAGECACHE_TAG_WRITEBACK 1
- #define PAGECACHE_TAG_TOWRITE 2
-
- int mapping_tagged(struct address_space *mapping, int tag);
+ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
+ {
+ return xa_marked(&mapping->i_pages, tag);
+ }
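
The PAGECACHE_TAG_* values are now plain XA_MARK_* aliases, and mapping_tagged() is just xa_marked() on i_pages. A toy-module sketch of the underlying XArray mark API (not derived from this patch; the array name and stored pointer value are invented, and error handling is omitted):

/*
 * Toy kernel module sketching the XArray mark API that the
 * PAGECACHE_TAG_* aliases above map onto.
 */
#include <linux/module.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_marks);

static int __init xa_mark_demo_init(void)
{
        void *entry = (void *)0x1234;   /* any non-NULL, 4-byte-aligned pointer */

        /* Store an entry at index 5 and mark it, like tagging a dirty page. */
        xa_store(&demo_marks, 5, entry, GFP_KERNEL);
        xa_set_mark(&demo_marks, 5, XA_MARK_0);

        /* Same question the new mapping_tagged() asks of i_pages. */
        pr_info("any marked entries? %d\n", xa_marked(&demo_marks, XA_MARK_0));

        return 0;
}

static void __exit xa_mark_demo_exit(void)
{
        xa_destroy(&demo_marks);
}

module_init(xa_mark_demo_init);
module_exit(xa_mark_demo_exit);
MODULE_LICENSE("GPL");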
static inline void i_mmap_lock_write(struct address_space *mapping)
{
extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
struct inode *inode_out, loff_t pos_out,
u64 *len, bool is_dedupe);
+extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len);
extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len);
+ struct file *file_out, loff_t pos_out, u64 len);
extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same);
return;
__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}
-
-static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- u64 len)
-{
- int ret;
-
- file_start_write(file_out);
- ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
- file_end_write(file_out);
-
- return ret;
-}
/*
* get_write_access() gets write permission for a file.
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
SWP_BLKDEV = (1 << 6), /* its a block device */
- SWP_FILE = (1 << 7), /* set after swap_activate success */
- SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */
- SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
- SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
- SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
+ SWP_FS = (1 << 8), /* swap file goes through fs */
+ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
+ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
+ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
+ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
/* add others here before... */
- SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
/* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page);
-bool workingset_refault(void *shadow);
+void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
- /* Do not use directly, use workingset_lookup_update */
- void workingset_update_node(struct radix_tree_node *node);
-
- /* Returns workingset_update_node() if the mapping has shadow entries. */
- #define workingset_lookup_update(mapping) \
- ({ \
- radix_tree_update_node_t __helper = workingset_update_node; \
- if (dax_mapping(mapping) || shmem_mapping(mapping)) \
- __helper = NULL; \
- __helper; \
- })
+ /* Only track the nodes of mappings with shadow entries */
+ void workingset_update_node(struct xa_node *node);
+ #define mapping_set_update(xas, mapping) do { \
+ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) \
+ xas_set_update(xas, workingset_update_node); \
+ } while (0)
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern int add_to_swap(struct page *page);
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
- extern void __delete_from_swap_cache(struct page *);
+ extern void __delete_from_swap_cache(struct page *, swp_entry_t entry);
extern void delete_from_swap_cache(struct page *);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
return -1;
}
- static inline void __delete_from_swap_cache(struct page *page)
+ static inline void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry)
{
}
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
- #include <linux/radix-tree.h>
#include <linux/device.h>
- #include <linux/types.h>
- #include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/kasan.h>
- #include <linux/mm.h>
#include <linux/memory_hotplug.h>
+ #include <linux/mm.h>
+ #include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+ #include <linux/types.h>
#include <linux/wait_bit.h>
+ #include <linux/xarray.h>
- static DEFINE_MUTEX(pgmap_lock);
- static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+ static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
- static unsigned long order_at(struct resource *res, unsigned long pgoff)
- {
- unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
- unsigned long nr_pages, mask;
-
- nr_pages = PHYS_PFN(resource_size(res));
- if (nr_pages == pgoff)
- return ULONG_MAX;
-
- /*
- * What is the largest aligned power-of-2 range available from
- * this resource pgoff to the end of the resource range,
- * considering the alignment of the current pgoff?
- */
- mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
- if (!mask)
- return ULONG_MAX;
-
- return find_first_bit(&mask, BITS_PER_LONG);
- }
-
- #define foreach_order_pgoff(res, order, pgoff) \
- for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
- pgoff += 1UL << order, order = order_at((res), pgoff))
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
- static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
+ static void pgmap_array_delete(struct resource *res)
{
- unsigned long pgoff, order;
-
- mutex_lock(&pgmap_lock);
- foreach_order_pgoff(res, order, pgoff) {
- if (pgoff >= end_pgoff)
- break;
- radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
- }
- mutex_unlock(&pgmap_lock);
-
+ xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ NULL, GFP_KERNEL);
synchronize_rcu();
}
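
pgmap_array_delete() leans on xa_store_range(): a single entry can cover a whole range of indices, and storing NULL over the same range removes it again. A toy-module sketch of those semantics, assuming CONFIG_XARRAY_MULTI is enabled (names and values are illustrative, error handling omitted):

/*
 * Toy kernel module showing the multi-index store/delete pattern used
 * by devm_memremap_pages() and pgmap_array_delete() above.
 */
#include <linux/module.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(pgmap_demo);

static int __init xa_range_demo_init(void)
{
        void *pgmap_like = (void *)0x1000;      /* stands in for a dev_pagemap */

        /* Cover "PFNs" 100..199 with one entry. */
        xa_store_range(&pgmap_demo, 100, 199, pgmap_like, GFP_KERNEL);
        pr_info("lookup at 150: %p\n", xa_load(&pgmap_demo, 150));

        /* Storing NULL over the same range removes it. */
        xa_store_range(&pgmap_demo, 100, 199, NULL, GFP_KERNEL);
        pr_info("lookup at 150 after delete: %p\n", xa_load(&pgmap_demo, 150));
        return 0;
}

static void __exit xa_range_demo_exit(void)
{
        xa_destroy(&pgmap_demo);
}

module_init(xa_range_demo_init);
module_exit(xa_range_demo_exit);
MODULE_LICENSE("GPL");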
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- pgmap_radix_release(res, -1);
+ pgmap_array_delete(res);
dev_WARN_ONCE(dev, pgmap->altmap.alloc,
"%s: failed to free all reserved pages\n", __func__);
}
struct vmem_altmap *altmap = pgmap->altmap_valid ?
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
- unsigned long pfn;
+ struct dev_pagemap *conflict_pgmap;
pgprot_t pgprot = PAGE_KERNEL;
- unsigned long pgoff, order;
int error, nid, is_ram;
- struct dev_pagemap *conflict_pgmap;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
pgmap->dev = dev;
- mutex_lock(&pgmap_lock);
- error = 0;
-
- foreach_order_pgoff(res, order, pgoff) {
- error = __radix_tree_insert(&pgmap_radix,
- PHYS_PFN(res->start) + pgoff, order, pgmap);
- if (error) {
- dev_err(dev, "%s: failed: %d\n", __func__, error);
- break;
- }
- }
- mutex_unlock(&pgmap_lock);
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
+ PHYS_PFN(res->end), pgmap, GFP_KERNEL));
if (error)
- goto err_radix;
+ goto err_array;
nid = dev_to_node(dev);
if (nid < 0)
if (error)
goto err_add_memory;
- for_each_device_pfn(pfn, pgmap) {
- struct page *page = pfn_to_page(pfn);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer. It is a bug if a ZONE_DEVICE page is ever
- * freed or placed on a driver-private list. Seed the
- * storage with LIST_POISON* values.
- */
- list_del(&page->lru);
- page->pgmap = pgmap;
- percpu_ref_get(pgmap->ref);
- }
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
devm_add_action(dev, devm_memremap_pages_release, pgmap);
err_kasan:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
- err_radix:
- pgmap_radix_release(res, pgoff);
+ pgmap_array_delete(res);
+ err_array:
return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
/* fall back to slow path lookup */
rcu_read_lock();
- pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
+ pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
for more information.
- config RADIX_TREE_MULTIORDER
+ config XARRAY_MULTI
bool
+ help
+ Support entries which occupy multiple consecutive indices in the
+ XArray.
config ASSOCIATIVE_ARRAY
bool
config GENERIC_LIB_UCMPDI2
bool
+
+config GENERIC_LIB_UMODDI3
+ bool
bool
depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
select STACKTRACE
- select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !X86
+ select FRAME_POINTER if !MIPS && !PPC && !ARM && !S390 && !MICROBLAZE && !ARC && !X86
select KALLSYMS
select KALLSYMS_ALL
depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
depends on !X86_64
select STACKTRACE
- select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86
+ select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM && !ARC && !X86
help
Provide stacktrace filter for fault-injection capabilities
depends on DEBUG_KERNEL
depends on STACKTRACE_SUPPORT
depends on PROC_FS
- select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86
+ select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM && !ARC && !X86
select KALLSYMS
select KALLSYMS_ALL
select STACKTRACE
config TEST_UUID
tristate "Test functions located in the uuid module at runtime"
+ config TEST_XARRAY
+ tristate "Test the XArray code at runtime"
+
config TEST_OVERFLOW
tristate "Test check_*_overflow() functions at runtime"
If unsure, say N.
+config TEST_MEMCAT_P
+ tristate "Test memcat_p() helper function"
+ help
+ Test the memcat_p() helper for correctly merging two
+ pointer arrays together.
+
+ If unsure, say N.
+
endif # RUNTIME_TESTING_MENU
config MEMTEST
KCOV_INSTRUMENT_dynamic_debug.o := n
lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o timerqueue.o\
+ rbtree.o radix-tree.o timerqueue.o xarray.o \
idr.o int_sqrt.o extable.o \
sha1.o chacha20.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
- nmi_backtrace.o nodemask.o win_minmax.o
+ nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o
lib-$(CONFIG_PRINTK) += dump_stack.o
lib-$(CONFIG_MMU) += ioremap.o
obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o
obj-$(CONFIG_TEST_UUID) += test_uuid.o
+ obj-$(CONFIG_TEST_XARRAY) += test_xarray.o
obj-$(CONFIG_TEST_PARMAN) += test_parman.o
obj-$(CONFIG_TEST_KMOD) += test_kmod.o
obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
+obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
obj-$(CONFIG_BCH) += bch.o
-CFLAGS_bch.o := $(call cc-option,-Wframe-larger-than=4500)
obj-$(CONFIG_LZO_COMPRESS) += lzo/
obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
obj-$(CONFIG_LZ4_COMPRESS) += lz4/
obj-$(CONFIG_GENERIC_LIB_MULDI3) += muldi3.o
obj-$(CONFIG_GENERIC_LIB_CMPDI2) += cmpdi2.o
obj-$(CONFIG_GENERIC_LIB_UCMPDI2) += ucmpdi2.o
+obj-$(CONFIG_GENERIC_LIB_UMODDI3) += umoddi3.o udivmoddi4.o
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
+#include <linux/delayacct.h>
+#include <linux/psi.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
- static int page_cache_tree_insert(struct address_space *mapping,
- struct page *page, void **shadowp)
- {
- struct radix_tree_node *node;
- void **slot;
- int error;
-
- error = __radix_tree_create(&mapping->i_pages, page->index, 0,
- &node, &slot);
- if (error)
- return error;
- if (*slot) {
- void *p;
-
- p = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (!radix_tree_exceptional_entry(p))
- return -EEXIST;
-
- mapping->nrexceptional--;
- if (shadowp)
- *shadowp = p;
- }
- __radix_tree_replace(&mapping->i_pages, node, slot, page,
- workingset_lookup_update(mapping));
- mapping->nrpages++;
- return 0;
- }
-
- static void page_cache_tree_delete(struct address_space *mapping,
+ static void page_cache_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- int i, nr;
+ XA_STATE(xas, &mapping->i_pages, page->index);
+ unsigned int nr = 1;
+
+ mapping_set_update(&xas, mapping);
- /* hugetlb pages are represented by one entry in the radix tree */
- nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ /* hugetlb pages are represented by a single entry in the xarray */
+ if (!PageHuge(page)) {
+ xas_set_order(&xas, page->index, compound_order(page));
+ nr = 1U << compound_order(page);
+ }
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(nr != 1 && shadow, page);
- for (i = 0; i < nr; i++) {
- struct radix_tree_node *node;
- void **slot;
-
- __radix_tree_lookup(&mapping->i_pages, page->index + i,
- &node, &slot);
-
- VM_BUG_ON_PAGE(!node && nr != 1, page);
-
- radix_tree_clear_tags(&mapping->i_pages, node, slot);
- __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
- workingset_lookup_update(mapping));
- }
+ xas_store(&xas, shadow);
+ xas_init_marks(&xas);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
trace_mm_filemap_delete_from_page_cache(page);
unaccount_page_cache_page(mapping, page);
- page_cache_tree_delete(mapping, page, shadow);
+ page_cache_delete(mapping, page, shadow);
}
static void page_cache_free_page(struct address_space *mapping,
EXPORT_SYMBOL(delete_from_page_cache);
/*
- * page_cache_tree_delete_batch - delete several pages from page cache
+ * page_cache_delete_batch - delete several pages from page cache
* @mapping: the mapping to which pages belong
* @pvec: pagevec with pages to delete
*
*
* The function expects the i_pages lock to be held.
*/
- static void
- page_cache_tree_delete_batch(struct address_space *mapping,
+ static void page_cache_delete_batch(struct address_space *mapping,
struct pagevec *pvec)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
int i = 0, tail_pages = 0;
struct page *page;
- pgoff_t start;
- start = pvec->pages[0]->index;
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
+ mapping_set_update(&xas, mapping);
+ xas_for_each(&xas, page, ULONG_MAX) {
if (i >= pagevec_count(pvec) && !tail_pages)
break;
- page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
continue;
if (!tail_pages) {
/*
* have our pages locked so they are protected from
* being removed.
*/
- if (page != pvec->pages[i])
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index >
+ pvec->pages[i]->index, page);
continue;
+ }
WARN_ON_ONCE(!PageLocked(page));
if (PageTransHuge(page) && !PageHuge(page))
tail_pages = HPAGE_PMD_NR - 1;
*/
i++;
} else {
+ VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
+ != pvec->pages[i]->index, page);
tail_pages--;
}
- radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
- __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
- workingset_lookup_update(mapping));
+ xas_store(&xas, NULL);
total_pages++;
}
mapping->nrpages -= total_pages;
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
- page_cache_tree_delete_batch(mapping, pvec);
+ page_cache_delete_batch(mapping, pvec);
xa_unlock_irqrestore(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++)
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- pgoff_t index = start_byte >> PAGE_SHIFT;
- pgoff_t end = end_byte >> PAGE_SHIFT;
struct page *page;
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
if (end_byte < start_byte)
return false;
- if (mapping->nrpages == 0)
- return false;
+ rcu_read_lock();
+ for (;;) {
+ page = xas_find(&xas, max);
+ if (xas_retry(&xas, page))
+ continue;
+ /* Shadow entries don't count */
+ if (xa_is_value(page))
+ continue;
+ /*
+ * We don't need to try to pin this page; we're about to
+ * release the RCU lock anyway. It is enough to know that
+ * there was a page here recently.
+ */
+ break;
+ }
+ rcu_read_unlock();
- if (!find_get_pages_range(mapping, &index, end, 1, &page))
- return false;
- put_page(page);
- return true;
+ return page != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
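
filemap_range_has_page() now walks i_pages with xas_find() under rcu_read_lock(), retrying on xas_retry() and skipping value (shadow) entries. A self-contained toy-module sketch of the same walk on a private XArray (array name, indices and stored values are invented):

/*
 * Toy kernel module mirroring the xas_find() walk above on a private
 * XArray instead of a real page cache mapping.
 */
#include <linux/module.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(scan_demo);

static bool range_has_entry(unsigned long start, unsigned long max)
{
        XA_STATE(xas, &scan_demo, start);
        void *entry;

        rcu_read_lock();
        for (;;) {
                entry = xas_find(&xas, max);
                if (xas_retry(&xas, entry))     /* raced with a tree change */
                        continue;
                if (xa_is_value(entry))         /* skip value ("shadow") entries */
                        continue;
                break;  /* NULL (nothing left) or a real pointer entry */
        }
        rcu_read_unlock();

        return entry != NULL;
}

static int __init scan_demo_init(void)
{
        xa_store(&scan_demo, 3, xa_mk_value(42), GFP_KERNEL);   /* value entry */
        xa_store(&scan_demo, 7, (void *)0x1000, GFP_KERNEL);    /* pointer entry */

        pr_info("0..5 has entry? %d\n", range_has_entry(0, 5));   /* expect 0 */
        pr_info("0..10 has entry? %d\n", range_has_entry(0, 10)); /* expect 1 */
        return 0;
}

static void __exit scan_demo_exit(void)
{
        xa_destroy(&scan_demo);
}

module_init(scan_demo_init);
module_exit(scan_demo_exit);
MODULE_LICENSE("GPL");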
* locked. This function does not add the new page to the LRU, the
* caller must do that.
*
- * The remove + add is atomic. The only way this function can fail is
- * memory allocation failure.
+ * The remove + add is atomic. This function cannot fail.
*/
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
- int error;
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *) = mapping->a_ops->freepage;
+ pgoff_t offset = old->index;
+ XA_STATE(xas, &mapping->i_pages, offset);
+ unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
- error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
- if (!error) {
- struct address_space *mapping = old->mapping;
- void (*freepage)(struct page *);
- unsigned long flags;
-
- pgoff_t offset = old->index;
- freepage = mapping->a_ops->freepage;
-
- get_page(new);
- new->mapping = mapping;
- new->index = offset;
+ get_page(new);
+ new->mapping = mapping;
+ new->index = offset;
- xa_lock_irqsave(&mapping->i_pages, flags);
- __delete_from_page_cache(old, NULL);
- error = page_cache_tree_insert(mapping, new, NULL);
- BUG_ON(error);
+ xas_lock_irqsave(&xas, flags);
+ xas_store(&xas, new);
- /*
- * hugetlb pages do not participate in page cache accounting.
- */
- if (!PageHuge(new))
- __inc_node_page_state(new, NR_FILE_PAGES);
- if (PageSwapBacked(new))
- __inc_node_page_state(new, NR_SHMEM);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
- mem_cgroup_migrate(old, new);
- radix_tree_preload_end();
- if (freepage)
- freepage(old);
- put_page(old);
- }
+ old->mapping = NULL;
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(old))
+ __dec_node_page_state(new, NR_FILE_PAGES);
+ if (!PageHuge(new))
+ __inc_node_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(old))
+ __dec_node_page_state(new, NR_SHMEM);
+ if (PageSwapBacked(new))
+ __inc_node_page_state(new, NR_SHMEM);
+ xas_unlock_irqrestore(&xas, flags);
+ mem_cgroup_migrate(old, new);
+ if (freepage)
+ freepage(old);
+ put_page(old);
- return error;
+ return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
+ XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
struct mem_cgroup *memcg;
int error;
+ void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ mapping_set_update(&xas, mapping);
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
return error;
}
- error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
- if (error) {
- if (!huge)
- mem_cgroup_cancel_charge(page, memcg, false);
- return error;
- }
-
get_page(page);
page->mapping = mapping;
page->index = offset;
- xa_lock_irq(&mapping->i_pages);
- error = page_cache_tree_insert(mapping, page, shadowp);
- radix_tree_preload_end();
- if (unlikely(error))
- goto err_insert;
+ do {
+ xas_lock_irq(&xas);
+ old = xas_load(&xas);
+ if (old && !xa_is_value(old))
+ xas_set_err(&xas, -EEXIST);
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
+
+ if (xa_is_value(old)) {
+ mapping->nrexceptional--;
+ if (shadowp)
+ *shadowp = old;
+ }
+ mapping->nrpages++;
+
+ /* hugetlb pages do not participate in page cache accounting */
+ if (!huge)
+ __inc_node_page_state(page, NR_FILE_PAGES);
+ unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+
+ if (xas_error(&xas))
+ goto error;
- /* hugetlb pages do not participate in page cache accounting. */
- if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
- xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
- err_insert:
+ error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return error;
+ return xas_error(&xas);
}
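The do { } while (xas_nomem()) loop above is what replaces the old radix_tree_maybe_preload()/radix_tree_preload_end() pair: the store is attempted under the lock, and if the XArray needed a node, xas_nomem() allocates one with the caller's gfp mask after the lock has been dropped and the loop retries. A hedged, generic sketch of that idiom; my_store() and its xarray are illustrative only.

static int my_store(struct xarray *xa, unsigned long index, void *item,
		    gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		if (xas_load(&xas))
			xas_set_err(&xas, -EEXIST);	/* slot already in use */
		else
			xas_store(&xas, item);		/* may set -ENOMEM */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));	/* allocate and retry on -ENOMEM */

	return xas_error(&xas);
}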
/**
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- if (!(gfp_mask & __GFP_WRITE) &&
- shadow && workingset_refault(shadow)) {
- SetPageActive(page);
- workingset_activation(page);
- } else
- ClearPageActive(page);
+ WARN_ON_ONCE(PageActive(page));
+ if (!(gfp_mask & __GFP_WRITE) && shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
}
return ret;
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ unsigned long pflags;
int ret = 0;
+ if (bit_nr == PG_locked &&
+ !PageUptodate(page) && PageWorkingset(page)) {
+ if (!PageSwapBacked(page))
+ delayacct_thrashing_start();
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
init_wait(wait);
wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
finish_wait(q, wait);
+ if (thrashing) {
+ if (!PageSwapBacked(page))
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+
/*
* A signal could leave PageWaiters set. Clearing it here if
* !waitqueue_active would be possible (by open-coding finish_wait),
}
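The thrashing bracket added above feeds both delay accounting and the new pressure stall information (PSI) instrumentation. As a hedged illustration, any block that can stall on memory can be wrapped the same way; wait_with_memstall() and do_slow_thing() are made-up names.

static void wait_with_memstall(void (*do_slow_thing)(void))
{
	unsigned long pflags;

	psi_memstall_enter(&pflags);	/* time from here counts as memory stall */
	do_slow_thing();
	psi_memstall_leave(&pflags);
}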
/**
- * page_cache_next_hole - find the next hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
- * lowest indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >=
- * max_scan' will be true). In rare cases of index wrap-around, 0 will
- * be returned.
- *
- * page_cache_next_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 5, then subsequently a hole is created at
- * index 10, page_cache_next_hole covering both indexes may return 10
- * if called under rcu_read_lock.
+ * page_cache_next_miss() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
+ * gap with the lowest index.
+ *
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 5, then subsequently a gap is
+ * created at index 10, page_cache_next_miss covering both indices may
+ * return 10 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'return - index >= max_scan' will be true).
+ * In the rare case of index wrap-around, 0 will be returned.
*/
- pgoff_t page_cache_next_hole(struct address_space *mapping,
+ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
- unsigned long i;
+ XA_STATE(xas, &mapping->i_pages, index);
- for (i = 0; i < max_scan; i++) {
- struct page *page;
-
- page = radix_tree_lookup(&mapping->i_pages, index);
- if (!page || radix_tree_exceptional_entry(page))
+ while (max_scan--) {
+ void *entry = xas_next(&xas);
+ if (!entry || xa_is_value(entry))
break;
- index++;
- if (index == 0)
+ if (xas.xa_index == 0)
break;
}
- return index;
+ return xas.xa_index;
}
- EXPORT_SYMBOL(page_cache_next_hole);
+ EXPORT_SYMBOL(page_cache_next_miss);
/**
- * page_cache_prev_hole - find the prev hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search backwards in the range [max(index-max_scan+1, 0), index] for
- * the first hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'index - return >=
- * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
- * will be returned.
- *
- * page_cache_prev_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 10, then subsequently a hole is created at
- * index 5, page_cache_prev_hole covering both indexes may return 5 if
- * called under rcu_read_lock.
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [max(index - max_scan + 1, 0), index] for the
+ * gap with the highest index.
+ *
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 10, then subsequently a gap is
+ * created at index 5, page_cache_prev_miss() covering both indices may
+ * return 5 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'index - return >= max_scan' will be true).
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
*/
- pgoff_t page_cache_prev_hole(struct address_space *mapping,
+ pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
- unsigned long i;
+ XA_STATE(xas, &mapping->i_pages, index);
- for (i = 0; i < max_scan; i++) {
- struct page *page;
-
- page = radix_tree_lookup(&mapping->i_pages, index);
- if (!page || radix_tree_exceptional_entry(page))
+ while (max_scan--) {
+ void *entry = xas_prev(&xas);
+ if (!entry || xa_is_value(entry))
break;
- index--;
- if (index == ULONG_MAX)
+ if (xas.xa_index == ULONG_MAX)
break;
}
- return index;
+ return xas.xa_index;
}
- EXPORT_SYMBOL(page_cache_prev_hole);
+ EXPORT_SYMBOL(page_cache_prev_miss);
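As a usage illustration only, the miss functions make it easy to measure how far a run of cached pages extends; cached_run_length() below is hypothetical and not part of the kernel.

/*
 * How many consecutive pages starting at @index are present in the
 * page cache?  Per the contract documented above, the result is at
 * least @max_scan when no gap is found within the searched range.
 */
static unsigned long cached_run_length(struct address_space *mapping,
				       pgoff_t index, unsigned long max_scan)
{
	return page_cache_next_miss(mapping, index, max_scan) - index;
}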
/**
* find_get_entry - find and get a page cache entry
*/
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
- void **pagep;
+ XA_STATE(xas, &mapping->i_pages, offset);
struct page *head, *page;
rcu_read_lock();
repeat:
- page = NULL;
- pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
- if (pagep) {
- page = radix_tree_deref_slot(pagep);
- if (unlikely(!page))
- goto out;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto repeat;
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
- */
- goto out;
- }
+ xas_reset(&xas);
+ page = xas_load(&xas);
+ if (xas_retry(&xas, page))
+ goto repeat;
+ /*
+ * A shadow entry of a recently evicted page, or a swap entry from
+ * shmem/tmpfs. Return it without attempting to raise page count.
+ */
+ if (!page || xa_is_value(page))
+ goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
+ goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
- /*
- * Has the page moved?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != *pagep)) {
- put_page(head);
- goto repeat;
- }
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(head);
+ goto repeat;
}
out:
rcu_read_unlock();
repeat:
page = find_get_entry(mapping, offset);
- if (page && !radix_tree_exception(page)) {
+ if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
if (unlikely(page_mapping(page) != mapping)) {
repeat:
page = find_get_entry(mapping, offset);
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
page = NULL;
if (!page)
goto no_page;
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices)
{
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
unsigned int ret = 0;
- struct radix_tree_iter iter;
if (!nr_entries)
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- struct page *head, *page;
- repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (xa_is_value(page))
goto export;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
export:
- indices[ret] = iter.index;
+ indices[ret] = xas.xa_index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ continue;
+ put_page:
+ put_page(head);
+ retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
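Each of the gang-lookup conversions that follow repeats the same lockless pagecache protocol: take a speculative reference on the head page, re-check that the compound page was not split, then confirm with xas_reload() that the slot still points at the same page, resetting the walk otherwise. A hedged sketch of that skeleton; walk_pages() and process() are illustrative names.

static void walk_pages(struct address_space *mapping, pgoff_t start,
		       pgoff_t end, void (*process)(struct page *))
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end) {
		struct page *head;

		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			continue;		/* shadow/swap/DAX entry */

		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto reset;
		if (compound_head(page) != head)
			goto put;		/* split under us */
		if (unlikely(page != xas_reload(&xas)))
			goto put;		/* moved under us */

		process(page);
		put_page(head);
		continue;
put:
		put_page(head);
reset:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}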
pgoff_t end, unsigned int nr_pages,
struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
- struct page *head, *page;
-
- if (iter.index > end)
- break;
- repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, end) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Skip
- * over it.
- */
+ /* Skip over shadow, swap and DAX entries */
+ if (xa_is_value(page))
continue;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
pages[ret] = page;
if (++ret == nr_pages) {
- *start = pages[ret - 1]->index + 1;
+ *start = page->index + 1;
goto out;
}
+ continue;
+ put_page:
+ put_page(head);
+ retry:
+ xas_reset(&xas);
}
/*
* We come here when there is no page beyond @end. We take care to not
* overflow the index @start as it confuses some of the callers. This
- * breaks the iteration when there is page at index -1 but that is
+ * breaks the iteration when there is a page at index -1 but that is
* already broken anyway.
*/
if (end == (pgoff_t)-1)
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *page;
unsigned int ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
- struct page *head, *page;
- repeat:
- page = radix_tree_deref_slot(slot);
- /* The hole, there no reason to continue */
- if (unlikely(!page))
- break;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Stop
- * looking for contiguous pages.
- */
+ for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+ struct page *head;
+ if (xas_retry(&xas, page))
+ continue;
+ /*
+ * If the entry has been swapped out, we can stop looking.
+ * No current caller is looking for DAX entries.
+ */
+ if (xa_is_value(page))
break;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
/*
* must check mapping and index after taking the ref.
* otherwise we can get both false positives and false
* negatives, which is just confusing to the caller.
*/
- if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
+ if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
put_page(page);
break;
}
pages[ret] = page;
if (++ret == nr_pages)
break;
+ continue;
+ put_page:
+ put_page(head);
+ retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
* @tag. We update @index to index the next page for the traversal.
*/
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, int tag, unsigned int nr_pages,
+ pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, *index);
+ struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
- struct page *head, *page;
-
- if (iter.index > end)
- break;
- repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each_marked(&xas, page, end, tag) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page.
- *
- * Those entries should never be tagged, but
- * this tree walk is lockless and the tags are
- * looked up in bulk, one radix tree node at a
- * time, so there is a sizable window for page
- * reclaim to evict a page we saw tagged.
- *
- * Skip over it.
- */
+ /*
+ * Shadow entries should never be tagged, but this iteration
+ * is lockless so there is a window for page reclaim to evict
+ * a page we saw tagged. Skip over it.
+ */
+ if (xa_is_value(page))
continue;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
pages[ret] = page;
if (++ret == nr_pages) {
- *index = pages[ret - 1]->index + 1;
+ *index = page->index + 1;
goto out;
}
+ continue;
+ put_page:
+ put_page(head);
+ retry:
+ xas_reset(&xas);
}
/*
- * We come here when we got at @end. We take care to not overflow the
+ * We come here when we got to @end. We take care to not overflow the
* index @index as it confuses some of the callers. This breaks the
- * iteration when there is page at index -1 but that is already broken
- * anyway.
+ * iteration when there is a page at index -1 but that is already
+ * broken anyway.
*/
if (end == (pgoff_t)-1)
*index = (pgoff_t)-1;
* @tag.
*/
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- int tag, unsigned int nr_entries,
+ xa_mark_t tag, unsigned int nr_entries,
struct page **entries, pgoff_t *indices)
{
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
unsigned int ret = 0;
- struct radix_tree_iter iter;
if (!nr_entries)
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
- struct page *head, *page;
- repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (xa_is_value(page))
goto export;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
export:
- indices[ret] = iter.index;
+ indices[ret] = xas.xa_index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ continue;
+ put_page:
+ put_page(head);
+ retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
* system is low on memory, or a problem occurs while trying
* to schedule I/O.
*/
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
+ return vmf_error(error);
page_not_uptodate:
/*
void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
- struct radix_tree_iter iter;
- void **slot;
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
+ XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *head, *page;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
- if (iter.index > end_pgoff)
- break;
- repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- goto next;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
+ xas_for_each(&xas, page, end_pgoff) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
goto next;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto next;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto skip;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto skip;
if (!PageUptodate(page) ||
PageReadahead(page) ||
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
if (vmf->pte)
- vmf->pte += iter.index - last_pgoff;
- last_pgoff = iter.index;
+ vmf->pte += xas.xa_index - last_pgoff;
+ last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
break;
- if (iter.index == end_pgoff)
- break;
}
rcu_read_unlock();
}
return generic_file_mmap(file, vma);
}
#else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
- return -ENOSYS;
+ return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
put_page(page);
if (err == -EEXIST)
goto repeat;
- /* Presumably ENOMEM for radix tree node */
+ /* Presumably ENOMEM for xarray node */
return ERR_PTR(err);
}
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
- pos + iov_iter_count(from)))
+ pos + write_len))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pud_lockptr(mm, pud));
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
* We are not sure a pending tlb flush here is for a huge page
* mapping or not. Hence use the tlb range variant
*/
- if (mm_tlb_flush_pending(vma->vm_mm))
+ if (mm_tlb_flush_pending(vma->vm_mm)) {
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * change_huge_pmd() released the pmd lock before
+ * invalidating the secondary MMUs sharing the primary
+ * MMU pagetables (with ->invalidate_range()). The
+ * mmu_notifier_invalidate_range_end() (which
+ * internally calls ->invalidate_range()) in
+ * change_pmd_range() will run after us, so we can't
+ * rely on it here and we need an explicit invalidate.
+ */
+ mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ }
/*
* Migrate the THP to the requested node, returns with page unlocked
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
- if (pmd_present(pmd) && pmd_dirty(pmd))
+ if (pmd_present(pmd))
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
}
pmd = move_soft_dirty_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
- if (new_ptl != old_ptl)
- spin_unlock(new_ptl);
if (force_flush)
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
- else
- *need_flush = true;
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
spin_unlock(old_ptl);
return true;
}
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
+ (1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
(1L << PG_dirty)));
ClearPageCompound(head);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
- /* Additional pin to radix tree of swap cache */
+ /* Additional pin to swap cache */
if (PageSwapCache(head))
page_ref_add(head, 2);
else
page_ref_inc(head);
} else {
- /* Additional pin to radix tree */
+ /* Additional pin to page cache */
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
{
int extra_pins;
- /* Additional pins from radix tree */
+ /* Additional pins from page cache */
if (PageAnon(page))
extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
else
spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
if (mapping) {
- void **pslot;
+ XA_STATE(xas, &mapping->i_pages, page_index(head));
- xa_lock(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(head));
/*
- * Check if the head page is present in radix tree.
+ * Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- if (radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != head)
+ xa_lock(&mapping->i_pages);
+ if (xas_load(&xas) != head)
goto fail;
}
if (!(pvmw->pmd && !pvmw->pte))
return;
- mmu_notifier_invalidate_range_start(mm, address,
- address + HPAGE_PMD_SIZE);
-
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = *pvmw->pmd;
pmdp_invalidate(vma, address, pvmw->pmd);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
page_remove_rmap(page, true);
put_page(page);
-
- mmu_notifier_invalidate_range_end(mm, address,
- address + HPAGE_PMD_SIZE);
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
else
page_add_file_rmap(new, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
- if (vma->vm_flags & VM_LOCKED)
+ if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
mlock_vma_page(new);
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if (new_flags & VM_SPECIAL) {
+ if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
page = find_get_entry(mapping, index);
- if (!radix_tree_exceptional_entry(page)) {
+ if (!xa_is_value(page)) {
if (page)
put_page(page);
continue;
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
+ memcg_memory_event(memcg, MEMCG_OOM);
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
if (fatal_signal_pending(current))
goto force;
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
-
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;
queue_work(memcg_kmem_cache_wq, &cw->work);
}
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- /*
- * We need to stop accounting when we kmalloc, because if the
- * corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_kmem_cache_create will recurse.
- *
- * However, it is better to enclose the whole function. Depending on
- * the debugging options enabled, INIT_WORK(), for instance, can
- * trigger an allocation. This too, will make us recurse. Because at
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
- * the safest choice is to do it like this, wrapping the whole function.
- */
- current->memcg_kmem_skip_account = 1;
- __memcg_schedule_kmem_cache_create(memcg, cachep);
- current->memcg_kmem_skip_account = 0;
-}
-
static inline bool memcg_kmem_bypass(void)
{
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
if (memcg_kmem_bypass())
return cachep;
- if (current->memcg_kmem_skip_account)
- return cachep;
-
memcg = get_mem_cgroup_from_current();
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
- atomic_add(n, &memcg->id.ref);
+ refcount_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
}
/* Online state pins memcg ID, memcg ID pins CSS */
- atomic_set(&memcg->id.ref, 1);
+ refcount_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
/* shmem/tmpfs may report page out on swap: account for that too. */
if (shmem_mapping(mapping)) {
page = find_get_entry(mapping, pgoff);
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
if (do_memsw_account())
*entry = swp;
seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+ seq_printf(m, "workingset_refault %lu\n",
+ acc.stat[WORKINGSET_REFAULT]);
+ seq_printf(m, "workingset_activate %lu\n",
+ acc.stat[WORKINGSET_ACTIVATE]);
+ seq_printf(m, "workingset_nodereclaim %lu\n",
+ acc.stat[WORKINGSET_NODERECLAIM]);
+
seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
acc.events[PGSCAN_DIRECT]);
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
- seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
- seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
- seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
return 0;
}
#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so its refcount must
* always be >= 1.
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
+ if (PageTransHuge(page) && PageMlocked(page))
+ clear_page_mlock(page);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
}
page = migration_entry_to_page(entry);
/*
- * Once radix-tree replacement of page migration started, page_count
+ * Once page cache replacement of page migration started, page_count
* *must* be zero. And, we don't want to call wait_on_page_locked()
* against a page without get_page().
* So, we use get_page_unless_zero(), here. Even failed, page fault
struct buffer_head *head, enum migrate_mode mode,
int extra_count)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
int expected_count = 1 + extra_count;
- void **pslot;
/*
* Device public or private pages have an extra refcount as they are
oldzone = page_zone(page);
newzone = page_zone(newpage);
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(page));
+ xas_lock_irq(&xas);
expected_count += hpage_nr_pages(page) + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
page_ref_unfreeze(page, expected_count);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
SetPageDirty(newpage);
}
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
if (PageTransHuge(page)) {
int i;
- int index = page_index(page);
for (i = 1; i < HPAGE_PMD_NR; i++) {
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- index + i);
- radix_tree_replace_slot(&mapping->i_pages, pslot,
- newpage + i);
+ xas_next(&xas);
+ xas_store(&xas, newpage + i);
}
}
*/
page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
/*
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
int expected_count;
- void **pslot;
-
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
+ xas_lock_irq(&xas);
expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
get_page(newpage);
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
page_ref_unfreeze(page, expected_count - 1);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return MIGRATEPAGE_SUCCESS;
}
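migrate_page_move_mapping() above also shows the multi-slot replacement idiom used for THP: with the cursor positioned on the head index, store the new head, then xas_next()/xas_store() for each tail slot while the lock is held. A hedged generic sketch; replace_run() is illustrative and assumes every slot is already populated, so no allocation can occur and the stores cannot fail.

static void replace_run(struct xarray *xa, unsigned long index,
			void **new, unsigned int nr)
{
	XA_STATE(xas, xa, index);
	unsigned int i;

	xas_lock_irq(&xas);
	xas_store(&xas, new[0]);
	for (i = 1; i < nr; i++) {
		xas_next(&xas);			/* move to the next slot */
		xas_store(&xas, new[i]);	/* overwrite, never allocates */
	}
	xas_unlock_irq(&xas);
}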
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
+ if (PageWorkingset(page))
+ SetPageWorkingset(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
* we encounter them after the rest of the list
* is processed.
*/
- if (PageTransHuge(page)) {
+ if (PageTransHuge(page) && !PageHuge(page)) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);
return newpage;
}
-/*
- * page migration rate limiting control.
- * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
- * window of time. Default here says do not migrate more than 1280M per second.
- */
-static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-
-/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
- unsigned long nr_pages)
-{
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
- spin_lock(&pgdat->numabalancing_migrate_lock);
- pgdat->numabalancing_migrate_nr_pages = 0;
- pgdat->numabalancing_migrate_next_window = jiffies +
- msecs_to_jiffies(migrate_interval_millisecs);
- spin_unlock(&pgdat->numabalancing_migrate_lock);
- }
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
- trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
- nr_pages);
- return true;
- }
-
- /*
- * This is an unlocked non-atomic update so errors are possible.
- * The consequences are failing to migrate when we potentiall should
- * have which is not severe enough to warrant locking. If it is ever
- * a problem, it can be converted to a per-cpu counter.
- */
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- return false;
-}
-
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
if (page_is_file_cache(page) && PageDirty(page))
goto out;
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (numamigrate_update_ratelimit(pgdat, 1))
- goto out;
-
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated)
goto out;
int isolated = 0;
struct page *new_page = NULL;
int page_lru = page_is_file_cache(page);
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
-
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
- goto out_dropref;
+ unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
new_page->index = page->index;
+ /* flush the cache before copying using the kernel virtual address */
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
migrate_page_copy(new_page, page);
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
/*
- * Clear the old entry under pagetable lock and establish the new PTE.
- * Any parallel GUP will either observe the old page blocking on the
- * page lock, block on the page table lock or observe the new page.
- * The SetPageUptodate on the new page and page_add_new_anon_rmap
- * guarantee the copy is visible before the pagetable update.
+ * Overwrite the old entry under pagetable lock and establish
+ * the new PTE. Any parallel GUP will either observe the old
+ * page blocking on the page lock, block on the page table
+ * lock or observe the new page. The SetPageUptodate on the
+ * new page and page_add_new_anon_rmap guarantee the copy is
+ * visible before the pagetable update.
+ */
+ page_add_anon_rmap(new_page, vma, start, true);
+ /*
+ * At this point the pmd is numa/protnone (i.e. non present) and the TLB
+ * has already been flushed globally. So no TLB can be currently
+ * caching this non present pmd mapping. There's no need to clear the
+ * pmd before doing set_pmd_at(), nor to flush the TLB after
+ * set_pmd_at(). Clearing the pmd here would introduce a race
+ * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
+ * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
+ * pmd.
*/
- flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start, true);
- pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
page_ref_unfreeze(page, 2);
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
/* Take an "isolate" reference and put new page on the LRU. */
get_page(new_page);
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
spin_unlock(ptl);
* dirty pages in the file (thus it is important for this function to be quick
* so that it can tag pages faster than a dirtying process can create them).
*/
- /*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
- * latency.
- */
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- #define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged = 0;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int tagged = 0;
+ void *page;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
- PAGECACHE_TAG_DIRTY) {
- if (iter.index > end)
- break;
- radix_tree_iter_tag_set(&mapping->i_pages, &iter,
- PAGECACHE_TAG_TOWRITE);
- tagged++;
- if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
continue;
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
cond_resched();
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
}
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
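tag_pages_for_writeback() above demonstrates the xas_pause() batching idiom that replaces the old WRITEBACK_TAG_BATCH/radix_tree_iter_resume() dance: walk the marked entries under the lock and, every XA_CHECK_SCHED entries, pause the iterator so the lock can be dropped and the CPU rescheduled. A hedged sketch against generic marks; XA_MARK_0/XA_MARK_1 stand in for the pagecache tags and mark_all_towrite() is an illustrative name.

static unsigned int mark_all_towrite(struct xarray *xa, unsigned long start,
				     unsigned long end)
{
	XA_STATE(xas, xa, start);
	unsigned int tagged = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end, XA_MARK_0) {
		xas_set_mark(&xas, XA_MARK_1);
		if (++tagged % XA_CHECK_SCHED)
			continue;
		xas_pause(&xas);	/* make the state safe to drop the lock */
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);

	return tagged;
}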
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * the index back to the start of the file for the next time we
+ * are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
- * its radix tree.
+ * the xarray.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
+ __xa_set_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
* Returns true if the page was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * at which stage we bring the page's dirty flag and xarray dirty tag
* back into sync.
*
- * This incoherency between the page's dirty flag and radix-tree tag is
+ * This incoherency between the page's dirty flag and xarray tag is
* unfortunate, but it only exists while the page is locked.
*/
int clear_page_dirty_for_io(struct page *page)
xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_WRITEBACK);
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_TOWRITE);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
} else {
ret = TestSetPageWriteback(page);
}
}
EXPORT_SYMBOL(__test_set_page_writeback);
- /*
- * Return true if any of the pages in the mapping are marked with the
- * passed tag.
- */
- int mapping_tagged(struct address_space *mapping, int tag)
- {
- return radix_tree_tagged(&mapping->i_pages, tag);
- }
- EXPORT_SYMBOL(mapping_tagged);
-
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
-#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
+ if (!xa_is_value(page))
pvec->pages[j++] = page;
}
pvec->nr = j;
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag)
+ xa_mark_t tag)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
PAGEVEC_SIZE, pvec->pages);
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag, unsigned max_pages)
+ xa_mark_t tag, unsigned max_pages)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
}
/*
- * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
- int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
{
- int error, i, nr = hpage_nr_pages(page);
- struct address_space *address_space;
+ struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
+ unsigned long i, nr = 1UL << compound_order(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
page_ref_add(page, nr);
SetPageSwapCache(page);
- address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- for (i = 0; i < nr; i++) {
- set_page_private(page + i, entry.val + i);
- error = radix_tree_insert(&address_space->i_pages,
- idx + i, page + i);
- if (unlikely(error))
- break;
- }
- if (likely(!error)) {
+ do {
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ for (i = 0; i < nr; i++) {
+ VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ set_page_private(page + i, entry.val + i);
+ xas_store(&xas, page + i);
+ xas_next(&xas);
+ }
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
- } else {
- /*
- * Only the context which have set SWAP_HAS_CACHE flag
- * would call add_to_swap_cache().
- * So add_to_swap_cache() doesn't returns -EEXIST.
- */
- VM_BUG_ON(error == -EEXIST);
- set_page_private(page + i, 0UL);
- while (i--) {
- radix_tree_delete(&address_space->i_pages, idx + i);
- set_page_private(page + i, 0UL);
- }
- ClearPageSwapCache(page);
- page_ref_sub(page, nr);
- }
- xa_unlock_irq(&address_space->i_pages);
+ unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
- return error;
- }
-
-
- int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
- {
- int error;
+ if (!xas_error(&xas))
+ return 0;
- error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
- if (!error) {
- error = __add_to_swap_cache(page, entry);
- radix_tree_preload_end();
- }
- return error;
+ ClearPageSwapCache(page);
+ page_ref_sub(page, nr);
+ return xas_error(&xas);
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
- void __delete_from_swap_cache(struct page *page)
+ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
{
- struct address_space *address_space;
+ struct address_space *address_space = swap_address_space(entry);
int i, nr = hpage_nr_pages(page);
- swp_entry_t entry;
- pgoff_t idx;
+ pgoff_t idx = swp_offset(entry);
+ XA_STATE(xas, &address_space->i_pages, idx);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(PageWriteback(page), page);
- entry.val = page_private(page);
- address_space = swap_address_space(entry);
- idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
- radix_tree_delete(&address_space->i_pages, idx + i);
+ void *entry = xas_store(&xas, NULL);
+ VM_BUG_ON_PAGE(entry != page + i, entry);
set_page_private(page + i, 0);
+ xas_next(&xas);
}
ClearPageSwapCache(page);
address_space->nrpages -= nr;
return 0;
/*
- * Radix-tree node allocations from PF_MEMALLOC contexts could
+ * XArray node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
* stops emergency reserves from being allocated.
*
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
- /* -ENOMEM radix-tree allocation failure */
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
*/
void delete_from_swap_cache(struct page *page)
{
- swp_entry_t entry;
- struct address_space *address_space;
+ swp_entry_t entry = { .val = page_private(page) };
+ struct address_space *address_space = swap_address_space(entry);
- entry.val = page_private(page);
-
- address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, entry);
xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
break; /* Out of memory */
}
- /*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
- if (err)
- break;
-
/*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
if (err == -EEXIST) {
- radix_tree_preload_end();
/*
* We might race against get_swap_page() and stumble
* across a SWAP_HAS_CACHE swap_map entry whose page
*/
cond_resched();
continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
+ } else if (err) /* swp entry is obsolete ? */
break;
- }
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
+ err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
- radix_tree_preload_end();
- /*
- * Initiate read into locked page and return.
- */
+ /* Initiate read into locked page */
+ SetPageWorkingset(new_page);
lru_cache_add_anon(new_page);
*new_page_allocated = true;
return new_page;
}
- radix_tree_preload_end();
__ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
return -ENOMEM;
for (i = 0; i < nr; i++) {
space = spaces + i;
- INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
+ xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
atomic_set(&space->i_mmap_writable, 0);
space->a_ops = &swap_aops;
/* swap cache doesn't use writeback related tags */
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
+#include <linux/psi.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
/*
* Make sure we apply some minimal pressure on default priority
struct mem_cgroup *memcg, int priority)
{
struct memcg_shrinker_map *map;
- unsigned long freed = 0;
- int ret, i;
+ unsigned long ret, freed = 0;
+ int i;
if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
return 0;
struct mem_cgroup *memcg,
int priority)
{
+ unsigned long ret, freed = 0;
struct shrinker *shrinker;
- unsigned long freed = 0;
- int ret;
if (!mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
{
/*
* A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache radix tree and
- * optional buffer heads at page->private.
+ * that isolated the page, the page cache and optional buffer
+ * heads at page->private.
*/
- int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+ int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
HPAGE_PMD_NR : 1;
- return page_count(page) - page_has_private(page) == 1 + radix_pins;
+ return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, swap);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
}
ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page
+ * because of a round-off error.
*/
- scan = div64_u64(scan * fraction[file],
- denominator);
+ scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
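The switch from div64_u64() to DIV64_U64_ROUND_UP() only matters when the proportional split does not divide evenly; without rounding up, the truncated quotient can leave the final page of an LRU unscanned. A tiny stand-alone illustration with made-up numbers:

#include <stdio.h>

int main(void)
{
	/*
	 * Illustrative values: 7 pages left to scan, this LRU's share is
	 * 1/3.  Truncation scans 2 pages and drops the remainder forever;
	 * rounding up scans 3 and cannot miss the last page.
	 */
	unsigned long long scan = 7, fraction = 1, denominator = 3;

	printf("truncated: %llu\n", scan * fraction / denominator);
	printf("round up:  %llu\n",
	       (scan * fraction + denominator - 1) / denominator);
	return 0;
}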
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long pflags;
int nid;
unsigned int noreclaim_flag;
struct scan_control sc = {
sc.gfp_mask,
sc.reclaim_idx);
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
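The PSI annotations in this and the following kswapd hunk use one simple bracket: declare a pflags cookie on the stack, call psi_memstall_enter() before work that may stall on memory, and psi_memstall_leave() when it is done. A minimal sketch of the pattern; demo_reclaim() is a placeholder, not a kernel function.

#include <linux/psi.h>

static void demo_reclaim(void)
{
	unsigned long pflags;

	psi_memstall_enter(&pflags);	/* task is now accounted as stalled */

	/* ... reclaim work that may block on memory goes here ... */

	psi_memstall_leave(&pflags);	/* end of the memory-stall interval */
}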
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long pflags;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_swap = 1,
};
+ psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN);
out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
+ psi_memstall_leave(&pflags);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
* the only thing eating into inactive list space is active pages.
*
*
- * Activating refaulting pages
+ * Refaulting inactive pages
*
* All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
* But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache.
*
+ * Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
*
* Implementation
*
* and activations is maintained (node->inactive_age).
*
* On eviction, a snapshot of this counter (along with some bits to
- * identify the node) is stored in the now empty page cache radix tree
+ * identify the node) is stored in the now empty page cache
* slot of the evicted page. This is called a shadow entry.
*
* On cache misses for which there are shadow entries, an eligible
* refault distance will immediately activate the refaulting page.
*/
- #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
- NODES_SHIFT + \
- MEM_CGROUP_ID_SHIFT)
+ #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
+ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
* Eviction timestamps need to be able to cover the full range of
- * actionable refaults. However, bits are tight in the radix tree
+ * actionable refaults. However, bits are tight in the xarray
* entry, and after storing the identifier for the lruvec there might
* not be enough left to represent every single actionable refault. In
* that case, we have to sacrifice granularity for distance, and group
*/
static unsigned int bucket_order __read_mostly;
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
{
eviction >>= bucket_order;
+ eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
+ eviction = (eviction << 1) | workingset;
- return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ return xa_mk_value(eviction);
}
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, bool *workingsetp)
{
- unsigned long entry = (unsigned long)shadow;
+ unsigned long entry = xa_to_value(shadow);
int memcgid, nid;
+ bool workingset;
- entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+ workingset = entry & 1;
+ entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+ *workingsetp = workingset;
}
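To see what the reworked pack_shadow()/unpack_shadow() pair encodes, the following stand-alone round-trip model may help. It is userspace C with illustrative shift widths standing in for MEM_CGROUP_ID_SHIFT and NODES_SHIFT, bucket_order taken as zero, and the xa_mk_value()/xa_to_value() wrapping omitted; it is a sketch of the bit layout, not the kernel functions themselves.

#include <assert.h>
#include <stdio.h>

/* Illustrative widths; the kernel derives the real ones from Kconfig. */
#define DEMO_MEMCG_SHIFT 16
#define DEMO_NODES_SHIFT 6

static unsigned long pack(unsigned long eviction, int memcgid, int nid,
			  int workingset)
{
	eviction = (eviction << DEMO_MEMCG_SHIFT) | memcgid;
	eviction = (eviction << DEMO_NODES_SHIFT) | nid;
	eviction = (eviction << 1) | workingset;	/* new low bit */
	return eviction;	/* the kernel wraps this in xa_mk_value() */
}

static void unpack(unsigned long entry, unsigned long *eviction,
		   int *memcgid, int *nid, int *workingset)
{
	*workingset = entry & 1;
	entry >>= 1;
	*nid = entry & ((1UL << DEMO_NODES_SHIFT) - 1);
	entry >>= DEMO_NODES_SHIFT;
	*memcgid = entry & ((1UL << DEMO_MEMCG_SHIFT) - 1);
	*eviction = entry >> DEMO_MEMCG_SHIFT;
}

int main(void)	/* assumes a 64-bit unsigned long for the chosen widths */
{
	unsigned long eviction;
	int memcgid, nid, ws;

	unpack(pack(123456, 42, 3, 1), &eviction, &memcgid, &nid, &ws);
	assert(eviction == 123456 && memcgid == 42 && nid == 3 && ws == 1);
	printf("eviction=%lu memcg=%d nid=%d workingset=%d\n",
	       eviction, memcgid, nid, ws);
	return 0;
}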
/**
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, pgdat, eviction);
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
/**
* workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
*/
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
{
unsigned long refault_distance;
+ struct pglist_data *pgdat;
unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
- struct pglist_data *pgdat;
+ bool workingset;
int memcgid;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
* configurations instead.
*/
memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg) {
- rcu_read_unlock();
- return false;
- }
+ if (!mem_cgroup_disabled() && !memcg)
+ goto out;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
/*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
+ * Calculate the refault distance
*
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * inactive_age to lap a shadow entry in the field, which can
+ * then result in a false small refault distance, leading to a
+ * false activation should this old entry actually refault
+ * again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
- if (refault_distance <= active_file) {
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
- rcu_read_unlock();
- return true;
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */
+ if (refault_distance > active_file)
+ goto out;
+
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
}
+out:
rcu_read_unlock();
- return false;
}
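Pulled out of kernel context, the decision workingset_refault() now makes can be modelled in a few lines: compute the refault distance from the eviction-time snapshot, activate if it fits within the active file list, and restore PG_workingset when the shadow entry says the page was active before eviction. A stand-alone sketch with made-up numbers; the kernel additionally masks the distance with EVICTION_MASK and bumps the lruvec counters, which is not modelled here.

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-alone model of the activation test in workingset_refault().
 * The counter values and list size are invented; the kernel reads them
 * from the lruvec.
 */
static bool should_activate(unsigned long eviction_snapshot,
			    unsigned long inactive_age_now,
			    unsigned long active_file_pages)
{
	/* unsigned subtraction stays correct across counter wrap-around */
	unsigned long refault_distance = inactive_age_now - eviction_snapshot;

	return refault_distance <= active_file_pages;
}

int main(void)
{
	/* evicted at age 1000, refaults at 1500, 800 active file pages:
	 * distance 500 <= 800 -> activate (and restore PG_workingset if
	 * the shadow entry's low bit was set) */
	printf("%d\n", should_activate(1000, 1500, 800));

	/* refaults only at age 2500: distance 1500 > 800 -> the page
	 * could not have stayed resident anyway, leave it inactive */
	printf("%d\n", should_activate(1000, 2500, 800));
	return 0;
}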
/**
static struct list_lru shadow_nodes;
- void workingset_update_node(struct radix_tree_node *node)
+ void workingset_update_node(struct xa_node *node)
{
/*
* Track non-empty nodes that contain only shadow entries;
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- if (node->count && node->count == node->exceptional) {
- if (list_empty(&node->private_list))
+ VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+
+ if (node->count && node->count == node->nr_values) {
+ if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
+ __inc_lruvec_page_state(virt_to_page(node),
+ WORKINGSET_NODES);
+ }
} else {
- if (!list_empty(&node->private_list))
+ if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
+ __dec_lruvec_page_state(virt_to_page(node),
+ WORKINGSET_NODES);
+ }
}
}
{
unsigned long max_nodes;
unsigned long nodes;
- unsigned long cache;
+ unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
/*
- * Approximate a reasonable limit for the radix tree nodes
+ * Approximate a reasonable limit for the nodes
* containing shadow entries. We don't need to keep more
* shadow entries than possible pages on the active list,
* since refault distances bigger than that are dismissed.
* worst-case density of 1/8th. Below that, not all eligible
* refaults can be detected anymore.
*
- * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+ * On 64-bit with 7 xa_nodes per page and 64 slots
* each, this will reclaim shadow entries when they consume
* ~1.8% of available memory:
*
- * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
+ * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
*/
+#ifdef CONFIG_MEMCG
if (sc->memcg) {
- cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL_FILE);
- } else {
- cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
- node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
- }
- max_nodes = cache >> (XA_CHUNK_SHIFT - 3);
+ struct lruvec *lruvec;
+
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL);
+ lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
+ pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+ } else
+#endif
+ pages = node_present_pages(sc->nid);
+
- max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);
+ max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
if (!nodes)
return SHRINK_EMPTY;
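The "~1.8% of available memory" figure in the comment above follows directly from the shift: max_nodes is pages >> (XA_CHUNK_SHIFT - 3), i.e. pages/8 nodes, and at 7 nodes per page those occupy pages/56 of memory. A stand-alone check with the numbers from the comment (64 slots per node, so XA_CHUNK_SHIFT is 6):

#include <stdio.h>

int main(void)
{
	/* Numbers from the comment above: 64-bit, 64 slots per node. */
	unsigned long pages = 1UL << 20;		/* any size works     */
	unsigned long nodes_per_page = 7;		/* xa_nodes per page  */
	unsigned long max_nodes = pages >> (6 - 3);	/* XA_CHUNK_SHIFT - 3 */
	unsigned long node_pages = max_nodes / nodes_per_page;

	/* pages/8 nodes at 7 per page ~= pages/56 ~= 1.8% of memory */
	printf("shadow nodes may occupy %.2f%% of pages\n",
	       100.0 * node_pages / pages);
	return 0;
}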
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
- void *arg)
+ void *arg) __must_hold(lru_lock)
{
+ struct xa_node *node = container_of(item, struct xa_node, private_list);
+ XA_STATE(xas, node->array, 0);
struct address_space *mapping;
- struct radix_tree_node *node;
- unsigned int i;
int ret;
/*
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
- * address_space that has radix tree nodes on the LRU.
+ * address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
* to reclaim, take the node off-LRU, and drop the lru_lock.
*/
- node = container_of(item, struct radix_tree_node, private_list);
- mapping = container_of(node->root, struct address_space, i_pages);
+ mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
}
list_lru_isolate(lru, item);
+ __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
+
spin_unlock(lru_lock);
/*
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- if (WARN_ON_ONCE(!node->exceptional))
+ if (WARN_ON_ONCE(!node->nr_values))
goto out_invalid;
- if (WARN_ON_ONCE(node->count != node->exceptional))
- goto out_invalid;
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
- if (node->slots[i]) {
- if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
- goto out_invalid;
- if (WARN_ON_ONCE(!node->exceptional))
- goto out_invalid;
- if (WARN_ON_ONCE(!mapping->nrexceptional))
- goto out_invalid;
- node->slots[i] = NULL;
- node->exceptional--;
- node->count--;
- mapping->nrexceptional--;
- }
- }
- if (WARN_ON_ONCE(node->exceptional))
+ if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
- inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ mapping->nrexceptional -= node->nr_values;
+ xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
+ xas.xa_offset = node->offset;
+ xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
+ xas_set_update(&xas, workingset_update_node);
+ /*
+ * We could store a shadow entry here which was the minimum of the
+ * shadow entries we were tracking ...
+ */
+ xas_store(&xas, NULL);
- __radix_tree_delete_node(&mapping->i_pages, node,
- workingset_lookup_update(mapping));
+ __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
- .seeks = DEFAULT_SEEKS,
+ .seeks = 0, /* ->count reports only fully expendable nodes */
.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};