F: drivers/gpu/drm/amd/include/v9_structs.h
F: drivers/gpu/drm/amd/include/vi_structs.h
F: include/uapi/linux/kfd_ioctl.h
+F: include/uapi/linux/kfd_sysfs.h
AMD SPI DRIVER
T: git https://gitlab.freedesktop.org/agd5f/linux.git
F: drivers/gpu/drm/amd/pm/
+AMD PSTATE DRIVER
+S: Supported
+F: Documentation/admin-guide/pm/amd-pstate.rst
+F: drivers/cpufreq/amd-pstate*
+
AMD PTDMA DRIVER
F: Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml
F: drivers/iio/adc/ad7780.c
+ANALOG DEVICES INC AD74413R DRIVER
+S: Supported
+W: http://ez.analog.com/community/linux-device-drivers
+F: Documentation/devicetree/bindings/iio/addac/adi,ad74413r.yaml
+F: drivers/iio/addac/ad74413r.c
+F: include/dt-bindings/iio/addac/adi,ad74413r.h
+
ANALOG DEVICES INC AD9389B DRIVER
S: Maintained
+F: Documentation/devicetree/bindings/media/i2c/adv748x.yaml
F: drivers/media/i2c/adv748x/*
ANALOG DEVICES INC ADV7511 DRIVER
C: irc://irc.oftc.net/asahi-dev
T: git https://github.com/AsahiLinux/linux.git
F: Documentation/devicetree/bindings/arm/apple.yaml
+F: Documentation/devicetree/bindings/arm/apple/*
F: Documentation/devicetree/bindings/i2c/apple,i2c.yaml
F: Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml
F: Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml
F: Documentation/devicetree/bindings/pci/apple,pcie.yaml
F: Documentation/devicetree/bindings/pinctrl/apple,pinctrl.yaml
+F: Documentation/devicetree/bindings/power/apple*
+F: Documentation/devicetree/bindings/watchdog/apple,wdt.yaml
F: arch/arm64/boot/dts/apple/
F: drivers/i2c/busses/i2c-pasemi-core.c
F: drivers/i2c/busses/i2c-pasemi-platform.c
F: drivers/irqchip/irq-apple-aic.c
F: drivers/mailbox/apple-mailbox.c
F: drivers/pinctrl/pinctrl-apple-gpio.c
+F: drivers/soc/apple/*
F: include/dt-bindings/interrupt-controller/apple-aic.h
F: include/dt-bindings/pinctrl/apple.h
F: include/linux/apple-mailbox.h
F: drivers/hwtracing/coresight/*
F: include/dt-bindings/arm/coresight-cti-dt.h
F: include/linux/coresight*
+F: samples/coresight/*
F: tools/perf/arch/arm/util/auxtrace.c
F: tools/perf/arch/arm/util/cs-etm.c
F: tools/perf/arch/arm/util/cs-etm.h
F: arch/arm/boot/dts/mstar-*
F: arch/arm/mach-mstar/
F: drivers/clk/mstar/
+F: drivers/clocksource/timer-msc313e.c
F: drivers/gpio/gpio-msc313.c
F: drivers/rtc/rtc-msc313.c
F: drivers/watchdog/msc313e_wdt.c
F: Documentation/arm/samsung/
F: Documentation/devicetree/bindings/arm/samsung/
F: Documentation/devicetree/bindings/power/pd-samsung.yaml
+F: Documentation/devicetree/bindings/soc/samsung/
F: arch/arm/boot/dts/exynos*
F: arch/arm/boot/dts/s3c*
F: arch/arm/boot/dts/s5p*
N: s5pv210
ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT
-M: Andrzej Hajda <a.hajda@samsung.com>
+M: Łukasz Stelmach <l.stelmach@samsung.com>
S: Maintained
F: drivers/media/platform/s5p-jpeg/
ARM/SAMSUNG S5P SERIES Multi Format Codec (MFC) SUPPORT
S: Maintained
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/iwamatsu/linux-visconti.git
F: Documentation/devicetree/bindings/arm/toshiba.yaml
+F: Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml
+F: Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml
F: Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml
F: Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml
F: Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml
F: Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml
F: Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml
F: arch/arm64/boot/dts/toshiba/
+F: drivers/clk/visconti/
F: drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c
F: drivers/gpio/gpio-visconti.c
F: drivers/pci/controller/dwc/pcie-visconti.c
F: drivers/platform/x86/asus*.c
F: drivers/platform/x86/eeepc*.c
+ASUS TF103C DOCK DRIVER
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git
+F: drivers/platform/x86/asus-tf103c-dock.c
+
+ASUS WMI HARDWARE MONITOR DRIVER
+S: Maintained
+F: drivers/hwmon/asus_wmi_sensors.c
+
+ASUS WMI EC HARDWARE MONITOR DRIVER
+S: Maintained
+F: drivers/hwmon/asus_wmi_ec_sensors.c
+
ASUS WIRELESS RADIO CONTROL DRIVER
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
+F: Documentation/ABI/stable/sysfs-block
+F: Documentation/block/
F: block/
F: drivers/block/
F: include/linux/blk*
S: Maintained
-F: Documentation/bpf/bpf_lsm.rst
+F: Documentation/bpf/prog_lsm.rst
F: include/linux/bpf_lsm.h
F: kernel/bpf/bpf_lsm.c
F: security/bpf/
F: drivers/net/ethernet/broadcom/unimac.h
BROADCOM BCM5301X ARM ARCHITECTURE
F: arch/arm/mach-bcm/bcm_5301x.c
BROADCOM BCM53573 ARM ARCHITECTURE
S: Maintained
-F: Documentation/devicetree/bindings/usb/brcm,bdc.txt
+F: Documentation/devicetree/bindings/usb/brcm,bdc.yaml
F: drivers/usb/gadget/udc/bdc/
BROADCOM BMIPS CPUFREQ DRIVER
S: Supported
-F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
+F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml
F: drivers/gpio/gpio-brcmstb.c
BROADCOM BRCMSTB I2C DRIVER
S: Supported
-F: Documentation/devicetree/bindings/net/brcm,bcmgenet.txt
+F: Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml
F: Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml
F: drivers/net/ethernet/broadcom/genet/
F: drivers/net/ethernet/broadcom/unimac.h
S: Maintained
-F: Documentation/devicetree/bindings/net/brcm,amac.txt
+F: Documentation/devicetree/bindings/net/brcm,amac.yaml
F: drivers/net/ethernet/broadcom/bgmac*
F: drivers/net/ethernet/broadcom/unimac.h
S: Maintained
-F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.txt
+F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.yaml
F: drivers/thermal/broadcom/brcmstb*
BROADCOM STB DPFE DRIVER
S: Supported
F: drivers/net/ethernet/broadcom/bcmsysport.*
F: drivers/net/ethernet/broadcom/unimac.h
+F: Documentation/devicetree/bindings/net/brcm,systemport.yaml
BROADCOM TG3 GIGABIT ETHERNET DRIVER
CIRRUS LOGIC AUDIO CODEC DRIVERS
S: Maintained
+F: Documentation/devicetree/bindings/sound/cirrus,cs*
+F: sound/pci/hda/cs*
F: sound/soc/codecs/cs*
CIRRUS LOGIC DSP FIRMWARE DRIVER
S: Odd Fixes
F: drivers/comedi/
+F: include/linux/comedi/
+F: include/uapi/linux/comedi.h
COMMON CLK FRAMEWORK
T: git git://linuxtv.org/media_tree.git
F: drivers/media/platform/sti/delta
+DELTA AHE-50DC FAN CONTROL MODULE DRIVER
+S: Maintained
+F: drivers/hwmon/pmbus/delta-ahe50dc-fan.c
+
DELTA DPS920AB PSU DRIVER
DRM DRIVER FOR MSM ADRENO GPU
DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS
S: Supported
-T: git git://github.com/skeggsb/linux
+W: https://nouveau.freedesktop.org/
+Q: https://patchwork.freedesktop.org/project/nouveau/
+Q: https://gitlab.freedesktop.org/drm/nouveau/-/merge_requests
+B: https://gitlab.freedesktop.org/drm/nouveau/-/issues
+C: irc://irc.oftc.net/nouveau
+T: git https://gitlab.freedesktop.org/drm/nouveau.git
F: drivers/gpu/drm/nouveau/
F: include/uapi/drm/nouveau_drm.h
F: drivers/gpu/drm/atmel-hlcdc/
DRM DRIVERS FOR BRIDGE CHIPS
-M: Andrzej Hajda <a.hajda@samsung.com>
+M: Andrzej Hajda <andrzej.hajda@intel.com>
S: Supported
T: git git://linuxtv.org/pinchartl/media drm/du/next
+F: Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml
F: Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml
F: Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml
F: Documentation/devicetree/bindings/display/renesas,du.yaml
F: drivers/gpu/drm/panel/
F: include/drm/drm_panel.h
+DRM PRIVACY-SCREEN CLASS
+S: Maintained
+T: git git://anongit.freedesktop.org/drm/drm-misc
+F: drivers/gpu/drm/drm_privacy_screen*
+F: include/drm/drm_privacy_screen*
+
DRM TTM SUBSYSTEM
F: drivers/mmc/host/cqhci*
EMULEX 10Gbps iSCSI - OneConnect DRIVER
S: Supported
W: http://www.broadcom.com
F: drivers/base/firmware_loader/
F: include/linux/firmware.h
-FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card)
-S: Maintained
-F: drivers/block/rsxx/
-
FLEXTIMER FTM-QUADDEC DRIVER
FREESCALE CAAM (Cryptographic Acceleration and Assurance Module) DRIVER
S: Maintained
F: Documentation/devicetree/bindings/crypto/fsl-sec4.txt
F: include/linux/hid*
F: include/uapi/linux/hid*
+HID LOGITECH DRIVERS
+S: Maintained
+F: drivers/hid/hid-logitech-*
+
HID PLAYSTATION DRIVER
HISILICON PMU DRIVER
S: Supported
W: http://www.hisilicon.com
+F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst
F: Documentation/admin-guide/perf/hisi-pmu.rst
F: drivers/perf/hisilicon
HISILICON SECURITY ENGINE V2 DRIVER (SEC2)
S: Maintained
F: Documentation/ABI/testing/debugfs-hisi-sec
S: Supported
W: https://01.org/linuxgraphics/
F: drivers/crypto/keembay/Kconfig
F: drivers/crypto/keembay/Makefile
F: drivers/crypto/keembay/keembay-ocs-ecc.c
-F: drivers/crypto/keembay/ocs-ecc-curve-defs.h
INTEL KEEM BAY OCS HCU CRYPTO DRIVER
F: drivers/crypto/keembay/ocs-hcu.c
F: drivers/crypto/keembay/ocs-hcu.h
+INTEL THUNDER BAY EMMC PHY DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/phy/intel,phy-thunderbay-emmc.yaml
+F: drivers/phy/intel/phy-intel-thunderbay-emmc.c
+
INTEL MANAGEMENT ENGINE (mei)
F: drivers/mfd/intel_soc_pmic*
F: include/linux/mfd/intel_soc_pmic*
-INTEL PMT DRIVER
-S: Maintained
-F: drivers/mfd/intel_pmt.c
+INTEL PMT DRIVERS
+S: Supported
F: drivers/platform/x86/intel/pmt/
INTEL PRO/WIRELESS 2100, 2200BG, 2915ABG NETWORK CONNECTION SUPPORT
S: Maintained
F: drivers/platform/x86/intel/uncore-frequency.c
+INTEL VENDOR SPECIFIC EXTENDED CAPABILITIES DRIVER
+S: Supported
+F: drivers/platform/x86/intel/vsec.*
+
INTEL VIRTUAL BUTTON DRIVER
W: http://legousb.sourceforge.net/
F: drivers/usb/misc/legousbtower.c
+LETSKETCH HID TABLET DRIVER
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git
+F: drivers/hid/hid-letsketch.c
+
LG LAPTOP EXTRAS
F: Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml
F: drivers/media/i2c/max9286.c
+MAX96712 QUAD GMSL2 DESERIALIZER DRIVER
+S: Maintained
+F: drivers/staging/media/max96712/max96712.c
+
MAX9860 MONO AUDIO VOICE CODEC DRIVER
F: Documentation/devicetree/bindings/power/supply/maxim,max17042.yaml
F: drivers/power/supply/max17042_battery.c
+MAXIM MAX20086 CAMERA POWER PROTECTOR DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/regulator/maxim,max20086.yaml
+F: drivers/regulator/max20086-regulator.c
+
MAXIM MAX77650 PMIC MFD DRIVER
F: drivers/regulator/max77802-regulator.c
F: include/dt-bindings/*/*max77802.h
+MAXIM MAX77976 BATTERY CHARGER
+S: Supported
+F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml
+F: drivers/power/supply/max77976_charger.c
+
MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS
S: Supported
-F: Documentation/devicetree/bindings/*/max77686.txt
+F: Documentation/devicetree/bindings/*/maxim,max77686.yaml
F: Documentation/devicetree/bindings/clock/maxim,max77686.txt
F: Documentation/devicetree/bindings/mfd/max14577.txt
F: Documentation/devicetree/bindings/mfd/max77693.txt
S: Maintained
F: drivers/net/ethernet/microchip/lan743x_*
+MICROCHIP LAN966X ETHERNET DRIVER
+S: Maintained
+F: drivers/net/ethernet/microchip/lan966x/*
+
MICROCHIP LCDFB DRIVER
F: drivers/gpu/drm/imx/dcss/
NXP i.MX 8QXP ADC DRIVER
-S: Supported
+S: Maintained
F: Documentation/devicetree/bindings/iio/adc/nxp,imx8qxp-adc.yaml
F: drivers/iio/adc/imx8qxp-adc.c
+NXP i.MX 7D/6SX/6UL AND VF610 ADC DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/iio/adc/fsl,imx7d-adc.yaml
+F: Documentation/devicetree/bindings/iio/adc/fsl,vf610-adc.yaml
+F: drivers/iio/adc/imx7d_adc.c
+F: drivers/iio/adc/vf610_adc.c
+
NXP PF8100/PF8121A/PF8200 PMIC REGULATOR DEVICE DRIVER
S: Maintained
F: Documentation/hwmon/nzxt-kraken2.rst
F: drivers/hwmon/nzxt-kraken2.c
+NZXT-SMART2 HARDWARE MONITORING DRIVER
+S: Maintained
+F: Documentation/hwmon/nzxt-smart2.rst
+F: drivers/hwmon/nzxt-smart2.c
+
OBJAGG
OMNIVISION OV5670 SENSOR DRIVER
S: Maintained
T: git git://linuxtv.org/media_tree.git
T: git git://linuxtv.org/media_tree.git
F: drivers/media/i2c/ov5675.c
+OMNIVISION OV5693 SENSOR DRIVER
+S: Maintained
+T: git git://linuxtv.org/media_tree.git
+F: drivers/media/i2c/ov5693.c
+
OMNIVISION OV5695 SENSOR DRIVER
F: include/trace/events/page_pool.h
F: net/core/page_pool.c
+PAGE TABLE CHECK
+S: Maintained
+F: Documentation/vm/page_table_check.rst
+F: include/linux/page_table_check.h
+F: mm/page_table_check.c
+
PANASONIC LAPTOP ACPI EXTRAS DRIVER
PCMCIA SUBSYSTEM
S: Odd Fixes
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/pcmcia.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git
F: Documentation/pcmcia/
F: drivers/pcmcia/
F: include/pcmcia/
S: Maintained
F: drivers/pinctrl/pinctrl-single.c
+PIN CONTROLLER - THUNDERBAY
+S: Supported
+F: drivers/pinctrl/pinctrl-thunderbay.c
+
PKTCDVD DRIVER
S: Orphan
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git
F: include/linux/printk.h
F: kernel/printk/
F: Documentation/devicetree/bindings/net/wireless/qca,ath9k.yaml
F: drivers/net/wireless/ath/ath9k/
+QUALCOMM BAM-DMUX WWAN NETWORK DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/net/qcom,bam-dmux.yaml
+F: drivers/net/wwan/qcom_bam_dmux.c
+
QUALCOMM CAMERA SUBSYSTEM DRIVER
F: Documentation/devicetree/bindings/media/*camss*
F: drivers/media/platform/qcom/camss/
+QUALCOMM CLOCK DRIVERS
+S: Supported
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git
+F: Documentation/devicetree/bindings/clock/qcom,*
+F: drivers/clk/qcom/
+F: include/dt-bindings/clock/qcom,*
+
QUALCOMM CORE POWER REDUCTION (CPR) AVS DRIVER
RANDOM NUMBER DRIVER
+T: git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git
S: Maintained
F: drivers/char/random.c
F: Documentation/devicetree/bindings/iio/adc/renesas,rzg2l-adc.yaml
F: drivers/iio/adc/rzg2l_adc.c
+RENESAS R-CAR GEN3 & RZ/N1 NAND CONTROLLER DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/mtd/renesas-nandc.yaml
+F: drivers/mtd/nand/raw/renesas-nand-controller.c
+
RESET CONTROLLER FRAMEWORK
S: Maintained
S: Supported
-F: Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt
-F: Documentation/devicetree/bindings/regulator/rohm,bd70528-regulator.txt
F: drivers/clk/clk-bd718x7.c
-F: drivers/gpio/gpio-bd70528.c
F: drivers/gpio/gpio-bd71815.c
F: drivers/gpio/gpio-bd71828.c
-F: drivers/mfd/rohm-bd70528.c
F: drivers/mfd/rohm-bd71828.c
F: drivers/mfd/rohm-bd718x7.c
F: drivers/mfd/rohm-bd9576.c
-F: drivers/power/supply/bd70528-charger.c
-F: drivers/regulator/bd70528-regulator.c
F: drivers/regulator/bd71815-regulator.c
F: drivers/regulator/bd71828-regulator.c
F: drivers/regulator/bd718x7-regulator.c
F: drivers/regulator/bd9576-regulator.c
F: drivers/regulator/rohm-regulator.c
F: drivers/rtc/rtc-bd70528.c
-F: drivers/watchdog/bd70528_wdt.c
F: drivers/watchdog/bd9576_wdt.c
-F: include/linux/mfd/rohm-bd70528.h
F: include/linux/mfd/rohm-bd71815.h
F: include/linux/mfd/rohm-bd71828.h
F: include/linux/mfd/rohm-bd718x7.h
F: drivers/nfc/s3fwrn5
SAMSUNG S5C73M3 CAMERA DRIVER
S: Supported
F: drivers/media/i2c/s5c73m3/*
SAMSUNG S5K5BAF CAMERA DRIVER
S: Supported
F: drivers/media/i2c/s5k5baf.c
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/snawrocki/clk.git
-F: Documentation/devicetree/bindings/clock/exynos*.txt
F: Documentation/devicetree/bindings/clock/samsung,*.yaml
F: Documentation/devicetree/bindings/clock/samsung,s3c*
-F: Documentation/devicetree/bindings/clock/samsung,s5p*
F: drivers/clk/samsung/
F: include/dt-bindings/clock/exynos*.h
F: include/dt-bindings/clock/s3c*.h
S: Maintained
F: drivers/mmc/host/sdhci-omap.c
+SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) NXP i.MX DRIVER
+S: Maintained
+F: drivers/mmc/host/sdhci-esdhc-imx.c
+
SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
F: include/linux/arm_sdei.h
F: include/uapi/linux/arm_sdei.h
-SOFTWARE NODES
+SOFTWARE NODES AND DEVICE PROPERTIES
S: Maintained
+F: drivers/base/property.c
F: drivers/base/swnode.c
+F: include/linux/fwnode.h
+F: include/linux/property.h
SOFTWARE RAID (Multiple Disks) SUPPORT
F: include/sound/
F: include/uapi/sound/
F: sound/
+F: tools/testing/selftests/alsa
SOUND - COMPRESSED AUDIO
F: sound/core/pcm_dmaengine.c
F: sound/soc/soc-generic-dmaengine-pcm.c
+SOUND - ALSA SELFTESTS
+S: Supported
+F: tools/testing/selftests/alsa
+
SOUND - SOC LAYER / DYNAMIC AUDIO POWER MANAGEMENT (ASoC)
SPI NOR SUBSYSTEM
S: Maintained
W: http://www.linux-mtd.infradead.org/
S: Odd Fixes
F: drivers/net/ethernet/adaptec/starfire*
+STARFIVE JH7100 CLOCK DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml
+F: drivers/clk/starfive/clk-starfive-jh7100.c
+F: include/dt-bindings/clock/starfive-jh7100.h
+
+STARFIVE JH7100 PINCTRL DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml
+F: drivers/pinctrl/pinctrl-starfive.c
+F: include/dt-bindings/pinctrl/pinctrl-starfive.h
+
+STARFIVE JH7100 RESET CONTROLLER DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/reset/starfive,jh7100-reset.yaml
+F: drivers/reset/reset-starfive-jh7100.c
+F: include/dt-bindings/reset/starfive-jh7100.h
+
STATIC BRANCH/CALL
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git
+F: Documentation/arc/
F: Documentation/devicetree/bindings/arc/*
F: Documentation/devicetree/bindings/interrupt-controller/snps,arc*
F: arch/arc/
T: git git://github.com/srcres258/linux-doc.git doc-zh-tw
F: Documentation/translations/zh_TW/
-TRIVIAL PATCHES
-S: Maintained
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial.git
-K: ^Subject:.*(?i)trivial
-
TTY LAYER
W: http://www.linux-mtd.infradead.org/doc/ubifs.html
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git next
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git fixes
+F: Documentation/ABI/testing/sysfs-fs-ubifs
F: Documentation/filesystems/ubifs-authentication.rst
F: Documentation/filesystems/ubifs.rst
F: fs/ubifs/
VIRTIO GPU DRIVER
S: Maintained
S: Maintained
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git
F: Documentation/core-api/printk-formats.rst
F: lib/test_printf.c
F: lib/test_scanf.c
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm
F: arch/x86/mm/
+X86 PLATFORM ANDROID TABLETS DSDT FIXUP DRIVER
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git
+F: drivers/platform/x86/x86-android-tablets.c
+
X86 PLATFORM DRIVERS
F: drivers/xen/xen-scsiback.c
F: include/xen/interface/io/vscsiif.h
+XEN PVUSB DRIVER
+S: Supported
+F: drivers/usb/host/xen*
+F: include/xen/interface/io/usbif.h
+
XEN SOUND FRONTEND DRIVER
F: include/uapi/linux/dqblk_xfs.h
F: include/uapi/linux/fsmap.h
+XILINX AMS DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/iio/adc/xlnx,zynqmp-ams.yaml
+F: drivers/iio/adc/xilinx-ams.c
+
XILINX AXI ETHERNET DRIVER
S: Maintained
F: Documentation/devicetree/bindings/phy/xlnx,zynqmp-psgtr.yaml
F: drivers/phy/xilinx/phy-zynqmp.c
+XILINX EVENT MANAGEMENT DRIVER
+S: Maintained
+F: drivers/soc/xilinx/xlnx_event_manager.c
+F: include/linux/firmware/xlnx-event-manager.h
+
XILLYBUS DRIVER
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
bool
+ config ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ bool
+
config ARCH_SPLIT_ARG64
bool
help
config DYNAMIC_SIGFRAME
bool
+# Select if the arch has a named attribute group bound to NUMA device nodes.
+config HAVE_ARCH_NODE_DEV_GROUP
+ bool
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
+#include <linux/kfence.h>
#include <asm/system_misc.h>
#include <asm/system_info.h>
{ }
#endif /* CONFIG_MMU */
+static inline bool is_write_fault(unsigned int fsr)
+{
+ return (fsr & FSR_WRITE) && !(fsr & FSR_CM);
+}
+
static void die_kernel_fault(const char *msg, struct mm_struct *mm,
unsigned long addr, unsigned int fsr,
struct pt_regs *regs)
/*
* No handler, we'll have to terminate things with extreme prejudice.
*/
- if (addr < PAGE_SIZE)
+ if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
- else
+ } else {
+ if (kfence_handle_page_fault(addr, is_write_fault(fsr), regs))
+ return;
+
msg = "paging request";
+ }
die_kernel_fault(msg, mm, addr, fsr, regs);
}
}
#ifdef CONFIG_MMU
-#define VM_FAULT_BADMAP 0x010000
-#define VM_FAULT_BADACCESS 0x020000
+#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
static inline bool is_permission_fault(unsigned int fsr)
{
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if ((fsr & FSR_WRITE) && !(fsr & FSR_CM)) {
+ if (is_write_fault(fsr)) {
flags |= FAULT_FLAG_WRITE;
vm_flags = VM_WRITE;
}
return 0;
}
- if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
goto retry;
pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
addr);
+ kasan_non_canonical_hook(addr);
+
mem_abort_decode(esr);
show_pte(addr);
}
if (fault & VM_FAULT_RETRY) {
- if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
- mm_flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ mm_flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
mmap_read_unlock(mm);
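The fault-handler hunks above drop the FAULT_FLAG_ALLOW_RETRY test: a VM_FAULT_RETRY result is now simply retried once more with FAULT_FLAG_TRIED set. Below is a minimal userspace sketch of that retry shape; the flag values and the fake fault handler are invented for illustration and are not kernel definitions.

/* Minimal userspace model of the simplified fault-retry loop: any RETRY
 * result is retried exactly once more with a TRIED flag set.  All names
 * and flag values below are local stand-ins, not kernel definitions. */
#include <stdio.h>

#define FAULT_FLAG_TRIED  0x1
#define VM_FAULT_RETRY    0x2
#define VM_FAULT_ERROR    0x4

/* Fake handler: asks for one retry, then succeeds. */
static unsigned int fake_handle_mm_fault(unsigned int flags)
{
	return (flags & FAULT_FLAG_TRIED) ? 0 : VM_FAULT_RETRY;
}

int main(void)
{
	unsigned int flags = 0, fault;

retry:
	fault = fake_handle_mm_fault(flags);
	if (!(fault & VM_FAULT_ERROR) && (fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	printf("fault resolved with flags=%#x\n", flags);
	return 0;
}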
if (!inf->fn(far, esr, regs))
return;
- if (!user_mode(regs)) {
- pr_alert("Unhandled fault at 0x%016lx\n", addr);
- mem_abort_decode(esr);
- show_pte(addr);
- }
+ if (!user_mode(regs))
+ die_kernel_fault(inf->name, addr, esr, regs);
/*
* At this point we have an unrecognized fault type whose tag bits may
* Fix up get_user() and put_user().
* ASM_EXCEPTIONTABLE_ENTRY_EFAULT() sets the least-significant
* bit in the relative address of the fixup routine to indicate
- * that %r8 should be loaded with -EFAULT to report a userspace
- * access error.
+ * that gr[ASM_EXCEPTIONTABLE_REG] should be loaded with
+ * -EFAULT to report a userspace access error.
*/
if (fix->fixup & 1) {
- regs->gr[8] = -EFAULT;
+ regs->gr[ASM_EXCEPTIONTABLE_REG] = -EFAULT;
/* zero target register for get_user() */
if (parisc_acctyp(0, regs->iir) == VM_READ) {
unsigned long acc_type;
vm_fault_t fault = 0;
unsigned int flags;
-
- if (faulthandler_disabled())
- goto no_context;
+ char *msg;
tsk = current;
mm = tsk->mm;
- if (!mm)
+ if (!mm) {
+ msg = "Page fault: no context";
goto no_context;
+ }
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
goto bad_area;
BUG();
}
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_RETRY) {
- /*
- * No need to mmap_read_unlock(mm) as we would
- * have already released it in __lock_page_or_retry
- * in mm/filemap.c.
- */
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ if (fault & VM_FAULT_RETRY) {
+ /*
+ * No need to mmap_read_unlock(mm) as we would
+ * have already released it in __lock_page_or_retry
+ * in mm/filemap.c.
+ */
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
mmap_read_unlock(mm);
return;
force_sig_fault(signo, si_code, (void __user *) address);
return;
}
+ msg = "Page fault: bad address";
no_context:
return;
}
- parisc_terminate("Bad Address (null pointer deref?)", regs, code, address);
+ parisc_terminate(msg, regs, code, address);
- out_of_memory:
+out_of_memory:
mmap_read_unlock(mm);
- if (!user_mode(regs))
+ if (!user_mode(regs)) {
+ msg = "Page fault: out of memory";
goto no_context;
+ }
pagefault_out_of_memory();
}
#include <linux/kfence.h>
#include <linux/pkeys.h>
+#include <asm/asm-prototypes.h>
#include <asm/firmware.h>
#include <asm/interrupt.h>
#include <asm/page.h>
* case.
*/
if (unlikely(fault & VM_FAULT_RETRY)) {
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
mmap_read_unlock(current->mm);
{
bad_page_fault(regs, SIGSEGV);
}
+
+/*
+ * In radix, segment interrupts indicate the EA is not addressable by the
+ * page table geometry, so they are always sent here.
+ *
+ * In hash, this is called if do_slb_fault returns error. Typically it is
+ * because the EA was outside the region allowed by software.
+ */
+DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt)
+{
+ int err = regs->result;
+
+ if (err == -EFAULT) {
+ if (user_mode(regs))
+ _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
+ else
+ bad_page_fault(regs, SIGSEGV);
+ } else if (err == -EINVAL) {
+ unrecoverable_exception(regs);
+ } else {
+ BUG();
+ }
+}
#endif
pr_cont("R1:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
pr_cont("R2:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
pr_cont("R3:%016lx ", *table);
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
pr_cont("S:%016lx ", *table);
if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
}
table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
if (bad_address(table))
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
- (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_lock has not been released */
- current->thread.gmap_pfault = 1;
- fault = VM_FAULT_PFAULT;
- goto out_up;
- }
- flags &= ~FAULT_FLAG_RETRY_NOWAIT;
- flags |= FAULT_FLAG_TRIED;
- mmap_read_lock(mm);
- goto retry;
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
+ * not been released
+ */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
}
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+ flags |= FAULT_FLAG_TRIED;
+ mmap_read_lock(mm);
+ goto retry;
}
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
}
BUG();
}
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_RETRY) {
- flags |= FAULT_FLAG_TRIED;
+ if (fault & VM_FAULT_RETRY) {
+ flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ goto retry;
}
pmd = pmd_off(mm, address);
pagefault_out_of_memory();
return 0;
}
-EXPORT_SYMBOL(handle_page_fault);
static void show_segv_info(struct uml_pt_regs *regs)
{
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_SUPPORTS_LTO_CLANG
select HAVE_ARCH_KCSAN if X86_64
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ def_bool n
+
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DMA_COHERENT_POOL
- select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
select SRCU
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
config EFI_STUB
bool "EFI stub support"
- depends on EFI && !X86_USE_3DNOW
+ depends on EFI
depends on $(cc-option,-mabi=ms) || X86_32
select RELOCATABLE
help
#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__
+#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h>
+ #include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
return true;
if ((pte_flags(a) & _PAGE_PROTNONE) &&
- mm_tlb_flush_pending(mm))
+ atomic_read(&mm->tlb_flush_pending))
return true;
return false;
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
+ page_table_check_pte_set(mm, addr, ptep, pte);
set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(mm, addr, pmdp, pmd);
set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
+ page_table_check_pud_set(mm, addr, pudp, pud);
native_set_pud(pudp, pud);
}
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
return pte;
}
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
return pte;
}
+ #define __HAVE_ARCH_PTEP_CLEAR
+ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+ {
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
+ ptep_get_and_clear(mm, addr, ptep);
+ else
+ pte_clear(mm, addr, ptep);
+ }
+
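The ptep_clear() helper added above routes clears through ptep_get_and_clear() when CONFIG_PAGE_TABLE_CHECK is enabled, so the value being cleared is still available to the checker. The sketch below models that read-then-clear-then-audit pattern in plain userspace C; every name in it is a local stand-in, not kernel API.

/* Userspace analogue of clearing an entry while still being able to audit
 * the value that was cleared.  Everything here is an illustrative stand-in,
 * not kernel code. */
#include <stdio.h>

static int check_enabled = 1;	/* models IS_ENABLED(CONFIG_PAGE_TABLE_CHECK) */

static void audit_cleared(unsigned long old)
{
	printf("cleared entry that held %#lx\n", old);
}

static unsigned long get_and_clear(unsigned long *entry)
{
	unsigned long old = *entry;
	*entry = 0;
	return old;
}

static void entry_clear(unsigned long *entry)
{
	if (check_enabled)
		audit_cleared(get_and_clear(entry));	/* old value stays visible */
	else
		*entry = 0;				/* plain clear */
}

int main(void)
{
	unsigned long pte = 0xdeadb000;

	entry_clear(&pte);
	return 0;
}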
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- return native_pmdp_get_and_clear(pmdp);
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+
+ page_table_check_pmd_clear(mm, addr, pmd);
+
+ return pmd;
}
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
- return native_pudp_get_and_clear(pudp);
+ pud_t pud = native_pudp_get_and_clear(pudp);
+
+ page_table_check_pud_clear(mm, addr, pud);
+
+ return pud;
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
NULL,
};
- static const struct attribute_group zram_disk_attr_group = {
- .attrs = zram_disk_attrs,
- };
-
- static const struct attribute_group *zram_disk_attr_groups[] = {
- &zram_disk_attr_group,
- NULL,
- };
+ ATTRIBUTE_GROUPS(zram_disk);
/*
* Allocate and initialize new zram device. the function returns
zram->disk->major = zram_major;
zram->disk->first_minor = device_id;
zram->disk->minors = 1;
+ zram->disk->flags |= GENHD_FL_NO_PART;
zram->disk->fops = &zram_devops;
zram->disk->private_data = zram;
snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
- ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
+ ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)
goto out_cleanup_disk;
#include "dax-private.h"
#include "bus.h"
-static struct class *dax_class;
-
static DEFINE_MUTEX(dax_bus_lock);
#define DAX_NAME_LEN 30
static int dax_bus_match(struct device *dev, struct device_driver *drv);
+ /*
+ * Static dax regions are regions created by an external subsystem such as
+ * nvdimm, where a single range is assigned. Their boundaries are established
+ * by that external subsystem and are usually limited to one physical memory
+ * range. For PMEM, for example, the region is usually defined by NVDIMM
+ * namespace boundaries (i.e. a single contiguous range).
+ *
+ * On dynamic dax regions, the assigned region can be partitioned by dax core
+ * into multiple subdivisions. A subdivision is represented by one
+ * /dev/daxN.M device composed of one or more potentially discontiguous ranges.
+ *
+ * When allocating a dax region, drivers must set whether it's static
+ * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned
+ * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax
+ * devices it is NULL but afterwards allocated by dax core on device ->probe().
+ * Care is needed to make sure that dynamic dax devices are torn down with a
+ * cleared @pgmap field (see kill_dev_dax()).
+ */
static bool is_static(struct dax_region *dax_region)
{
return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0;
}
+ bool static_dev_dax(struct dev_dax *dev_dax)
+ {
+ return is_static(dev_dax->region);
+ }
+ EXPORT_SYMBOL_GPL(static_dev_dax);
+
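The comment above separates static regions (pgmap supplied by the creating subsystem) from dynamic ones (pgmap allocated at probe and cleared at teardown, see kill_dev_dax()). The following is a compressed userspace model of that lifecycle under those assumptions; the struct, flag value and helpers are invented for illustration.

/* Compressed model of the static/dynamic region rules described above.
 * The types, flag and helpers are local stand-ins for illustration only. */
#include <stdio.h>
#include <stdlib.h>

#define IORESOURCE_DAX_STATIC 0x1

struct region { unsigned long flags; };
struct dev_dax_model { struct region *region; void *pgmap; };

static int is_static(const struct region *r)
{
	return (r->flags & IORESOURCE_DAX_STATIC) != 0;
}

static void probe(struct dev_dax_model *d)
{
	/* dynamic devices get their pgmap allocated only here */
	if (!is_static(d->region) && !d->pgmap)
		d->pgmap = malloc(64);	/* arbitrary size, placeholder */
}

static void teardown(struct dev_dax_model *d)
{
	/* dynamic devices must not keep a stale pgmap for the next probe */
	if (!is_static(d->region)) {
		free(d->pgmap);
		d->pgmap = NULL;
	}
}

int main(void)
{
	struct region dyn = { .flags = 0 };
	struct dev_dax_model d = { .region = &dyn, .pgmap = NULL };

	probe(&d);
	teardown(&d);
	printf("pgmap after teardown: %p\n", d.pgmap);
	return 0;
}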
static u64 dev_dax_size(struct dev_dax *dev_dax)
{
u64 size = 0;
kill_dax(dax_dev);
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+ /*
+ * Dynamic dax regions have their pgmap allocated via devm_kzalloc() and
+ * thus freed by devm. Clear the pgmap so that probe() does not see stale
+ * pgmap ranges left over from previous reconfigurations of region devices.
+ */
+ if (!static_dev_dax(dev_dax))
+ dev_dax->pgmap = NULL;
}
EXPORT_SYMBOL_GPL(kill_dev_dax);
}
/*
- * No 'host' or dax_operations since there is no access to this
- * device outside of mmap of the resulting character device.
+ * No dax_operations since there is no access to this device outside of
+ * mmap of the resulting character device.
*/
- dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
+ dax_dev = alloc_dax(dev_dax, NULL);
if (IS_ERR(dax_dev)) {
rc = PTR_ERR(dax_dev);
goto err_alloc_dax;
}
+ set_dax_synchronous(dax_dev);
+ set_dax_nocache(dax_dev);
+ set_dax_nomc(dax_dev);
/* a device_dax instance is dead while the driver is not attached */
kill_dax(dax_dev);
inode = dax_inode(dax_dev);
dev->devt = inode->i_rdev;
- if (data->subsys == DEV_DAX_BUS)
- dev->bus = &dax_bus_type;
- else
- dev->class = dax_class;
+ dev->bus = &dax_bus_type;
dev->parent = parent;
dev->type = &dev_dax_type;
int __init dax_bus_init(void)
{
- int rc;
-
- if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) {
- dax_class = class_create(THIS_MODULE, "dax");
- if (IS_ERR(dax_class))
- return PTR_ERR(dax_class);
- }
-
- rc = bus_register(&dax_bus_type);
- if (rc)
- class_destroy(dax_class);
- return rc;
+ return bus_register(&dax_bus_type);
}
void __exit dax_bus_exit(void)
{
bus_unregister(&dax_bus_type);
- class_destroy(dax_class);
}
struct range *range, int target_node, unsigned int align,
unsigned long flags);
-enum dev_dax_subsys {
- DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */
- DEV_DAX_CLASS,
-};
-
struct dev_dax_data {
struct dax_region *dax_region;
struct dev_pagemap *pgmap;
- enum dev_dax_subsys subsys;
resource_size_t size;
int id;
};
struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
-/* to be deleted when DEV_DAX_CLASS is removed */
-struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys);
-
struct dax_device_driver {
struct device_driver drv;
struct list_head ids;
__dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
void dax_driver_unregister(struct dax_device_driver *dax_drv);
void kill_dev_dax(struct dev_dax *dev_dax);
+ bool static_dev_dax(struct dev_dax *dev_dax);
-#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)
-int dev_dax_probe(struct dev_dax *dev_dax);
-#endif
-
/*
* While run_dax() is potentially a generic operation that could be
* defined in include/linux/dax.h we don't want to grow any users
return -1;
}
+ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
+ unsigned long fault_size)
+ {
+ unsigned long i, nr_pages = fault_size / PAGE_SIZE;
+ struct file *filp = vmf->vma->vm_file;
+ struct dev_dax *dev_dax = filp->private_data;
+ pgoff_t pgoff;
+
+ /* mapping is only set on the head */
+ if (dev_dax->pgmap->vmemmap_shift)
+ nr_pages = 1;
+
+ pgoff = linear_page_index(vmf->vma,
+ ALIGN(vmf->address, fault_size));
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+
+ page = compound_head(page);
+ if (page->mapping)
+ continue;
+
+ page->mapping = filp->f_mapping;
+ page->index = pgoff + i;
+ }
+ }
+
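dax_set_mapping() above maps fault_size / PAGE_SIZE base pages per fault, but collapses that to the single head page when the pgmap uses compound pages (vmemmap_shift non-zero). A standalone arithmetic sketch, assuming the usual 4 KiB page size and a 2 MiB PMD-sized fault:

/* Worked example of the sizing logic in dax_set_mapping(): a PMD-sized
 * fault covers fault_size / PAGE_SIZE base pages, but when the pgmap uses
 * compound pages (vmemmap_shift set) only the head page gets a mapping.
 * The sizes below are the usual x86-64 values, assumed for illustration. */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size  = 4096;		/* 4 KiB */
	const unsigned long fault_size = 2UL << 20;	/* PMD fault: 2 MiB */
	unsigned long vmemmap_shift;

	for (vmemmap_shift = 0; vmemmap_shift <= 9; vmemmap_shift += 9) {
		unsigned long nr_pages = fault_size / page_size;	/* 512 */

		if (vmemmap_shift)	/* compound: only the head is mapped */
			nr_pages = 1;

		printf("vmemmap_shift=%lu -> pages to map: %lu\n",
		       vmemmap_shift, nr_pages);
	}
	return 0;
}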
static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
+ pfn_t pfn;
unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+ pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
+ dax_set_mapping(vmf, pfn, fault_size);
+
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pgoff_t pgoff;
+ pfn_t pfn;
unsigned int fault_size = PMD_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+ pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
+ dax_set_mapping(vmf, pfn, fault_size);
+
+ return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
unsigned long pud_addr = vmf->address & PUD_MASK;
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
pgoff_t pgoff;
+ pfn_t pfn;
unsigned int fault_size = PUD_SIZE;
return VM_FAULT_SIGBUS;
}
- *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+ pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
+ dax_set_mapping(vmf, pfn, fault_size);
+
+ return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
- struct vm_fault *vmf, pfn_t *pfn)
+ struct vm_fault *vmf)
{
return VM_FAULT_FALLBACK;
}
enum page_entry_size pe_size)
{
struct file *filp = vmf->vma->vm_file;
- unsigned long fault_size;
vm_fault_t rc = VM_FAULT_SIGBUS;
int id;
- pfn_t pfn;
struct dev_dax *dev_dax = filp->private_data;
dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
id = dax_read_lock();
switch (pe_size) {
case PE_SIZE_PTE:
- fault_size = PAGE_SIZE;
- rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
+ rc = __dev_dax_pte_fault(dev_dax, vmf);
break;
case PE_SIZE_PMD:
- fault_size = PMD_SIZE;
- rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
+ rc = __dev_dax_pmd_fault(dev_dax, vmf);
break;
case PE_SIZE_PUD:
- fault_size = PUD_SIZE;
- rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
+ rc = __dev_dax_pud_fault(dev_dax, vmf);
break;
default:
rc = VM_FAULT_SIGBUS;
}
- if (rc == VM_FAULT_NOPAGE) {
- unsigned long i;
- pgoff_t pgoff;
-
- /*
- * In the device-dax case the only possibility for a
- * VM_FAULT_NOPAGE result is when device-dax capacity is
- * mapped. No need to consider the zero page, or racing
- * conflicting mappings.
- */
- pgoff = linear_page_index(vmf->vma, vmf->address
- & ~(fault_size - 1));
- for (i = 0; i < fault_size / PAGE_SIZE; i++) {
- struct page *page;
-
- page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
- if (page->mapping)
- continue;
- page->mapping = filp->f_mapping;
- page->index = pgoff + i;
- }
- }
dax_read_unlock(id);
return rc;
void *addr;
int rc, i;
- pgmap = dev_dax->pgmap;
- if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1,
- "static pgmap / multi-range device conflict\n"))
- return -EINVAL;
+ if (static_dev_dax(dev_dax)) {
+ if (dev_dax->nr_range > 1) {
+ dev_warn(dev,
+ "static pgmap / multi-range device conflict\n");
+ return -EINVAL;
+ }
- if (!pgmap) {
- pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range)
- * (dev_dax->nr_range - 1), GFP_KERNEL);
+ pgmap = dev_dax->pgmap;
+ } else {
+ if (dev_dax->pgmap) {
+ dev_warn(dev,
+ "dynamic-dax with pre-populated page map\n");
+ return -EINVAL;
+ }
+
+ pgmap = devm_kzalloc(dev,
+ struct_size(pgmap, ranges, dev_dax->nr_range - 1),
+ GFP_KERNEL);
if (!pgmap)
return -ENOMEM;
+
pgmap->nr_range = dev_dax->nr_range;
+ dev_dax->pgmap = pgmap;
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+ pgmap->ranges[i] = *range;
+ }
}
for (i = 0; i < dev_dax->nr_range; i++) {
i, range->start, range->end);
return -EBUSY;
}
- /* don't update the range for static pgmap */
- if (!dev_dax->pgmap)
- pgmap->ranges[i] = *range;
}
pgmap->type = MEMORY_DEVICE_GENERIC;
+ if (dev_dax->align > PAGE_SIZE)
+ pgmap->vmemmap_shift =
+ order_base_2(dev_dax->align >> PAGE_SHIFT);
addr = devm_memremap_pages(dev, pgmap);
if (IS_ERR(addr))
return PTR_ERR(addr);
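The vmemmap_shift assignment above is order_base_2(align >> PAGE_SHIFT); for a 2 MiB aligned device with 4 KiB pages that is order_base_2(512) = 9. The sketch below redoes that arithmetic with a local ceil-log2 helper standing in for order_base_2(); the alignments chosen are illustrative assumptions.

/* Same arithmetic as pgmap->vmemmap_shift = order_base_2(align >> PAGE_SHIFT),
 * done in plain C.  order_base_2() is modeled by a local helper returning the
 * smallest order with 2^order >= n. */
#include <stdio.h>

static unsigned int order_base_2_model(unsigned long n)
{
	unsigned int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

int main(void)
{
	const unsigned long page_shift = 12;			/* 4 KiB pages */
	const unsigned long aligns[] = { 2UL << 20, 1UL << 30 };	/* 2 MiB, 1 GiB */

	for (int i = 0; i < 2; i++) {
		unsigned long align = aligns[i];
		unsigned int shift = order_base_2_model(align >> page_shift);

		printf("align %lu KiB -> vmemmap_shift %u\n", align >> 10, shift);
	}
	return 0;	/* prints shift 9 for 2 MiB, 18 for 1 GiB */
}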
inode = dax_inode(dax_dev);
cdev = inode->i_cdev;
cdev_init(cdev, &dax_fops);
- if (dev->class) {
- /* for the CONFIG_DEV_DAX_PMEM_COMPAT case */
- cdev->owner = dev->parent->driver->owner;
- } else
- cdev->owner = dev->driver->owner;
+ cdev->owner = dev->driver->owner;
cdev_set_parent(cdev, &dev->kobj);
rc = cdev_add(cdev, dev->devt, 1);
if (rc)
#include <linux/serial_core.h>
#include <linux/sysfs.h>
#include <linux/random.h>
+ #include <linux/kmemleak.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include <asm/page.h>
if (nomap) {
/*
* If the memory is already reserved (by another region), we
- * should not allow it to be marked nomap.
+ * should not allow it to be marked nomap, but don't worry
+ * if the region isn't memory as it won't be mapped.
*/
- if (memblock_is_region_reserved(base, size))
+ if (memblock_overlaps_region(&memblock.memory, base, size) &&
+ memblock_is_region_reserved(base, size))
return -EBUSY;
return memblock_mark_nomap(base, size);
size = dt_mem_next_cell(dt_root_size_cells, &prop);
if (size &&
- early_init_dt_reserve_memory_arch(base, size, nomap) == 0)
+ early_init_dt_reserve_memory_arch(base, size, nomap) == 0) {
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
+ if (!nomap)
+ kmemleak_alloc_phys(base, size, 0, 0);
+ }
else
pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
elfcorehdr_addr, elfcorehdr_size);
}
-static phys_addr_t cap_mem_addr;
-static phys_addr_t cap_mem_size;
+static unsigned long chosen_node_offset = -FDT_ERR_NOTFOUND;
/**
* early_init_dt_check_for_usable_mem_range - Decode usable memory range
* location from flat tree
- * @node: reference to node containing usable memory range location ('chosen')
*/
-static void __init early_init_dt_check_for_usable_mem_range(unsigned long node)
+void __init early_init_dt_check_for_usable_mem_range(void)
{
const __be32 *prop;
int len;
+ phys_addr_t cap_mem_addr;
+ phys_addr_t cap_mem_size;
+ unsigned long node = chosen_node_offset;
+
+ if ((long)node < 0)
+ return;
pr_debug("Looking for usable-memory-range property... ");
pr_debug("cap_mem_start=%pa cap_mem_size=%pa\n", &cap_mem_addr,
&cap_mem_size);
+
+ memblock_cap_memory_range(cap_mem_addr, cap_mem_size);
}
#ifdef CONFIG_SERIAL_EARLYCON
/*
* early_init_dt_scan_root - fetch the top level address and size cells
*/
-int __init early_init_dt_scan_root(unsigned long node, const char *uname,
- int depth, void *data)
+int __init early_init_dt_scan_root(void)
{
const __be32 *prop;
+ const void *fdt = initial_boot_params;
+ int node = fdt_path_offset(fdt, "/");
- if (depth != 0)
- return 0;
+ if (node < 0)
+ return -ENODEV;
dt_root_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
dt_root_addr_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT;
dt_root_addr_cells = be32_to_cpup(prop);
pr_debug("dt_root_addr_cells = %x\n", dt_root_addr_cells);
- /* break now */
- return 1;
+ return 0;
}
u64 __init dt_mem_next_cell(int s, const __be32 **cellp)
/*
* early_init_dt_scan_memory - Look for and parse memory nodes
*/
-int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
- int depth, void *data)
+int __init early_init_dt_scan_memory(void)
{
- const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- const __be32 *reg, *endp;
- int l;
- bool hotpluggable;
+ int node;
+ const void *fdt = initial_boot_params;
- /* We are scanning "memory" nodes only */
- if (type == NULL || strcmp(type, "memory") != 0)
- return 0;
+ fdt_for_each_subnode(node, fdt, 0) {
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *reg, *endp;
+ int l;
+ bool hotpluggable;
- reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
- if (reg == NULL)
- reg = of_get_flat_dt_prop(node, "reg", &l);
- if (reg == NULL)
- return 0;
+ /* We are scanning "memory" nodes only */
+ if (type == NULL || strcmp(type, "memory") != 0)
+ continue;
- endp = reg + (l / sizeof(__be32));
- hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
+ reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
+ if (reg == NULL)
+ reg = of_get_flat_dt_prop(node, "reg", &l);
+ if (reg == NULL)
+ continue;
- pr_debug("memory scan node %s, reg size %d,\n", uname, l);
+ endp = reg + (l / sizeof(__be32));
+ hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
- while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
- u64 base, size;
+ pr_debug("memory scan node %s, reg size %d,\n",
+ fdt_get_name(fdt, node, NULL), l);
- base = dt_mem_next_cell(dt_root_addr_cells, &reg);
- size = dt_mem_next_cell(dt_root_size_cells, &reg);
+ while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+ u64 base, size;
- if (size == 0)
- continue;
- pr_debug(" - %llx, %llx\n", base, size);
+ base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+ size = dt_mem_next_cell(dt_root_size_cells, &reg);
- early_init_dt_add_memory_arch(base, size);
+ if (size == 0)
+ continue;
+ pr_debug(" - %llx, %llx\n", base, size);
- if (!hotpluggable)
- continue;
+ early_init_dt_add_memory_arch(base, size);
- if (memblock_mark_hotplug(base, size))
- pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
- base, base + size);
- }
+ if (!hotpluggable)
+ continue;
+ if (memblock_mark_hotplug(base, size))
+ pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
+ base, base + size);
+ }
+ }
return 0;
}
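The rewritten scan above walks the flattened tree directly with fdt_for_each_subnode() instead of the of_scan_flat_dt() callback. The same iteration can be reproduced in userspace with libfdt, as sketched below; the 1 MiB blob buffer and the dtb path taken from argv are assumptions of the sketch, and error handling is trimmed.

/* Userspace walk of a .dtb's top-level nodes with libfdt, mirroring the
 * fdt_for_each_subnode() style used above.  Build with -lfdt. */
#include <stdio.h>
#include <string.h>
#include <libfdt.h>

int main(int argc, char **argv)
{
	static char blob[1 << 20];	/* assume the dtb fits in 1 MiB */
	FILE *f;
	int node;

	if (argc < 2)
		return 1;
	f = fopen(argv[1], "rb");
	if (!f)
		return 1;
	if (fread(blob, 1, sizeof(blob), f) == 0)
		return 1;
	fclose(f);

	if (fdt_check_header(blob))
		return 1;

	fdt_for_each_subnode(node, blob, 0) {
		int len;
		const char *type = fdt_getprop(blob, node, "device_type", &len);

		/* report only "memory" nodes, as the kernel scan does */
		if (type && !strcmp(type, "memory"))
			printf("memory node: %s\n", fdt_get_name(blob, node, NULL));
	}
	return 0;
}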
-int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
- int depth, void *data)
+int __init early_init_dt_scan_chosen(char *cmdline)
{
- int l;
+ int l, node;
const char *p;
const void *rng_seed;
+ const void *fdt = initial_boot_params;
- pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ node = fdt_path_offset(fdt, "/chosen@0");
+ if (node < 0)
+ return -ENOENT;
- if (depth != 1 || !data ||
- (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0))
- return 0;
+ chosen_node_offset = node;
early_init_dt_check_for_initrd(node);
early_init_dt_check_for_elfcorehdr(node);
- early_init_dt_check_for_usable_mem_range(node);
/* Retrieve command line */
p = of_get_flat_dt_prop(node, "bootargs", &l);
if (p != NULL && l > 0)
- strlcpy(data, p, min(l, COMMAND_LINE_SIZE));
+ strlcpy(cmdline, p, min(l, COMMAND_LINE_SIZE));
/*
* CONFIG_CMDLINE is meant to be a default in case nothing else
*/
#ifdef CONFIG_CMDLINE
#if defined(CONFIG_CMDLINE_EXTEND)
- strlcat(data, " ", COMMAND_LINE_SIZE);
- strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+ strlcat(cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#elif defined(CONFIG_CMDLINE_FORCE)
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+ strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#else
/* No arguments from boot loader, use kernel's cmdline */
- if (!((char *)data)[0])
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+ if (!((char *)cmdline)[0])
+ strlcpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#endif
#endif /* CONFIG_CMDLINE */
- pr_debug("Command line is: %s\n", (char *)data);
+ pr_debug("Command line is: %s\n", (char *)cmdline);
rng_seed = of_get_flat_dt_prop(node, "rng-seed", &l);
if (rng_seed && l > 0) {
fdt_totalsize(initial_boot_params));
}
- /* break now */
- return 1;
+ return 0;
}
#ifndef MIN_MEMBLOCK_ADDR
void __init early_init_dt_scan_nodes(void)
{
- int rc = 0;
+ int rc;
/* Initialize {size,address}-cells info */
- of_scan_flat_dt(early_init_dt_scan_root, NULL);
+ early_init_dt_scan_root();
/* Retrieve various information from the /chosen node */
- rc = of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line);
- if (!rc)
+ rc = early_init_dt_scan_chosen(boot_command_line);
+ if (rc)
pr_warn("No chosen node found, continuing without\n");
/* Setup memory, calling early_init_dt_add_memory_arch */
- of_scan_flat_dt(early_init_dt_scan_memory, NULL);
+ early_init_dt_scan_memory();
/* Handle linux,usable-memory-range property */
- memblock_cap_memory_range(cap_mem_addr, cap_mem_size);
+ early_init_dt_check_for_usable_mem_range();
}
bool __init early_init_dt_scan(void *params)
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
- #include <linux/backing-dev.h>
#include <linux/iomap.h>
+ #include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
- EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
- le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
depth);
return -EFSCORRUPTED;
}
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
+ ext4_ext_get_actual_len(newext));
if (unwritten)
ext4_ext_mark_unwritten(ex);
- eh = path[depth].p_hdr;
nearex = ex;
goto merge;
}
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry;
}
if (err)
retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(GFP_ATOMIC);
goto retry_remove_space;
}
return err;
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
- (offset + len - 1) >> inode->i_sb->s_blocksize_bits);
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret >= 0)
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- ext4_fc_start_update(inode);
-
if (mode & FALLOC_FL_PUNCH_HOLE) {
ret = ext4_punch_hole(inode, offset, len);
goto exit;
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
exit:
- ext4_fc_stop_update(inode);
return ret;
}
ret = PTR_ERR(handle);
goto out_mmap;
}
- ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
out_stop:
ext4_journal_stop(handle);
- ext4_fc_stop_ineligible(sb);
out_mmap:
filemap_invalidate_unlock(mapping);
out_mutex:
ret = PTR_ERR(handle);
goto out_mmap;
}
- ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
out_stop:
ext4_journal_stop(handle);
- ext4_fc_stop_ineligible(sb);
out_mmap:
filemap_invalidate_unlock(mapping);
out_mutex:
}
XFS_STATS_INC(bp->b_mount, xb_page_retries);
- congestion_wait(BLK_RW_ASYNC, HZ / 50);
+ memalloc_retry_wait(gfp_mask);
}
return 0;
}
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
+ fs_put_dax(btp->bt_daxdev);
kmem_free(btp);
}
return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}
-xfs_buftarg_t *
+struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- struct dax_device *dax_dev)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = dax_dev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
#include <linux/stddef.h>
#include <linux/mount.h>
#include <linux/cred.h>
+#include <linux/mnt_idmapping.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
struct list_head s_inodes_wb; /* writeback inodes */
} __randomize_layout;
+static inline struct user_namespace *i_user_ns(const struct inode *inode)
+{
+ return inode->i_sb->s_user_ns;
+}
+
/* Helper functions so that in most cases filesystems will
* not need to deal directly with kuid_t and kgid_t and can
* instead deal with the raw numeric values that are stored
*/
static inline uid_t i_uid_read(const struct inode *inode)
{
- return from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+ return from_kuid(i_user_ns(inode), inode->i_uid);
}
static inline gid_t i_gid_read(const struct inode *inode)
{
- return from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
+ return from_kgid(i_user_ns(inode), inode->i_gid);
}
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
- inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid);
+ inode->i_uid = make_kuid(i_user_ns(inode), uid);
}
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
- inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
-}
-
-/**
- * kuid_into_mnt - map a kuid down into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kuid: kuid to be mapped
- *
- * Return: @kuid mapped according to @mnt_userns.
- * If @kuid has no mapping INVALID_UID is returned.
- */
-static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
- kuid_t kuid)
-{
- return make_kuid(mnt_userns, __kuid_val(kuid));
-}
-
-/**
- * kgid_into_mnt - map a kgid down into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kgid: kgid to be mapped
- *
- * Return: @kgid mapped according to @mnt_userns.
- * If @kgid has no mapping INVALID_GID is returned.
- */
-static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
- kgid_t kgid)
-{
- return make_kgid(mnt_userns, __kgid_val(kgid));
+ inode->i_gid = make_kgid(i_user_ns(inode), gid);
}
/**
static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- return kuid_into_mnt(mnt_userns, inode->i_uid);
+ return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid);
}
/**
static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- return kgid_into_mnt(mnt_userns, inode->i_gid);
-}
-
-/**
- * kuid_from_mnt - map a kuid up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kuid: kuid to be mapped
- *
- * Return: @kuid mapped up according to @mnt_userns.
- * If @kuid has no mapping INVALID_UID is returned.
- */
-static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
- kuid_t kuid)
-{
- return KUIDT_INIT(from_kuid(mnt_userns, kuid));
-}
-
-/**
- * kgid_from_mnt - map a kgid up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- * @kgid: kgid to be mapped
- *
- * Return: @kgid mapped up according to @mnt_userns.
- * If @kgid has no mapping INVALID_GID is returned.
- */
-static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
- kgid_t kgid)
-{
- return KGIDT_INIT(from_kgid(mnt_userns, kgid));
-}
-
-/**
- * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- *
- * Use this helper to initialize a new vfs or filesystem object based on
- * the caller's fsuid. A common example is initializing the i_uid field of
- * a newly allocated inode triggered by a creation event such as mkdir or
- * O_CREAT. Other examples include the allocation of quotas for a specific
- * user.
- *
- * Return: the caller's current fsuid mapped up according to @mnt_userns.
- */
-static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
-{
- return kuid_from_mnt(mnt_userns, current_fsuid());
-}
-
-/**
- * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns
- * @mnt_userns: user namespace of the relevant mount
- *
- * Use this helper to initialize a new vfs or filesystem object based on
- * the caller's fsgid. A common example is initializing the i_gid field of
- * a newly allocated inode triggered by a creation event such as mkdir or
- * O_CREAT. Other examples include the allocation of quotas for a specific
- * user.
- *
- * Return: the caller's current fsgid mapped up according to @mnt_userns.
- */
-static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
-{
- return kgid_from_mnt(mnt_userns, current_fsgid());
+ return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid);
}
/**
static inline void inode_fsuid_set(struct inode *inode,
struct user_namespace *mnt_userns)
{
- inode->i_uid = mapped_fsuid(mnt_userns);
+ inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode));
}
/**
static inline void inode_fsgid_set(struct inode *inode,
struct user_namespace *mnt_userns)
{
- inode->i_gid = mapped_fsgid(mnt_userns);
+ inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode));
}
/**
static inline bool fsuidgid_has_mapping(struct super_block *sb,
struct user_namespace *mnt_userns)
{
- struct user_namespace *s_user_ns = sb->s_user_ns;
+ struct user_namespace *fs_userns = sb->s_user_ns;
+ kuid_t kuid;
+ kgid_t kgid;
- return kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) &&
- kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns));
+ kuid = mapped_fsuid(mnt_userns, fs_userns);
+ if (!uid_valid(kuid))
+ return false;
+ kgid = mapped_fsgid(mnt_userns, fs_userns);
+ if (!gid_valid(kgid))
+ return false;
+ return kuid_has_mapping(fs_userns, kuid) &&
+ kgid_has_mapping(fs_userns, kgid);
}
extern struct timespec64 current_time(struct inode *inode);
#define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
+#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
* Used to detect that mark_inode_dirty() should not move
* inode between dirty lists.
*
+ * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
#define I_CREATING (1 << 15)
#define I_DONTCACHE (1 << 16)
#define I_SYNC_QUEUED (1 << 17)
+#define I_PINNING_FSCACHE_WB (1 << 18)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
{
return mnt_user_ns(file->f_path.mnt);
}
+
+/**
+ * is_idmapped_mnt - check whether a mount is mapped
+ * @mnt: the mount to check
+ *
+ * If @mnt has an idmapping attached different from the
+ * filesystem's idmapping then @mnt is mapped.
+ *
+ * Return: true if mount is mapped, false if not.
+ */
+static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
+{
+ return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns;
+}
+
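A small usage sketch for the new helper, assuming a caller that already holds a struct path; foo_wants_idmapped_handling() is hypothetical, only is_idmapped_mnt() and mnt_user_ns() are taken from the surrounding hunks.

static bool foo_wants_idmapped_handling(const struct path *path)
{
	/*
	 * If the mount carries the filesystem's own idmapping there is
	 * nothing extra to do beyond the usual i_uid/i_gid handling.
	 */
	if (!is_idmapped_mnt(path->mnt))
		return false;

	/* Otherwise callers need to pass mnt_user_ns(path->mnt) around. */
	return true;
}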
extern long vfs_truncate(const struct path *, loff_t);
int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
unsigned int time_attrs, struct file *filp);
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
loff_t lend);
-extern bool filemap_range_needs_writeback(struct address_space *,
- loff_t lstart, loff_t lend);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
extern int __filemap_fdatawrite_range(struct address_space *mapping,
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
+ void dump_mapping(const struct address_space *);
/*
 * Userspace may rely on the inode number being non-zero. For example, glibc
struct kmem_cache;
struct page;
+struct slab;
struct vm_struct;
struct task_struct;
return 0;
}
-void __kasan_poison_slab(struct page *page);
-static __always_inline void kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct slab *slab);
+static __always_inline void kasan_poison_slab(struct slab *slab)
{
if (kasan_enabled())
- __kasan_poison_slab(page);
+ __kasan_poison_slab(slab);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
slab_flags_t *flags) {}
static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
-static inline void kasan_poison_slab(struct page *page) {}
+static inline void kasan_poison_slab(struct slab *slab) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
void *object) {}
static inline void kasan_poison_object_data(struct kmem_cache *cache,
* allocations with real shadow memory. With KASAN vmalloc, the special
* case is unnecessary, as the work is handled in the generic case.
*/
- int kasan_module_alloc(void *addr, size_t size);
+ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask);
void kasan_free_shadow(const struct vm_struct *vm);
#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
- static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+ static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}
#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
MEMCG_PERCPU_B,
+ MEMCG_VMALLOC,
MEMCG_NR_STAT,
};
MEMCG_MAX,
MEMCG_OOM,
MEMCG_OOM_KILL,
+ MEMCG_OOM_GROUP_KILL,
MEMCG_SWAP_HIGH,
MEMCG_SWAP_MAX,
MEMCG_SWAP_FAIL,
return folio->memcg_data & MEMCG_DATA_KMEM;
}
-/*
- * page_objcgs - get the object cgroups vector associated with a page
- * @page: a pointer to the page struct
- *
- * Returns a pointer to the object cgroups vector associated with the page,
- * or NULL. This function assumes that the page is known to have an
- * associated object cgroups vector. It's not safe to call this function
- * against pages, which might have an associated memory cgroup: e.g.
- * kernel stack pages.
- */
-static inline struct obj_cgroup **page_objcgs(struct page *page)
-{
- unsigned long memcg_data = READ_ONCE(page->memcg_data);
-
- VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page);
- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
-
- return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
-
-/*
- * page_objcgs_check - get the object cgroups vector associated with a page
- * @page: a pointer to the page struct
- *
- * Returns a pointer to the object cgroups vector associated with the page,
- * or NULL. This function is safe to use if the page can be directly associated
- * with a memory cgroup.
- */
-static inline struct obj_cgroup **page_objcgs_check(struct page *page)
-{
- unsigned long memcg_data = READ_ONCE(page->memcg_data);
-
- if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS))
- return NULL;
-
- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
-
- return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
#else
static inline bool folio_memcg_kmem(struct folio *folio)
return false;
}
-static inline struct obj_cgroup **page_objcgs(struct page *page)
-{
- return NULL;
-}
-
-static inline struct obj_cgroup **page_objcgs_check(struct page *page)
-{
- return NULL;
-}
#endif
static inline bool PageMemcgKmem(struct page *page)
local_irq_restore(flags);
}
+ static inline void mod_memcg_page_state(struct page *page,
+ int idx, int val)
+ {
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = page_memcg(page);
+ if (memcg)
+ mod_memcg_state(memcg, idx, val);
+ rcu_read_unlock();
+ }
+
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
return READ_ONCE(memcg->vmstats.state[idx]);
{
}
+ static inline void mod_memcg_page_state(struct page *page,
+ int idx, int val)
+ {
+ }
+
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
return 0;
*/
void (*page_free)(struct page *page);
- /*
- * Transition the refcount in struct dev_pagemap to the dead state.
- */
- void (*kill)(struct dev_pagemap *pgmap);
-
- /*
- * Wait for refcount in struct dev_pagemap to be idle and reap it.
- */
- void (*cleanup)(struct dev_pagemap *pgmap);
-
/*
* Used for private (un-addressable) device memory only. Must migrate
* the page back to a CPU accessible page.
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @ref: reference count that pins the devm_memremap_pages() mapping
- * @internal_ref: internal reference if @ref is not provided by the caller
- * @done: completion for @internal_ref
+ * @done: completion for @ref
* @type: memory type: see MEMORY_* in memory_hotplug.h
 * @flags: PGMAP_* flags to specify detailed behavior
+ * @vmemmap_shift: structural definition of how the vmemmap page metadata
+ * is populated, specifically the metadata page order.
+ * A zero value (default) uses base pages as the vmemmap metadata
+ * representation. A bigger value will set up compound struct pages
+ * of the requested order value.
* @ops: method table
* @owner: an opaque pointer identifying the entity that manages this
* instance. Used by various helpers to make sure that no
*/
struct dev_pagemap {
struct vmem_altmap altmap;
- struct percpu_ref *ref;
- struct percpu_ref internal_ref;
+ struct percpu_ref ref;
struct completion done;
enum memory_type type;
unsigned int flags;
+ unsigned long vmemmap_shift;
const struct dev_pagemap_ops *ops;
void *owner;
int nr_range;
return NULL;
}
+ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
+ {
+ return 1 << pgmap->vmemmap_shift;
+ }
+
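A sketch of how a ZONE_DEVICE user could ask for compound vmemmap metadata via the new @vmemmap_shift field; the device, resource and function name are hypothetical, and only the dev_pagemap fields plus pgmap_vmemmap_nr() shown above (together with the long-standing devm_memremap_pages() API) are assumed.

/* Hypothetical driver setup: res describes the device memory range. */
static void *foo_map_device_memory(struct device *dev, struct resource *res,
				   struct dev_pagemap *pgmap)
{
	pgmap->type = MEMORY_DEVICE_GENERIC;
	pgmap->range.start = res->start;
	pgmap->range.end = res->end;
	pgmap->nr_range = 1;
	/*
	 * Request compound struct pages of order 9 (2MB with 4KB base pages);
	 * pgmap_vmemmap_nr(pgmap) then reports 512 pages per metadata unit.
	 */
	pgmap->vmemmap_shift = 9;

	return devm_memremap_pages(dev, pgmap);
}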
#ifdef CONFIG_ZONE_DEVICE
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
if (pgmap)
- percpu_ref_put(pgmap->ref);
+ percpu_ref_put(&pgmap->ref);
}
#endif /* _LINUX_MEMREMAP_H_ */
*/
extern pgprot_t protection_map[16];
- /**
- * enum fault_flag - Fault flag definitions.
- * @FAULT_FLAG_WRITE: Fault was a write fault.
- * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
- * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
- * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
- * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
- * @FAULT_FLAG_TRIED: The fault has been tried once.
- * @FAULT_FLAG_USER: The fault originated in userspace.
- * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
- * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
- * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
- *
- * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
- * whether we would allow page faults to retry by specifying these two
- * fault flags correctly. Currently there can be three legal combinations:
- *
- * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
- * this is the first try
- *
- * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
- * we've already tried at least once
- *
- * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
- *
- * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
- * be used. Note that page faults can be allowed to retry for multiple times,
- * in which case we'll have an initial fault with flags (a) then later on
- * continuous faults with flags (b). We should always try to detect pending
- * signals before a retry to make sure the continuous page faults can still be
- * interrupted if necessary.
- */
- enum fault_flag {
- FAULT_FLAG_WRITE = 1 << 0,
- FAULT_FLAG_MKWRITE = 1 << 1,
- FAULT_FLAG_ALLOW_RETRY = 1 << 2,
- FAULT_FLAG_RETRY_NOWAIT = 1 << 3,
- FAULT_FLAG_KILLABLE = 1 << 4,
- FAULT_FLAG_TRIED = 1 << 5,
- FAULT_FLAG_USER = 1 << 6,
- FAULT_FLAG_REMOTE = 1 << 7,
- FAULT_FLAG_INSTRUCTION = 1 << 8,
- FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
- };
-
/*
* The default fault flags that should be used by most of the
* arch-specific page fault handlers.
*/
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
+ /**
+ * @close: Called when the VMA is being removed from the MM.
+ * Context: User context. May sleep. Caller holds mmap_lock.
+ */
void (*close)(struct vm_area_struct * area);
/* Called any time before splitting to check if it's allowed */
int (*may_split)(struct vm_area_struct *area, unsigned long addr);
struct mmu_gather;
struct inode;
+static inline unsigned int compound_order(struct page *page)
+{
+ if (!PageHead(page))
+ return 0;
+ return page[1].compound_order;
+}
+
+/**
+ * folio_order - The allocation order of a folio.
+ * @folio: The folio.
+ *
+ * A folio is composed of 2^order pages. See get_order() for the definition
+ * of order.
+ *
+ * Return: The order of the folio.
+ */
+static inline unsigned int folio_order(struct folio *folio)
+{
+ return compound_order(&folio->page);
+}
+
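To make the 2^order relationship concrete, a trivial illustrative helper built only on folio_order() above (the kernel already provides folio_nr_pages() for this, so this is purely an example):

/* order 0 -> 1 page, order 9 -> 512 pages, and so on. */
static inline unsigned long example_folio_pages(struct folio *folio)
{
	return 1UL << folio_order(folio);
}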
#include <linux/huge_mm.h>
/*
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
- int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
+ int page_trans_huge_mapcount(struct page *page);
#else
static inline int total_mapcount(struct page *page)
{
return page_mapcount(page);
}
- static inline int page_trans_huge_mapcount(struct page *page,
- int *total_mapcount)
+ static inline int page_trans_huge_mapcount(struct page *page)
{
- int mapcount = page_mapcount(page);
- if (total_mapcount)
- *total_mapcount = mapcount;
- return mapcount;
+ return page_mapcount(page);
}
#endif
return compound_head(page);
}
+static inline struct folio *virt_to_folio(const void *x)
+{
+ struct page *page = virt_to_page(x);
+
+ return page_folio(page);
+}
+
void __put_page(struct page *page);
void put_pages_list(struct list_head *pages);
compound_page_dtors[page[1].compound_dtor](page);
}
-static inline unsigned int compound_order(struct page *page)
-{
- if (!PageHead(page))
- return 0;
- return page[1].compound_order;
-}
-
-/**
- * folio_order - The allocation order of a folio.
- * @folio: The folio.
- *
- * A folio is composed of 2^order pages. See get_order() for the definition
- * of order.
- *
- * Return: The order of the folio.
- */
-static inline unsigned int folio_order(struct folio *folio)
-{
- return compound_order(&folio->page);
-}
-
static inline bool hpage_pincount_available(struct page *page)
{
/*
#define page_address_init() do { } while(0)
#endif
+static inline void *folio_address(const struct folio *folio)
+{
+ return page_address(&folio->page);
+}
+
extern void *page_rmapping(struct page *page);
extern struct anon_vma *page_anon_vma(struct page *page);
extern pgoff_t __page_file_index(struct page *page);
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
- struct address_space *zap_mapping; /* Check page->mapping if set */
- struct page *single_page; /* Locked page to be unmapped */
-};
-
-/*
- * We set details->zap_mappings when we want to unmap shared but keep private
- * pages. Return true if skip zapping this page, false otherwise.
- */
-static inline bool
-zap_skip_check_mapping(struct zap_details *details, struct page *page)
-{
- if (!details || !page)
- return false;
-
- return details->zap_mapping &&
- (details->zap_mapping != page_rmapping(page));
-}
-
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
-int truncate_inode_page(struct address_space *mapping, struct page *page);
int generic_error_remove_page(struct address_space *mapping, struct page *page);
int invalidate_inode_page(struct page *page);
extern int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
-void unmap_mapping_page(struct page *page);
void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
BUG();
return -EFAULT;
}
-static inline void unmap_mapping_page(struct page *page) { }
static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
struct page **pages);
struct page *get_dump_page(unsigned long addr);
-extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
extern void do_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx);
+ struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
#endif
void drop_slab(void);
- void drop_slab_node(int nid);
#ifndef CONFIG_MMU
#define randomize_va_space 0
MF_ACTION_REQUIRED = 1 << 1,
MF_MUST_KILL = 1 << 2,
MF_SOFT_OFFLINE = 1 << 3,
+ MF_UNPOISON = 1 << 4,
};
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
+#ifndef arch_memory_failure
+static inline int arch_memory_failure(unsigned long pfn, int flags)
+{
+ return -ENXIO;
+}
+#endif
+
+#ifndef arch_is_platform_page
+static inline bool arch_is_platform_page(u64 paddr)
+{
+ return false;
+}
+#endif
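
These are override-able defaults: an architecture that wants its own behaviour defines a macro of the same name in its asm headers before this header is pulled in. A sketch of what such an override might look like, with a made-up arch and the FOO_FW_MEM_* bounds purely hypothetical:

/* In a hypothetical arch/foo/include/asm/page.h */
#define arch_is_platform_page arch_is_platform_page
static inline bool arch_is_platform_page(u64 paddr)
{
	/* e.g. memory owned by platform firmware that must not be poisoned */
	return paddr >= FOO_FW_MEM_START && paddr < FOO_FW_MEM_END;
}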
/*
* Error handlers for various types of pages.
MF_MSG_KERNEL_HIGH_ORDER,
MF_MSG_SLAB,
MF_MSG_DIFFERENT_COMPOUND,
- MF_MSG_POISONED_HUGE,
MF_MSG_HUGE,
MF_MSG_FREE_HUGE,
MF_MSG_NON_PMD_HUGE,
MF_MSG_CLEAN_LRU,
MF_MSG_TRUNCATED_LRU,
MF_MSG_BUDDY,
- MF_MSG_BUDDY_2ND,
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
MF_MSG_UNKNOWN,
return 0;
}
+ #ifdef CONFIG_ANON_VMA_NAME
+ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, const char *name);
+ #else
+ static inline int
+ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, const char *name) {
+ return 0;
+ }
+ #endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
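
The anon-VMA-name machinery added here is driven from userspace through prctl(). A minimal userspace sketch of naming a private anonymous mapping; the PR_SET_VMA/PR_SET_VMA_ANON_NAME constants come from the matching uapi update, and the fallback defines are only needed where the installed prctl.h predates it.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 1 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* On success the region shows up as [anon:my heap] in /proc/self/maps. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "my heap"))
		perror("prctl");
	return 0;
}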
#include <linux/mm_types_task.h>
#include <linux/auxvec.h>
+ #include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
* in each subpage, but you may need to restore some of their values
* afterwards.
*
- * SLUB uses cmpxchg_double() to atomically update its freelist and
- * counters. That requires that freelist & counters be adjacent and
- * double-word aligned. We align all struct pages to double-word
- * boundaries, and ensure that 'freelist' is aligned within the
- * struct.
+ * SLUB uses cmpxchg_double() to atomically update its freelist and counters.
+ * That requires that freelist & counters in struct slab be adjacent and
+ * double-word aligned. Because struct slab currently just reinterprets the
+ * bits of struct page, we align all struct pages to double-word boundaries,
+ * and ensure that 'freelist' is aligned within struct slab.
*/
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment __aligned(2 * sizeof(unsigned long))
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */
+ struct anon_vma_name {
+ struct kref kref;
+ /* The name needs to be at the end because it is dynamically sized. */
+ char name[];
+ };
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
+ *
+ * For private anonymous mappings, a pointer to a null terminated string
+ * containing the name given to the vma, or NULL if unnamed.
*/
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
+
+ union {
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+ /* Serialized by mmap_lock. */
+ struct anon_vma_name *anon_name;
+ };
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
- bool tlb_flush_batched;
+ atomic_t tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);
- static inline void init_tlb_flush_pending(struct mm_struct *mm)
- {
- atomic_set(&mm->tlb_flush_pending, 0);
- }
-
- static inline void inc_tlb_flush_pending(struct mm_struct *mm)
- {
- atomic_inc(&mm->tlb_flush_pending);
- /*
- * The only time this value is relevant is when there are indeed pages
- * to flush. And we'll only flush pages after changing them, which
- * requires the PTL.
- *
- * So the ordering here is:
- *
- * atomic_inc(&mm->tlb_flush_pending);
- * spin_lock(&ptl);
- * ...
- * set_pte_at();
- * spin_unlock(&ptl);
- *
- * spin_lock(&ptl)
- * mm_tlb_flush_pending();
- * ....
- * spin_unlock(&ptl);
- *
- * flush_tlb_range();
- * atomic_dec(&mm->tlb_flush_pending);
- *
- * Where the increment if constrained by the PTL unlock, it thus
- * ensures that the increment is visible if the PTE modification is
- * visible. After all, if there is no PTE modification, nobody cares
- * about TLB flushes either.
- *
- * This very much relies on users (mm_tlb_flush_pending() and
- * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
- * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
- * locks (PPC) the unlock of one doesn't order against the lock of
- * another PTL.
- *
- * The decrement is ordered by the flush_tlb_range(), such that
- * mm_tlb_flush_pending() will not return false unless all flushes have
- * completed.
- */
- }
-
- static inline void dec_tlb_flush_pending(struct mm_struct *mm)
- {
- /*
- * See inc_tlb_flush_pending().
- *
- * This cannot be smp_mb__before_atomic() because smp_mb() simply does
- * not order against TLB invalidate completion, which is what we need.
- *
- * Therefore we must rely on tlb_flush_*() to guarantee order.
- */
- atomic_dec(&mm->tlb_flush_pending);
- }
-
- static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
- {
- /*
- * Must be called after having acquired the PTL; orders against that
- * PTLs release and therefore ensures that if we observe the modified
- * PTE we must also observe the increment from inc_tlb_flush_pending().
- *
- * That is, it only guarantees to return true if there is a flush
- * pending for _this_ PTL.
- */
- return atomic_read(&mm->tlb_flush_pending);
- }
-
- static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
- {
- /*
- * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
- * for which there is a TLB flush pending in order to guarantee
- * we've seen both that PTE modification and the increment.
- *
- * (no requirement on actually still holding the PTL, that is irrelevant)
- */
- return atomic_read(&mm->tlb_flush_pending) > 1;
- }
-
struct vm_fault;
/**
unsigned long val;
} swp_entry_t;
+ /**
+ * enum fault_flag - Fault flag definitions.
+ * @FAULT_FLAG_WRITE: Fault was a write fault.
+ * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
+ * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
+ * @FAULT_FLAG_TRIED: The fault has been tried once.
+ * @FAULT_FLAG_USER: The fault originated in userspace.
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ *
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+ * whether we would allow page faults to retry by specifying these two
+ * fault flags correctly. Currently there can be three legal combinations:
+ *
+ * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
+ * this is the first try
+ *
+ * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
+ * we've already tried at least once
+ *
+ * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
+ *
+ * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
+ * be used. Note that page faults can be allowed to retry multiple times,
+ * in which case we'll have an initial fault with flags (a) then later on
+ * continuous faults with flags (b). We should always try to detect pending
+ * signals before a retry to make sure the continuous page faults can still be
+ * interrupted if necessary.
+ */
+ enum fault_flag {
+ FAULT_FLAG_WRITE = 1 << 0,
+ FAULT_FLAG_MKWRITE = 1 << 1,
+ FAULT_FLAG_ALLOW_RETRY = 1 << 2,
+ FAULT_FLAG_RETRY_NOWAIT = 1 << 3,
+ FAULT_FLAG_KILLABLE = 1 << 4,
+ FAULT_FLAG_TRIED = 1 << 5,
+ FAULT_FLAG_USER = 1 << 6,
+ FAULT_FLAG_REMOTE = 1 << 7,
+ FAULT_FLAG_INSTRUCTION = 1 << 8,
+ FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
+ };
+
#endif /* _LINUX_MM_TYPES_H */
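
The ALLOW_RETRY/TRIED rules spelled out in the comment above reduce to a single predicate; this helper is purely illustrative and not part of the patch:

/* The only forbidden combination is TRIED without ALLOW_RETRY. */
static inline bool fault_retry_flags_valid(enum fault_flag flags)
{
	return (flags & FAULT_FLAG_ALLOW_RETRY) || !(flags & FAULT_FLAG_TRIED);
}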
* might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
* a result of MADV_FREE).
*
- * PG_uptodate tells whether the page's contents is valid. When a read
- * completes, the page becomes uptodate, unless a disk I/O error happened.
- *
* PG_referenced, PG_reclaim are used for page reclaim for anonymous and
* file-backed pagecache (see mm/vmscan.c).
*
TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname, lname) \
- static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \
+ static inline bool folio_test_##lname(const struct folio *folio) { return false; } \
static inline int Page##uname(const struct page *page) { return 0; }
#define SETPAGEFLAG_NOOP(uname, lname) \
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
+ #define MAGIC_HWPOISON 0x48575053U /* HWPS */
+ extern void SetPageHWPoisonTakenOff(struct page *page);
+ extern void ClearPageHWPoisonTakenOff(struct page *page);
extern bool take_page_off_buddy(struct page *page);
+ extern bool put_page_back_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison, hwpoison)
#define __PG_HWPOISON 0
u64 stable_page_flags(struct page *page);
+/**
+ * folio_test_uptodate - Is this folio up to date?
+ * @folio: The folio.
+ *
+ * The uptodate flag is set on a folio when every byte in the folio is
+ * at least as new as the corresponding bytes on storage. Anonymous
+ * and CoW folios are always uptodate. If the folio is not uptodate,
+ * some of the bytes in it may be; see the is_partially_uptodate()
+ * address_space operation.
+ */
static inline bool folio_test_uptodate(struct folio *folio)
{
bool ret = test_bit(PG_uptodate, folio_flags(folio, 0));
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
+ #include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
+ dup_vma_anon_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
+ free_vma_anon_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
return error;
}
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
- return 0;
-}
-
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
#include <linux/oom.h>
#include <linux/tick.h>
#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
#include "rcu.h"
/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1)
+#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
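
With the second nested reader, a torture reader's ->readstate now packs two reader indexes above the flag bits. An illustrative encoder using only the macros defined above; the function itself is not part of rcutorture.

/*
 * Example: a reader holding rcu_read_lock_bh() plus two nested SRCU-style
 * readers whose readlock() calls returned idx1 and idx2 (each 0 or 1).
 */
static int example_readstate(int idx1, int idx2)
{
	return RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 |
	       ((idx1 & 0x1) << RCUTORTURE_RDR_SHIFT_1) |
	       ((idx2 & 0x1) << RCUTORTURE_RDR_SHIFT_2);
}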
"Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Test grace-period forward progress");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
torture_param(int, fwd_progress_holdoff, 60,
"Time between forward-progress tests (s)");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false,
+ "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
torture_param(int, stall_gp_kthread, 0,
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
-static struct task_struct *fwd_prog_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
+ int no_pi_lock;
const char *name;
};
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
};
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
};
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted_srcud"
};
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
struct rt_read_seg *rtrsp)
{
unsigned long flags;
- int idxnew = -1;
- int idxold = *readstate;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0);
- WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
preempt_disable();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
- if (statesnew & RCUTORTURE_RDR_RCU)
- idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
/*
* Next, remove old protection, in decreasing order of strength
local_bh_enable();
if (statesold & RCUTORTURE_RDR_RBH)
rcu_read_unlock_bh();
- if (statesold & RCUTORTURE_RDR_RCU) {
- bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(¤t->pi_lock, flags);
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
if (lockit)
raw_spin_unlock_irqrestore(¤t->pi_lock, flags);
}
cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
+ pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
+ pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
return mask;
}
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
/*
* Can't enable bh w/irq disabled.
*/
mask |= oldmask & bhs;
}
- return mask ?: RCUTORTURE_RDR_RCU;
+ return mask ?: RCUTORTURE_RDR_RCU_1;
}
/*
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
+ WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
#else
schedule_timeout_uninterruptible(HZ);
#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
}
if (stall_cpu_irqsoff)
local_irq_enable();
unsigned long rcu_fwd_startat;
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
};
static DEFINE_MUTEX(rcu_fwd_mutex);
static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rfp->rcu_fwd_startat);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
gps = rfp->n_launders_hist[j].launder_gp_seq;
gps_old = gps;
}
pr_cont("\n");
+ mutex_unlock(&rcu_fwd_mutex);
}
/* Callback function for continuous-flood RCU callbacks. */
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
}
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
- } else {
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
n_launders_sa = 0;
rfcp->rfc_gps = 0;
rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
}
- cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
rcu_torture_fwd_cb_hist(rfp);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ int i;
+ long ncbs;
struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex);
}
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist(rfp);
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ bool firsttime = true;
+ long max_cbs;
int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- WRITE_ONCE(rcu_fwd_emergency_stop, false);
- if (!IS_ENABLED(CONFIG_TINY_RCU) ||
- rcu_inkernel_boot_has_ended())
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- if (rcu_inkernel_boot_has_ended())
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq)
+ schedule_timeout_interruptible(1);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
rcu_torture_fwd_prog_cr(rfp);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */
if (stutter_wait("rcu_torture_fwd_prog"))
sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
- WARN_ON(!tested && tested_tries >= 5);
- pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
torture_kthread_stopping("rcu_torture_fwd_prog");
return 0;
}
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ int i;
+ int ret = 0;
struct rcu_fwd *rfp;
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
if ((!cur_ops->sync && !cur_ops->call) ||
- !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) {
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
+ cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
return 0;
}
if (stall_cpu > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
+ fwd_progress = 0;
if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
WARN_ON(1); /* Make sure rcutorture notices conflict. */
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
- if (!rfp)
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
return -ENOMEM;
- spin_lock_init(&rfp->rcu_fwd_lock);
- rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
mutex_lock(&rcu_fwd_mutex);
rcu_fwds = rfp;
mutex_unlock(&rcu_fwd_mutex);
register_oom_notifier(&rcutorture_oom_nb);
- return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
}
static void rcu_torture_fwd_prog_cleanup(void)
{
+ int i;
struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
- rfp = rcu_fwds;
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
rcu_fwds = NULL;
mutex_unlock(&rcu_fwd_mutex);
- unregister_oom_notifier(&rcutorture_oom_nb);
kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
}
/* Callback function for RCU barrier testing. */
&trs, "%s",
"rcu_torture_read_exit_child");
if (IS_ERR(tsp)) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
errexit = true;
tsp = NULL;
break;
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
GFP_KERNEL);
if (!reader_tasks || !rcu_torture_reader_mbchk) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
if (nrealnocbers > 0) {
nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
if (nocb_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/kmemleak.h>
+#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
static int one_hundred = 100;
static int two_hundred = 200;
static int one_thousand = 1000;
+ static int three_thousand = 3000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
.mode = 0644,
.proc_handler = watermark_scale_factor_sysctl_handler,
.extra1 = SYSCTL_ONE,
- .extra2 = &one_thousand,
+ .extra2 = &three_thousand,
},
{
.procname = "percpu_pagelist_high_fraction",
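
The change above raises the admissible range of vm.watermark_scale_factor from 1..1000 to 1..3000. A small userspace sketch of setting it through procfs; the value 2000 is an arbitrary example and the write requires CAP_SYS_ADMIN.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/watermark_scale_factor", "w");

	if (!f) {
		perror("watermark_scale_factor");
		return 1;
	}
	/* Units of 1/10000 of zone size: 2000 spaces the watermarks by ~20%. */
	fprintf(f, "2000\n");
	return fclose(f) ? 1 : 0;
}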
KCSAN_SANITIZE_slab.o := n
KCSAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_page_alloc.o := n
+# But enable explicit instrumentation for memory barriers.
+KCSAN_INSTRUMENT_BARRIERS := y
# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+ obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
- if (reuse_swap_page(page, NULL)) {
+ if (reuse_swap_page(page)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
* need full accuracy to avoid breaking page pinning, because
* page_trans_huge_mapcount() is slower than page_mapcount().
*/
- int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+ int page_trans_huge_mapcount(struct page *page)
{
- int i, ret, _total_mapcount, mapcount;
+ int i, ret;
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (total_mapcount)
- *total_mapcount = mapcount;
- return mapcount;
- }
+ if (likely(!PageTransCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
page = compound_head(page);
- _total_mapcount = ret = 0;
+ ret = 0;
for (i = 0; i < thp_nr_pages(page); i++) {
- mapcount = atomic_read(&page[i]._mapcount) + 1;
+ int mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
- _total_mapcount += mapcount;
}
- if (PageDoubleMap(page)) {
+
+ if (PageDoubleMap(page))
ret -= 1;
- _total_mapcount -= thp_nr_pages(page);
- }
- mapcount = compound_mapcount(page);
- ret += mapcount;
- _total_mapcount += mapcount;
- if (total_mapcount)
- *total_mapcount = _total_mapcount;
- return ret;
+
+ return ret + compound_mapcount(page);
}
/* Racy check whether the huge page can be split */
{
struct page *head = compound_head(page);
struct deferred_split *ds_queue = get_deferred_split_queue(head);
+ XA_STATE(xas, &head->mapping->i_pages, head->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
goto out;
}
+ xas_split_alloc(&xas, head, compound_order(head),
+ mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
+
anon_vma = NULL;
i_mmap_lock_read(mapping);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
- XA_STATE(xas, &mapping->i_pages, page_index(head));
-
/*
* Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- xa_lock(&mapping->i_pages);
+ xas_lock(&xas);
+ xas_reset(&xas);
if (xas_load(&xas) != head)
goto fail;
}
if (mapping) {
int nr = thp_nr_pages(head);
+ xas_split(&xas, head, thp_order(head));
if (PageSwapBacked(head)) {
__mod_lruvec_page_state(head, NR_SHMEM_THPS,
-nr);
spin_unlock(&ds_queue->split_queue_lock);
fail:
if (mapping)
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
if (mapping)
i_mmap_unlock_read(mapping);
out:
+ /* Free any memory we didn't use */
+ xas_nomem(&xas, 0);
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}
#include <linux/pagemap.h>
#include <linux/tracepoint-defs.h>
+struct folio_batch;
+
/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC)
+ __GFP_ATOMIC|__GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
+struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
}
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
- pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+void filemap_free_folio(struct address_space *mapping, struct folio *folio);
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
+ loff_t end);
/**
* folio_evictable - Test whether a folio is evictable.
*/
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
- /*
- * in mm/memcontrol.c:
- */
- extern bool cgroup_memory_nokmem;
-
/*
* in mm/page_alloc.c
*/
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
+void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_vma_page_range(struct vm_area_struct *vma,
}
return fpin;
}
-
#else /* !CONFIG_MMU */
+static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
- return virt_to_head_page(qlink)->slab_cache;
+ return virt_to_slab(qlink)->slab_cache;
}
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
{
void *object = qlink_to_object(qlink, cache);
+ struct kasan_free_meta *meta = kasan_get_free_meta(cache, object);
unsigned long flags;
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
+ /*
+ * If init_on_free is enabled and KASAN's free metadata is stored in
+ * the object, zero the metadata. Otherwise, the object's memory will
+ * not be properly zeroed, as KASAN saves the metadata after the slab
+ * allocator zeroes the object.
+ */
+ if (slab_want_init_on_free(cache) &&
+ cache->kasan_info.free_meta_offset == 0)
+ memzero_explicit(meta, sizeof(*meta));
+
/*
* As the object now gets freed from the quarantine, assume that its
* free track is no longer valid.
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
}
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
}
goto out;
}
if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
+ !reuse_swap_page(page)) {
/*
* Page is in the swap cache and cannot be re-used.
* It cannot be collapsed into a THP.
* ptl mostly unnecessary.
*/
spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
+ ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
}
} else {
* inside page_remove_rmap().
*/
spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
+ ptep_clear(vma->vm_mm, address, _pte);
page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
}
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
}
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
}
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
- * Khupaged will allocate hugepage from the node has the max
+ * Khugepaged will allocate hugepage from the node that has the max
* hit record.
*/
node = page_to_nid(page);
}
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- /* This will be less messy when we use multi-index entries */
+ /*
+ * Ensure we have slots for all the pages in the range. This is
+ * almost certainly a no-op because most of the pages must be present
+ */
do {
xas_lock_irq(&xas);
xas_create_range(&xas);
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
+ /* Join all the small entries into a single multi-index entry */
+ xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+ xas_store(&xas, new_page);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
if (xa_is_value(page)) {
if (++swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
}
continue;
}
+ /*
+ * XXX: khugepaged should compact smaller compound pages
+ * into a PMD sized page
+ */
if (PageTransCompound(page)) {
result = SCAN_PAGE_COMPOUND;
break;
if (result == SCAN_SUCCEED) {
if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
node = khugepaged_find_target_node();
collapse_file(mm, file, start, hpage, node);
static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
- bool cgroup_memory_nokmem __ro_after_init;
+ static bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
- static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
+ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
+ unsigned int x;
+
cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
- if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
- atomic_inc(&stats_flush_threshold);
+
+ x = __this_cpu_add_return(stats_updates, abs(val));
+ if (x > MEMCG_CHARGE_BATCH) {
+ atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+ __this_cpu_write(stats_updates, 0);
+ }
}
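A single-threaded model of the new batching arithmetic, with an assumed MEMCG_CHARGE_BATCH of 64: each update accumulates its absolute magnitude into a per-CPU counter, and every full batch bumps the global flush threshold before the per-CPU counter is reset.

#include <stdio.h>
#include <stdlib.h>

#define MEMCG_CHARGE_BATCH 64		/* assumed value for illustration */

static unsigned int stats_updates;	/* models the per-CPU counter */
static long stats_flush_threshold;	/* models the global atomic */

static void rstat_updated(int val)
{
	stats_updates += (unsigned int)abs(val);
	if (stats_updates > MEMCG_CHARGE_BATCH) {
		stats_flush_threshold += stats_updates / MEMCG_CHARGE_BATCH;
		stats_updates = 0;
	}
}

int main(void)
{
	int deltas[] = { 10, -30, 50, -100, 7 };

	for (unsigned int i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++)
		rstat_updated(deltas[i]);

	printf("pending=%u flush_threshold=%ld\n",
	       stats_updates, stats_flush_threshold);
	return 0;
}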
static void __mem_cgroup_flush_stats(void)
static void flush_memcg_stats_dwork(struct work_struct *w)
{
- mem_cgroup_flush_stats();
+ __mem_cgroup_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
}
return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
- memcg_rstat_updated(memcg);
+ memcg_rstat_updated(memcg, val);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
- memcg_rstat_updated(memcg);
+ memcg_rstat_updated(memcg, val);
}
/**
return;
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
- memcg_rstat_updated(memcg);
+ memcg_rstat_updated(memcg, count);
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{ "pagetables", NR_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
{ "sock", MEMCG_SOCK },
+ { "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
rcu_read_unlock();
}
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp, bool new_page)
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab)
{
- unsigned int objects = objs_per_slab_page(s, page);
+ unsigned int objects = objs_per_slab(s, slab);
unsigned long memcg_data;
void *vec;
gfp &= ~OBJCGS_CLEAR_MASK;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
- page_to_nid(page));
+ slab_nid(slab));
if (!vec)
return -ENOMEM;
memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
- if (new_page) {
+ if (new_slab) {
/*
- * If the slab page is brand new and nobody can yet access
- * it's memcg_data, no synchronization is required and
- * memcg_data can be simply assigned.
+ * If the slab is brand new and nobody can yet access its
+ * memcg_data, no synchronization is required and memcg_data can
+ * be simply assigned.
*/
- page->memcg_data = memcg_data;
- } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+ slab->memcg_data = memcg_data;
+ } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
/*
- * If the slab page is already in use, somebody can allocate
- * and assign obj_cgroups in parallel. In this case the existing
+ * If the slab is already in use, somebody can allocate and
+ * assign obj_cgroups in parallel. In this case the existing
* objcg vector should be reused.
*/
kfree(vec);
*/
struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
- struct page *page;
+ struct folio *folio;
if (mem_cgroup_disabled())
return NULL;
- page = virt_to_head_page(p);
+ folio = virt_to_folio(p);
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
- * the page->obj_cgroups.
+ * slab->memcg_data.
*/
- if (page_objcgs_check(page)) {
- struct obj_cgroup *objcg;
+ if (folio_test_slab(folio)) {
+ struct obj_cgroup **objcgs;
+ struct slab *slab;
unsigned int off;
- off = obj_to_index(page->slab_cache, page, p);
- objcg = page_objcgs(page)[off];
- if (objcg)
- return obj_cgroup_memcg(objcg);
+ slab = folio_slab(folio);
+ objcgs = slab_objcgs(slab);
+ if (!objcgs)
+ return NULL;
+
+ off = obj_to_index(slab->slab_cache, slab, p);
+ if (objcgs[off])
+ return obj_cgroup_memcg(objcgs[off]);
return NULL;
}
/*
- * page_memcg_check() is used here, because page_has_obj_cgroups()
- * check above could fail because the object cgroups vector wasn't set
- * at that moment, but it can be set concurrently.
+ * page_memcg_check() is used here, because in theory we can encounter
+ * a folio where the slab flag has been cleared already, but
+ * slab->memcg_data has not been freed yet.
* page_memcg_check(page) will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
- return page_memcg_check(page);
+ return page_memcg_check(folio_page(folio, 0));
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
return ret;
}
+ #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
+ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
+ {
+ /*
+ * Deprecated.
+ * Please, take a look at tools/cgroup/slabinfo.py .
+ */
+ return 0;
+ }
+ #endif
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_show = memcg_slab_show,
+ .seq_show = mem_cgroup_slab_show,
},
#endif
{
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- unsigned int size;
int node;
int __maybe_unused i;
long error = -ENOMEM;
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
+ memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
if (!memcg)
return ERR_PTR(error);
seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
seq_printf(m, "oom_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_KILL]));
+ seq_printf(m, "oom_group_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
}
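From userspace the new field appears as another line in memory.events. A sketch that reads it; the cgroup path is only an example and depends on where the memcg of interest lives in the v2 hierarchy.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Example path for a hypothetical cgroup named "mygroup". */
	const char *path = "/sys/fs/cgroup/mygroup/memory.events";
	char name[64];
	unsigned long long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "oom_group_kill"))
			printf("oom_group_kill: %llu\n", val);
	}
	fclose(f);
	return 0;
}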
static int memory_events_show(struct seq_file *m, void *v)
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
+ #include <linux/shmem_fs.h>
#include "internal.h"
#include "ras/ras_event.h"
[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
[MF_MSG_SLAB] = "kernel slab page",
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
- [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
[MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
[MF_MSG_CLEAN_LRU] = "clean LRU page",
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
[MF_MSG_BUDDY] = "free buddy page",
- [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
{
int ret;
struct address_space *mapping;
+ bool extra_pins;
delete_from_lru_cache(p);
goto out;
}
+ /*
+ * The shmem page is kept in the page cache instead of being
+ * truncated, so it is expected to have an extra refcount after
+ * error handling.
+ */
+ extra_pins = shmem_mapping(mapping);
+
/*
* Truncation is a bit tricky. Enable it per file system for now.
*
* Open: to take i_rwsem or not for this? Right now we don't.
*/
ret = truncate_error_page(p, page_to_pfn(p), mapping);
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
out:
unlock_page(p);
- if (has_extra_refcount(ps, p, false))
- ret = MF_FAILED;
-
return ret;
}
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
+ static inline bool PageHWPoisonTakenOff(struct page *page)
+ {
+ return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
+ }
+
+ void SetPageHWPoisonTakenOff(struct page *page)
+ {
+ set_page_private(page, MAGIC_HWPOISON);
+ }
+
+ void ClearPageHWPoisonTakenOff(struct page *page)
+ {
+ if (PageHWPoison(page))
+ set_page_private(page, 0);
+ }
+
/*
* Return true if a page type of a given page is supported by hwpoison
* mechanism (while handling could fail), otherwise false. This function
return ret;
}
+ static int __get_unpoison_page(struct page *page)
+ {
+ struct page *head = compound_head(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_huge_page(head, &hugetlb);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison
+ * but also isolated from the buddy freelist, so we need to detect
+ * that state and cancel both operations to unpoison.
+ */
+ if (PageHWPoisonTakenOff(page))
+ return -EHWPOISON;
+
+ return get_page_unless_zero(page) ? 1 : 0;
+ }
+
/**
* get_hwpoison_page() - Get refcount for memory error handling
* @p: Raw error page (hit by memory error)
*
* get_hwpoison_page() takes a page refcount of an error page to handle memory
* error on it, after checking that the error page is in a well-defined state
- * (defined as a page-type we can successfully handle the memor error on it,
+ * (defined as a page-type we can successfully handle the memory error on it,
* such as LRU page and hugetlb page).
*
* Memory error handling could be triggered at any time on any type of page,
* extra care for the error page's state (as done in __get_hwpoison_page()),
* and has some retry logic in get_any_page().
*
+ * When called from unpoison_memory(), the caller should already ensure that
+ * the given page has PG_hwpoison. So it's never reused for other page
+ * allocations, and __get_unpoison_page() never races with them.
+ *
* Return: 0 on failure,
* 1 on success for in-use pages in a well-defined state,
* -EIO for pages on which we can not handle memory errors,
* -EBUSY when get_hwpoison_page() has raced with page lifecycle
- * operations like allocation and free.
+ * operations like allocation and free,
+ * -EHWPOISON when the page is hwpoisoned and taken off from buddy.
*/
static int get_hwpoison_page(struct page *p, unsigned long flags)
{
int ret;
zone_pcp_disable(page_zone(p));
- ret = get_any_page(p, flags);
+ if (flags & MF_UNPOISON)
+ ret = __get_unpoison_page(p);
+ else
+ ret = get_any_page(p, flags);
zone_pcp_enable(page_zone(p));
return ret;
lock_page(head);
page_flags = head->flags;
- if (!PageHWPoison(head)) {
- pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_dec();
- unlock_page(head);
- put_page(head);
- return 0;
- }
-
/*
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
* simply disable it. In order to make it work properly, we need
return rc;
}
+ static DEFINE_MUTEX(mf_mutex);
+
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
int res = 0;
unsigned long page_flags;
bool retry = true;
- static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
+ mutex_lock(&mf_mutex);
+
p = pfn_to_online_page(pfn);
if (!p) {
+ res = arch_memory_failure(pfn, flags);
+ if (res == 0)
+ goto unlock_mutex;
+
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
- if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags,
- pgmap);
+ if (pgmap) {
+ res = memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ goto unlock_mutex;
+ }
}
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
- return -ENXIO;
+ res = -ENXIO;
+ goto unlock_mutex;
}
- mutex_lock(&mf_mutex);
-
try_again:
if (PageHuge(p)) {
res = memory_failure_hugetlb(pfn, flags);
*/
page_flags = p->flags;
- /*
- * unpoison always clear PG_hwpoison inside page lock
- */
- if (!PageHWPoison(p)) {
- pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_dec();
- unlock_page(p);
- put_page(p);
- goto unlock_mutex;
- }
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
pr_info(fmt, pfn); \
})
+ static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p)
+ {
+ if (TestClearPageHWPoison(p)) {
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ page_to_pfn(p), rs);
+ num_poisoned_pages_dec();
+ return 1;
+ }
+ return 0;
+ }
+
+ static inline int unpoison_taken_off_page(struct ratelimit_state *rs,
+ struct page *p)
+ {
+ if (put_page_back_buddy(p)) {
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ page_to_pfn(p), rs);
+ return 0;
+ }
+ return -EBUSY;
+ }
+
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
{
struct page *page;
struct page *p;
- int freeit = 0;
- unsigned long flags = 0;
+ int ret = -EBUSY;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
p = pfn_to_page(pfn);
page = compound_head(p);
+ mutex_lock(&mf_mutex);
+
if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
if (page_count(page) > 1) {
unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
if (page_mapped(page)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
if (page_mapping(page)) {
unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
- return 0;
- }
-
- /*
- * unpoison_memory() can encounter thp only when the thp is being
- * worked by memory_failure() and the page lock is not held yet.
- * In such case, we yield to memory_failure() and make unpoison fail.
- */
- if (!PageHuge(page) && PageTransHuge(page)) {
- unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
- pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
- if (!get_hwpoison_page(p, flags)) {
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
- pfn, &unpoison_rs);
- return 0;
- }
+ if (PageSlab(page) || PageTable(page))
+ goto unlock_mutex;
- lock_page(page);
- /*
- * This test is racy because PG_hwpoison is set outside of page lock.
- * That's acceptable because that won't trigger kernel panic. Instead,
- * the PG_hwpoison page will be caught and isolated on the entrance to
- * the free buddy page pool.
- */
- if (TestClearPageHWPoison(page)) {
- unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
- pfn, &unpoison_rs);
- num_poisoned_pages_dec();
- freeit = 1;
- }
- unlock_page(page);
+ ret = get_hwpoison_page(p, MF_UNPOISON);
+ if (!ret) {
+ if (clear_page_hwpoison(&unpoison_rs, page))
+ ret = 0;
+ else
+ ret = -EBUSY;
+ } else if (ret < 0) {
+ if (ret == -EHWPOISON) {
+ ret = unpoison_taken_off_page(&unpoison_rs, p);
+ } else
+ unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
+ pfn, &unpoison_rs);
+ } else {
+ int freeit = clear_page_hwpoison(&unpoison_rs, p);
- put_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
put_page(page);
+ if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
+ put_page(page);
+ ret = 0;
+ }
+ }
- return 0;
+ unlock_mutex:
+ mutex_unlock(&mf_mutex);
+ return ret;
}
EXPORT_SYMBOL(unpoison_memory);
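For manual testing, poisoning and unpoisoning are usually driven through the hwpoison-inject debugfs files; a hedged sketch, assuming debugfs is mounted at /sys/kernel/debug and CONFIG_HWPOISON_INJECT provides corrupt-pfn/unpoison-pfn there.

#include <stdio.h>

/* Write a pfn (decimal) to one of the hwpoison debugfs controls. */
static int write_pfn(const char *file, unsigned long pfn)
{
	FILE *f = fopen(file, "w");

	if (!f) {
		perror(file);
		return -1;
	}
	fprintf(f, "%lu\n", pfn);
	return fclose(f);
}

int main(void)
{
	unsigned long pfn = 0x12345;	/* purely illustrative pfn */

	/* Inject a software memory error, then try to unpoison it again. */
	write_pfn("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn);
	write_pfn("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn);
	return 0;
}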
return -EIO;
}
+ mutex_lock(&mf_mutex);
+
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
put_ref_page(ref_page);
+ mutex_unlock(&mf_mutex);
return 0;
}
}
}
+ mutex_unlock(&mf_mutex);
+
return ret;
}
#include <linux/kernel_stat.h>
#include <linux/mm.h>
+ #include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- set_pte_at(vma->vm_mm, address, ptep, pte);
-
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
*/
WARN_ON_ONCE(!PageAnon(page));
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(page);
return ret;
}
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+ struct address_space *zap_mapping; /* Check page->mapping if set */
+ struct folio *single_folio; /* Locked folio to be unmapped */
+};
+
+/*
+ * We set details->zap_mapping when we want to unmap shared but keep private
+ * pages. Return true if skip zapping this page, false otherwise.
+ */
+static inline bool
+zap_skip_check_mapping(struct zap_details *details, struct page *page)
+{
+ if (!details || !page)
+ return false;
+
+ return details->zap_mapping &&
+ (details->zap_mapping != page_rmapping(page));
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
- } else if (details && details->single_page &&
- PageTransCompound(details->single_page) &&
+ } else if (details && details->single_folio &&
+ folio_test_pmd_mappable(details->single_folio) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
/*
}
/**
- * unmap_mapping_page() - Unmap single page from processes.
- * @page: The locked page to be unmapped.
+ * unmap_mapping_folio() - Unmap single folio from processes.
+ * @folio: The locked folio to be unmapped.
*
- * Unmap this page from any userspace process which still has it mmaped.
+ * Unmap this folio from any userspace process which still has it mmaped.
* Typically, for efficiency, the range of nearby pages has already been
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
- * truncation or invalidation holds the lock on a page, it may find that
- * the page has been remapped again: and then uses unmap_mapping_page()
+ * truncation or invalidation holds the lock on a folio, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_folio()
* to unmap it finally.
*/
-void unmap_mapping_page(struct page *page)
+void unmap_mapping_folio(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct zap_details details = { };
pgoff_t first_index;
pgoff_t last_index;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(!folio_test_locked(folio));
- first_index = page->index;
- last_index = page->index + thp_nr_pages(page) - 1;
+ first_index = folio->index;
+ last_index = folio->index + folio_nr_pages(folio) - 1;
details.zap_mapping = mapping;
- details.single_page = page;
+ details.single_folio = folio;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
pte = pte_mkuffd_wp(pte);
pte = pte_wrprotect(pte);
}
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
- arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
/* ksm created a completely new copy */
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
+
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
return (range->start + range_len(range)) >> PAGE_SHIFT;
}
- static unsigned long pfn_next(unsigned long pfn)
+ static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
{
- if (pfn % 1024 == 0)
+ if (pfn % (1024 << pgmap->vmemmap_shift))
cond_resched();
- return pfn + 1;
+ return pfn + pgmap_vmemmap_nr(pgmap);
+ }
+
+ static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
+ {
+ return (pfn_end(pgmap, range_id) -
+ pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
#define for_each_device_pfn(pfn, map, i) \
- for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
+ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
+ pfn = pfn_next(map, pfn))
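A standalone arithmetic sketch of the new stride, with an assumed vmemmap_shift of 9 (512 base pages per compound device page): iteration now advances one compound page per step, and pfn_len() counts compound pages rather than base pfns.

#include <stdio.h>

int main(void)
{
	unsigned long vmemmap_shift = 9;	/* assumed: 2MB compound pages */
	unsigned long vmemmap_nr = 1UL << vmemmap_shift;
	unsigned long pfn_first = 0x100000;
	unsigned long pfn_end = pfn_first + 4096;
	unsigned long pfn, steps = 0;

	for (pfn = pfn_first; pfn < pfn_end; pfn += vmemmap_nr)
		steps++;			/* one put_page() per head page */

	printf("compound pages walked: %lu, pfn_len: %lu\n",
	       steps, (pfn_end - pfn_first) >> vmemmap_shift);
	return 0;
}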
-static void dev_pagemap_kill(struct dev_pagemap *pgmap)
-{
- if (pgmap->ops && pgmap->ops->kill)
- pgmap->ops->kill(pgmap);
- else
- percpu_ref_kill(pgmap->ref);
-}
-
-static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
-{
- if (pgmap->ops && pgmap->ops->cleanup) {
- pgmap->ops->cleanup(pgmap);
- } else {
- wait_for_completion(&pgmap->done);
- percpu_ref_exit(pgmap->ref);
- }
- /*
- * Undo the pgmap ref assignment for the internal case as the
- * caller may re-enable the same pgmap.
- */
- if (pgmap->ref == &pgmap->internal_ref)
- pgmap->ref = NULL;
-}
-
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
struct range *range = &pgmap->ranges[range_id];
unsigned long pfn;
int i;
- dev_pagemap_kill(pgmap);
+ percpu_ref_kill(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
for_each_device_pfn(pfn, pgmap, i)
put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
+ wait_for_completion(&pgmap->done);
+ percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
pageunmap_range(pgmap, i);
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
- struct dev_pagemap *pgmap =
- container_of(ref, struct dev_pagemap, internal_ref);
+ struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
complete(&pgmap->done);
}
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(&pgmap->ref,
- pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id));
- percpu_ref_get_many(pgmap->ref, pfn_len(pgmap, range_id));
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
break;
}
- if (!pgmap->ref) {
- if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
- return ERR_PTR(-EINVAL);
-
- init_completion(&pgmap->done);
- error = percpu_ref_init(&pgmap->internal_ref,
- dev_pagemap_percpu_release, 0, GFP_KERNEL);
- if (error)
- return ERR_PTR(error);
- pgmap->ref = &pgmap->internal_ref;
- } else {
- if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
- WARN(1, "Missing reference count teardown definition\n");
- return ERR_PTR(-EINVAL);
- }
- }
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
+ GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
devmap_managed_enable_get(pgmap);
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>
+ #include <linux/random.h>
#include <asm/tlbflush.h>
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
- set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, pvmw.address);
else
page_dup_rmap(new, true);
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
} else
#endif
{
- set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
if (PageAnon(new))
page_add_anon_rmap(new, vma, pvmw.address, false);
else
page_add_file_rmap(new, false);
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
{
pte_t pte;
swp_entry_t entry;
- struct page *page;
+ struct folio *folio;
spin_lock(ptl);
pte = *ptep;
if (!is_migration_entry(entry))
goto out;
- page = pfn_swap_entry_to_page(entry);
- page = compound_head(page);
+ folio = page_folio(pfn_swap_entry_to_page(entry));
/*
* Once page cache replacement of page migration started, page_count
- * is zero; but we must not call put_and_wait_on_page_locked() without
- * a ref. Use get_page_unless_zero(), and just fault again if it fails.
+ * is zero; but we must not call folio_put_wait_locked() without
+ * a ref. Use folio_try_get(), and just fault again if it fails.
*/
- if (!get_page_unless_zero(page))
+ if (!folio_try_get(folio))
goto out;
pte_unmap_unlock(ptep, ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+ folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
return;
out:
pte_unmap_unlock(ptep, ptl);
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
- if (!get_page_unless_zero(page))
+ folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)));
+ if (!folio_try_get(folio))
goto unlock;
spin_unlock(ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+ folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
return;
unlock:
spin_unlock(ptl);
}
xas_store(&xas, newfolio);
- if (nr > 1) {
- int i;
-
- for (i = 1; i < nr; i++) {
- xas_next(&xas);
- xas_store(&xas, newfolio);
- }
- }
/*
* Drop cache reference from old page by unfreezing
return rc;
}
-
- /*
- * node_demotion[] example:
- *
- * Consider a system with two sockets. Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node. The
- * CPUs are placed in the node with the "fast" memory. The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- * Socket A: 0, 1, 2
- * Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1. When Node 1 fills up, it should be migrated to
- * Node 2. The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- * 0 -> 1 -> 2 -> stop
- * 3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- * { 1, // Node 0 migrates to 1
- * 2, // Node 1 migrates to 2
- * -1, // Node 2 does not migrate
- * 4, // Node 3 migrates to 4
- * 5, // Node 4 migrates to 5
- * -1} // Node 5 does not migrate
- */
-
- /*
- * Writes to this array occur without locking. Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
- static int node_demotion[MAX_NUMNODES] __read_mostly =
- {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
-
- /**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
- int next_demotion_node(int node)
- {
- int target;
-
- /*
- * node_demotion[] is updated without excluding this
- * function from running. RCU doesn't provide any
- * compiler barriers, so the READ_ONCE() is required
- * to avoid compiler reordering or read merging.
- *
- * Make sure to use RCU over entire code blocks if
- * node_demotion[] reads need to be consistent.
- */
- rcu_read_lock();
- target = READ_ONCE(node_demotion[node]);
- rcu_read_unlock();
-
- return target;
- }
-
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
* @reason: The reason for page migration.
- * @ret_succeeded: Set to the number of pages migrated successfully if
+ * @ret_succeeded: Set to the number of normal pages migrated successfully if
* the caller passes a non-NULL pointer.
*
* The function returns after 10 attempts or if no pages are movable any more
* It is caller's responsibility to call putback_movable_pages() to return pages
* to the LRU or free list only if ret != 0.
*
- * Returns the number of pages that were not migrated, or an error code.
+ * Returns the number of {normal pages, THPs, hugetlb pages} that were not
+ * migrated, or an error code. The number of THP splits is counted as the number
+ * of non-migrated THPs, no matter how many subpages of those THPs were migrated
+ * successfully.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
int retry = 1;
int thp_retry = 1;
int nr_failed = 0;
+ int nr_failed_pages = 0;
int nr_succeeded = 0;
int nr_thp_succeeded = 0;
int nr_thp_failed = 0;
int swapwrite = current->flags & PF_SWAPWRITE;
int rc, nr_subpages;
LIST_HEAD(ret_pages);
+ LIST_HEAD(thp_split_pages);
bool nosplit = (reason == MR_NUMA_MISPLACED);
+ bool no_subpage_counting = false;
trace_mm_migrate_pages_start(mode, reason);
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
+ thp_subpage_migration:
for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
thp_retry = 0;
* during migration.
*/
is_thp = PageTransHuge(page) && !PageHuge(page);
- nr_subpages = thp_nr_pages(page);
+ nr_subpages = compound_nr(page);
cond_resched();
if (PageHuge(page))
case -ENOSYS:
/* THP migration is unsupported */
if (is_thp) {
- if (!try_split_thp(page, &page2, from)) {
+ nr_thp_failed++;
+ if (!try_split_thp(page, &page2, &thp_split_pages)) {
nr_thp_split++;
goto retry;
}
- nr_thp_failed++;
- nr_failed += nr_subpages;
+ nr_failed_pages += nr_subpages;
break;
}
/* Hugetlb migration is unsupported */
- nr_failed++;
+ if (!no_subpage_counting)
+ nr_failed++;
+ nr_failed_pages += nr_subpages;
break;
case -ENOMEM:
/*
* THP NUMA faulting doesn't split THP to retry.
*/
if (is_thp && !nosplit) {
- if (!try_split_thp(page, &page2, from)) {
+ nr_thp_failed++;
+ if (!try_split_thp(page, &page2, &thp_split_pages)) {
nr_thp_split++;
goto retry;
}
- nr_thp_failed++;
- nr_failed += nr_subpages;
+ nr_failed_pages += nr_subpages;
goto out;
}
- nr_failed++;
+
+ if (!no_subpage_counting)
+ nr_failed++;
+ nr_failed_pages += nr_subpages;
goto out;
case -EAGAIN:
if (is_thp) {
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ nr_succeeded += nr_subpages;
if (is_thp) {
nr_thp_succeeded++;
- nr_succeeded += nr_subpages;
break;
}
- nr_succeeded++;
break;
default:
/*
*/
if (is_thp) {
nr_thp_failed++;
- nr_failed += nr_subpages;
+ nr_failed_pages += nr_subpages;
break;
}
- nr_failed++;
+
+ if (!no_subpage_counting)
+ nr_failed++;
+ nr_failed_pages += nr_subpages;
break;
}
}
}
- nr_failed += retry + thp_retry;
+ nr_failed += retry;
nr_thp_failed += thp_retry;
- rc = nr_failed;
+ /*
+ * Try to migrate the subpages of THPs that failed to migrate. Do not
+ * count them in nr_failed in this round, since all subpages of a THP
+ * were already counted as one failure in the first round.
+ */
+ if (!list_empty(&thp_split_pages)) {
+ /*
+ * Move non-migrated pages (after 10 retries) to ret_pages
+ * to avoid migrating them again.
+ */
+ list_splice_init(from, &ret_pages);
+ list_splice_init(&thp_split_pages, from);
+ no_subpage_counting = true;
+ retry = 1;
+ goto thp_subpage_migration;
+ }
+
+ rc = nr_failed + nr_thp_failed;
out:
/*
* Put the permanent failure page back to migration list, they
list_splice(&ret_pages, from);
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- unsigned long addr, i, restore = 0;
+ unsigned long i, restore = 0;
bool allow_drain = true;
lru_add_drain();
}
}
- for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ for (i = 0; i < npages && restore; i++) {
struct page *page = migrate_pfn_to_page(migrate->src[i]);
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
+ /*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets. Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node. The
+ * CPUs are placed in the node with the "fast" memory. The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ * Socket A: 0, 1, 2
+ * Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1. When Node 1 fills up, it should be migrated to
+ * Node 2. The migration path start on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progress through medium and end with the
+ * slow memory:
+ *
+ * 0 -> 1 -> 2 -> stop
+ * 3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
+ * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
+ * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
+ * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
+ * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
+ * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
+ *
+ * Moreover, some systems may have multiple slow memory nodes.
+ * Suppose a system has one socket with 3 memory nodes: node 0
+ * is fast memory, nodes 1 and 2 are both slow memory, and the
+ * distance from the fast memory node to each slow memory node
+ * is the same. Then the migration path should be:
+ *
+ * 0 -> 1/2 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
+ * { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
+ * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
+ */
+
+ /*
+ * Writes to this array occur without locking. Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+ #define DEFAULT_DEMOTION_TARGET_NODES 15
+
+ #if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
+ #define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
+ #else
+ #define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
+ #endif
+
+ struct demotion_nodes {
+ unsigned short nr;
+ short nodes[DEMOTION_TARGET_NODES];
+ };
+
+ static struct demotion_nodes *node_demotion __read_mostly;
+
+ /**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+ int next_demotion_node(int node)
+ {
+ struct demotion_nodes *nd;
+ unsigned short target_nr, index;
+ int target;
+
+ if (!node_demotion)
+ return NUMA_NO_NODE;
+
+ nd = &node_demotion[node];
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running. RCU doesn't provide any
+ * compiler barriers, so the READ_ONCE() is required
+ * to avoid compiler reordering or read merging.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ target_nr = READ_ONCE(nd->nr);
+
+ switch (target_nr) {
+ case 0:
+ target = NUMA_NO_NODE;
+ goto out;
+ case 1:
+ index = 0;
+ break;
+ default:
+ /*
+ * If there are multiple target nodes, just select one
+ * target node randomly.
+ *
+ * In addition, round-robin selection would also work, but it
+ * would need an extra field in node_demotion[] to record the
+ * last selected target node, and updating that field may cause
+ * cache ping-pong. Per-CPU data could avoid the caching issue
+ * but seems more complicated. So, for now, selecting the target
+ * node randomly seems better.
+ */
+ index = get_random_int() % target_nr;
+ break;
+ }
+
+ target = READ_ONCE(nd->nodes[index]);
+
+ out:
+ rcu_read_unlock();
+ return target;
+ }
+
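A userspace model of the selection logic above, with hypothetical node IDs; rand() stands in for get_random_int() to spread demotions across several equally good targets.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DEMOTION_TARGET_NODES 15
#define NUMA_NO_NODE (-1)

struct demotion_nodes {
	unsigned short nr;
	short nodes[DEMOTION_TARGET_NODES];
};

static int next_demotion_node_model(const struct demotion_nodes *nd)
{
	if (nd->nr == 0)
		return NUMA_NO_NODE;
	if (nd->nr == 1)
		return nd->nodes[0];
	/* Several equally good targets: pick one at random. */
	return nd->nodes[rand() % nd->nr];
}

int main(void)
{
	/* Node 0 can demote to nodes 1 and 2 (same best distance). */
	struct demotion_nodes node0 = { .nr = 2, .nodes = { 1, 2 } };
	int i;

	srand((unsigned int)time(NULL));
	for (i = 0; i < 5; i++)
		printf("demotion target: %d\n", next_demotion_node_model(&node0));
	return 0;
}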
#if defined(CONFIG_HOTPLUG_CPU)
/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
- int node;
+ int node, i;
+
+ if (!node_demotion)
+ return;
- for_each_online_node(node)
- node_demotion[node] = NUMA_NO_NODE;
+ for_each_online_node(node) {
+ node_demotion[node].nr = 0;
+ for (i = 0; i < DEMOTION_TARGET_NODES; i++)
+ node_demotion[node].nodes[i] = NUMA_NO_NODE;
+ }
}
static void disable_all_migrate_targets(void)
* Failing here is OK. It might just indicate
* being at the end of a chain.
*/
- static int establish_migrate_target(int node, nodemask_t *used)
+ static int establish_migrate_target(int node, nodemask_t *used,
+ int best_distance)
{
- int migration_target;
+ int migration_target, index, val;
+ struct demotion_nodes *nd;
- /*
- * Can not set a migration target on a
- * node with it already set.
- *
- * No need for READ_ONCE() here since this
- * in the write path for node_demotion[].
- * This should be the only thread writing.
- */
- if (node_demotion[node] != NUMA_NO_NODE)
+ if (!node_demotion)
return NUMA_NO_NODE;
+ nd = &node_demotion[node];
+
migration_target = find_next_best_node(node, used);
if (migration_target == NUMA_NO_NODE)
return NUMA_NO_NODE;
- node_demotion[node] = migration_target;
+ /*
+ * If a migration target has already been set for this node, that
+ * target is at the best distance. Still check whether this node
+ * can also be demoted to other target nodes that share the same
+ * best distance.
+ */
+ if (best_distance != -1) {
+ val = node_distance(node, migration_target);
+ if (val > best_distance)
+ return NUMA_NO_NODE;
+ }
+
+ index = nd->nr;
+ if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
+ "Exceeds maximum demotion target nodes\n"))
+ return NUMA_NO_NODE;
+
+ nd->nodes[index] = migration_target;
+ nd->nr++;
return migration_target;
}
*
* The difference here is that cycles must be avoided. If
* node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0.
+ * node1 migrates to can migrate to node0. Also, one node can
+ * be migrated to multiple nodes if those target nodes all have
+ * the same best distance to the source node.
*
* This function can run simultaneously with readers of
* node_demotion[]. However, it can not run simultaneously
nodemask_t next_pass = NODE_MASK_NONE;
nodemask_t this_pass = NODE_MASK_NONE;
nodemask_t used_targets = NODE_MASK_NONE;
- int node;
+ int node, best_distance;
/*
* Avoid any oddities like cycles that could occur
* multiple source nodes to share a destination.
*/
nodes_or(used_targets, used_targets, this_pass);
- for_each_node_mask(node, this_pass) {
- int target_node = establish_migrate_target(node, &used_targets);
- if (target_node == NUMA_NO_NODE)
- continue;
+ for_each_node_mask(node, this_pass) {
+ best_distance = -1;
/*
- * Visit targets from this pass in the next pass.
- * Eventually, every node will have been part of
- * a pass, and will become set in 'used_targets'.
+ * Try to set up the migration path for the node. There can be
+ * multiple target nodes, so loop to collect every target that
+ * shares the best node distance.
*/
- node_set(target_node, next_pass);
+ do {
+ int target_node =
+ establish_migrate_target(node, &used_targets,
+ best_distance);
+
+ if (target_node == NUMA_NO_NODE)
+ break;
+
+ if (best_distance == -1)
+ best_distance = node_distance(node, target_node);
+
+ /*
+ * Visit targets from this pass in the next pass.
+ * Eventually, every node will have been part of
+ * a pass, and will become set in 'used_targets'.
+ */
+ node_set(target_node, next_pass);
+ } while (1);
}
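The inner loop above keeps accepting targets only while they sit at the first (best) distance found. A small model with a made-up node_distance() table that collects every target matching the best distance:

#include <stdio.h>

#define NR_NODES 3

/* Hypothetical distance table: node 0 is fast, nodes 1 and 2 are slow. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20 },
	{ 20, 10, 20 },
	{ 20, 20, 10 },
};

int main(void)
{
	int node = 0, best_distance = -1;
	int used[NR_NODES] = { 1, 0, 0 };	/* the source node is already used */
	int candidate;

	for (candidate = 0; candidate < NR_NODES; candidate++) {
		if (used[candidate])
			continue;
		if (best_distance == -1)
			best_distance = distance[node][candidate];
		else if (distance[node][candidate] > best_distance)
			break;	/* worse than the best distance: stop, like the kernel loop */
		printf("node %d demotes to node %d (distance %d)\n",
		       node, candidate, distance[node][candidate]);
		used[candidate] = 1;
	}
	return 0;
}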
/*
* 'next_pass' contains nodes which became migration
{
int ret;
+ node_demotion = kmalloc_array(nr_node_ids,
+ sizeof(struct demotion_nodes),
+ GFP_KERNEL);
+ WARN_ON(!node_demotion);
+
ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
NULL, migration_offline_cpu);
/*
struct shmem_inode_info *info;
struct page *page;
unsigned long batch = sc ? sc->nr_to_scan : 128;
- int removed = 0, split = 0;
+ int split = 0;
if (list_empty(&sbinfo->shrinklist))
return SHRINK_STOP;
/* inode is about to be evicted */
if (!inode) {
list_del_init(&info->shrinklist);
- removed++;
goto next;
}
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
list_move(&info->shrinklist, &to_remove);
- removed++;
goto next;
}
list_move(&info->shrinklist, &list);
next:
+ sbinfo->shrinklist_len--;
if (!--batch)
break;
}
inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split)
- goto leave;
+ goto move_back;
page = find_get_page(inode->i_mapping,
(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
}
/*
- * Leave the inode on the list if we failed to lock
- * the page at this time.
+ * Move the inode from the local list back to the shrinklist if
+ * we failed to lock the page at this time.
*
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
if (!trylock_page(page)) {
put_page(page);
- goto leave;
+ goto move_back;
}
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
- /* If split failed leave the inode on the list */
+ /* If split failed move the inode on the list back to shrinklist */
if (ret)
- goto leave;
+ goto move_back;
split++;
drop:
list_del_init(&info->shrinklist);
- removed++;
- leave:
+ goto put;
+ move_back:
+ /*
+ * Make sure the inode is either on the global list or deleted
+ * from any local list before iput() since it could be deleted
+ * in another thread once we put the inode (then the local list
+ * is corrupted).
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_move(&info->shrinklist, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ spin_unlock(&sbinfo->shrinklist_lock);
+ put:
iput(inode);
}
- spin_lock(&sbinfo->shrinklist_lock);
- list_splice_tail(&list, &sbinfo->shrinklist);
- sbinfo->shrinklist_len -= removed;
- spin_unlock(&sbinfo->shrinklist_lock);
-
return split;
}
struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
- unsigned long i = 0;
unsigned long nr = compound_nr(page);
int error;
cgroup_throttle_swaprate(page, gfp);
do {
- void *entry;
xas_lock_irq(&xas);
- entry = xas_find_conflict(&xas);
- if (entry != expected)
+ if (expected != xas_find_conflict(&xas)) {
xas_set_err(&xas, -EEXIST);
- xas_create_range(&xas);
- if (xas_error(&xas))
goto unlock;
-next:
- xas_store(&xas, page);
- if (++i < nr) {
- xas_next(&xas);
- goto next;
}
+ if (expected && xas_find_conflict(&xas)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
}
}
-/*
- * Check whether a hole-punch or truncation needs to split a huge page,
- * returning true if no split was required, or the split has been successful.
- *
- * Eviction (or truncation to 0 size) should never need to split a huge page;
- * but in rare cases might do so, if shmem_undo_range() failed to trylock on
- * head, and then succeeded to trylock on tail.
- *
- * A split can only succeed when there are no additional references on the
- * huge page: so the split below relies upon find_get_entries() having stopped
- * when it found a subpage of the huge page, without getting further references.
- */
-static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
- if (!PageTransCompound(page))
- return true;
-
- /* Just proceed to delete a huge page wholly within the range punched */
- if (PageHead(page) &&
- page->index >= start && page->index + HPAGE_PMD_NR <= end)
- return true;
+ struct folio *folio;
+ struct page *page;
- /* Try to split huge page, so we can truly punch the hole or truncate */
- return split_huge_page(page) >= 0;
+ /*
+ * At first avoid shmem_getpage(,,,SGP_READ): that fails
+ * beyond i_size, and reports fallocated pages as holes.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_ENTRY | FGP_LOCK, 0);
+ if (!xa_is_value(folio))
+ return folio;
+ /*
+ * But read a page back from swap if any of it is within i_size
+ * (although in some cases this is just a waste of time).
+ */
+ page = NULL;
+ shmem_getpage(inode, index, &page, SGP_READ);
+ return page ? page_folio(page) : NULL;
}
/*
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
- unsigned int partial_start = lstart & (PAGE_SIZE - 1);
- unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
+ struct folio *folio;
+ bool same_folio;
long nr_swaps_freed = 0;
pgoff_t index;
int i;
if (info->fallocend > start && info->fallocend <= end && !unfalloc)
info->fallocend = start;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
- &pvec, indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ index, folio);
continue;
}
- index += thp_nr_pages(page) - 1;
+ index += folio_nr_pages(folio) - 1;
- if (!unfalloc || !PageUptodate(page))
- truncate_inode_page(mapping, page);
- unlock_page(page);
+ if (!unfalloc || !folio_test_uptodate(folio))
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}
- if (partial_start) {
- struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- top = partial_end;
- partial_end = 0;
- }
- zero_user_segment(page, partial_start, top);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
+ if (folio) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ);
- if (page) {
- zero_user_segment(page, 0, partial_end);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- }
+
+ if (!same_folio)
+ folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
+ if (folio) {
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- if (start >= end)
- return;
index = start;
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &pvec,
+ if (!find_get_entries(mapping, index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
index = start;
continue;
}
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, page)) {
+ if (shmem_free_swap(mapping, index, folio)) {
/* Swap was replaced by page: retry */
index--;
break;
continue;
}
- lock_page(page);
+ folio_lock(folio);
- if (!unfalloc || !PageUptodate(page)) {
- if (page_mapping(page) != mapping) {
+ if (!unfalloc || !folio_test_uptodate(folio)) {
+ if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
- unlock_page(page);
+ folio_unlock(folio);
index--;
break;
}
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (shmem_punch_compound(page, start, end))
- truncate_inode_page(mapping, page);
- else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- /* Wipe the page and don't get stuck */
- clear_highpage(page);
- flush_dcache_page(page);
- set_page_dirty(page);
- if (index <
- round_up(start, HPAGE_PMD_NR))
- start = index + 1;
- }
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio),
+ folio);
+ truncate_inode_folio(mapping, folio);
}
- unlock_page(page);
+ index = folio->index + folio_nr_pages(folio) - 1;
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
index++;
}
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
- true);
+ page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ int ret = 0;
/* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+
+ if (ret)
+ return ret;
+
+ if (PageHWPoison(*pagep)) {
+ unlock_page(*pagep);
+ put_page(*pagep);
+ *pagep = NULL;
+ return -EIO;
+ }
+
+ return 0;
}
static int
if (sgp == SGP_CACHE)
set_page_dirty(page);
unlock_page(page);
+
+ if (PageHWPoison(page)) {
+ put_page(page);
+ error = -EIO;
+ break;
+ }
}
/*
page = find_get_page(inode->i_mapping, 0);
if (!page)
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
+ if (PageHWPoison(page) ||
+ !PageUptodate(page)) {
put_page(page);
return ERR_PTR(-ECHILD);
}
error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (PageHWPoison(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
unlock_page(page);
}
set_delayed_call(done, shmem_put_link, page);
kmem_cache_destroy(shmem_inode_cachep);
}
+ /* Keep the page in page cache instead of truncating it */
+ static int shmem_error_remove_page(struct address_space *mapping,
+ struct page *page)
+ {
+ return 0;
+ }
+
const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_page = shmem_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
- page = ERR_PTR(error);
- else
- unlock_page(page);
+ return ERR_PTR(error);
+
+ unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+
return page;
#else
/*
* Internal slab definitions
*/
+/* Reuses the bits in struct page */
+struct slab {
+ unsigned long __page_flags;
+
+#if defined(CONFIG_SLAB)
+
+ union {
+ struct list_head slab_list;
+ struct rcu_head rcu_head;
+ };
+ struct kmem_cache *slab_cache;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ unsigned int active;
+
+#elif defined(CONFIG_SLUB)
+
+ union {
+ struct list_head slab_list;
+ struct rcu_head rcu_head;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ struct kmem_cache *slab_cache;
+ /* Double-word boundary */
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
+ unsigned int __unused;
+
+#elif defined(CONFIG_SLOB)
+
+ struct list_head slab_list;
+ void *__unused_1;
+ void *freelist; /* first free block */
+ long units;
+ unsigned int __unused_2;
+
+#else
+#error "Unexpected slab allocator configured"
+#endif
+
+ atomic_t __page_refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+#endif
+};
+
+#define SLAB_MATCH(pg, sl) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
+SLAB_MATCH(flags, __page_flags);
+SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
+SLAB_MATCH(slab_list, slab_list);
+#ifndef CONFIG_SLOB
+SLAB_MATCH(rcu_head, rcu_head);
+SLAB_MATCH(slab_cache, slab_cache);
+#endif
+#ifdef CONFIG_SLAB
+SLAB_MATCH(s_mem, s_mem);
+SLAB_MATCH(active, active);
+#endif
+SLAB_MATCH(_refcount, __page_refcount);
+#ifdef CONFIG_MEMCG
+SLAB_MATCH(memcg_data, memcg_data);
+#endif
+#undef SLAB_MATCH
+static_assert(sizeof(struct slab) <= sizeof(struct page));
+
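The SLAB_MATCH pattern is plain C11: offsetof() plus static_assert() pins two independently declared structs to the same layout at compile time. A standalone sketch with made-up structs:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Two views of the same memory, kept in sync at compile time. */
struct generic_desc {
	unsigned long flags;
	void *payload;
	int refcount;
};

struct typed_desc {
	unsigned long __flags;
	void *first_object;
	int __refcount;
};

#define DESC_MATCH(g, t) \
	static_assert(offsetof(struct generic_desc, g) == \
		      offsetof(struct typed_desc, t), #g " offset mismatch")
DESC_MATCH(flags, __flags);
DESC_MATCH(payload, first_object);
DESC_MATCH(refcount, __refcount);
#undef DESC_MATCH
static_assert(sizeof(struct typed_desc) <= sizeof(struct generic_desc),
	      "typed view must not outgrow the generic one");

int main(void)
{
	puts("layout checks passed at compile time");
	return 0;
}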
+/**
+ * folio_slab - Converts from folio to slab.
+ * @folio: The folio.
+ *
+ * Currently struct slab is a different representation of a folio where
+ * folio_test_slab() is true.
+ *
+ * Return: The slab which contains this folio.
+ */
+#define folio_slab(folio) (_Generic((folio), \
+ const struct folio *: (const struct slab *)(folio), \
+ struct folio *: (struct slab *)(folio)))
+
+/**
+ * slab_folio - The folio allocated for a slab
+ * @slab: The slab.
+ *
+ * Slabs are allocated as folios that contain the individual objects and are
+ * using some fields in the first struct page of the folio - those fields are
+ * now accessed by struct slab. It is occasionally necessary to convert back to
+ * a folio in order to communicate with the rest of the mm. Please use this
+ * helper function instead of casting yourself, as the implementation may change
+ * in the future.
+ */
+#define slab_folio(s) (_Generic((s), \
+ const struct slab *: (const struct folio *)s, \
+ struct slab *: (struct folio *)s))
+
+/**
+ * page_slab - Converts from first struct page to slab.
+ * @p: The first (either head of compound or single) page of slab.
+ *
+ * A temporary wrapper to convert struct page to struct slab in situations where
+ * we know the page is the compound head, or single order-0 page.
+ *
+ * Long-term ideally everything would work with struct slab directly or go
+ * through folio to struct slab.
+ *
+ * Return: The slab which contains this page
+ */
+#define page_slab(p) (_Generic((p), \
+ const struct page *: (const struct slab *)(p), \
+ struct page *: (struct slab *)(p)))
+
+/**
+ * slab_page - The first struct page allocated for a slab
+ * @slab: The slab.
+ *
+ * A convenience wrapper for converting slab to the first struct page of the
+ * underlying folio, to communicate with code not yet converted to folio or
+ * struct slab.
+ */
+#define slab_page(s) folio_page(slab_folio(s), 0)
+
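folio_slab() and friends lean on C11 _Generic to make the cast preserve const-ness. The same trick in a standalone form, with hypothetical types:

#include <stdio.h>

struct outer { int x; };
struct inner { int x; };	/* same layout, different static type */

/* Cast outer* to inner* while keeping any const qualifier intact. */
#define outer_to_inner(p) (_Generic((p),			\
	const struct outer *: (const struct inner *)(p),	\
	struct outer *: (struct inner *)(p)))

int main(void)
{
	struct outer o = { .x = 42 };
	const struct outer *co = &o;

	struct inner *i = outer_to_inner(&o);		/* mutable in, mutable out */
	const struct inner *ci = outer_to_inner(co);	/* const in, const out */

	printf("%d %d\n", i->x, ci->x);
	return 0;
}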
+/*
+ * If network-based swap is enabled, sl*b must keep track of whether pages
+ * were allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+ return folio_test_active((struct folio *)slab_folio(slab));
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+ folio_set_active(slab_folio(slab));
+}
+
+static inline void slab_clear_pfmemalloc(struct slab *slab)
+{
+ folio_clear_active(slab_folio(slab));
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+ __folio_clear_active(slab_folio(slab));
+}
+
+static inline void *slab_address(const struct slab *slab)
+{
+ return folio_address(slab_folio(slab));
+}
+
+static inline int slab_nid(const struct slab *slab)
+{
+ return folio_nid(slab_folio(slab));
+}
+
+static inline pg_data_t *slab_pgdat(const struct slab *slab)
+{
+ return folio_pgdat(slab_folio(slab));
+}
+
+static inline struct slab *virt_to_slab(const void *addr)
+{
+ struct folio *folio = virt_to_folio(addr);
+
+ if (!folio_test_slab(folio))
+ return NULL;
+
+ return folio_slab(folio);
+}
+
+static inline int slab_order(const struct slab *slab)
+{
+ return folio_order((struct folio *)slab_folio(slab));
+}
+
+static inline size_t slab_size(const struct slab *slab)
+{
+ return PAGE_SIZE << slab_order(slab);
+}
+
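
A hedged sketch of the lookup path these accessors enable (the function name is invented for illustration): virt_to_slab() resolves an arbitrary object pointer to its backing slab, returning NULL for non-slab memory, after which the slab's base address, size and NUMA node can be read directly:

static void describe_slab_object(const void *obj)
{
	struct slab *slab = virt_to_slab(obj);

	if (!slab)
		return;	/* not slab-backed, e.g. a large kmalloc() page */

	pr_info("object %p: slab base %p, %zu bytes, node %d\n",
		obj, slab_address(slab), slab_size(slab), slab_nid(slab));
}
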
#ifdef CONFIG_SLOB
/*
* Common fields provided in kmem_cache by all slab allocators
}
#ifdef CONFIG_MEMCG_KMEM
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp, bool new_page);
+/*
+ * slab_objcgs - get the object cgroups vector associated with a slab
+ * @slab: a pointer to the slab struct
+ *
+ * Returns a pointer to the object cgroups vector associated with the slab,
+ * or NULL if no such vector has been associated yet.
+ */
+static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+{
+ unsigned long memcg_data = READ_ONCE(slab->memcg_data);
+
+ VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
+ slab_page(slab));
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
+
+ return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab);
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr);
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline void memcg_free_slab_cgroups(struct slab *slab)
{
- kfree(page_objcgs(page));
- page->memcg_data = 0;
+ kfree(slab_objcgs(slab));
+ slab->memcg_data = 0;
}
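
For orientation, a sketch of how slab_objcgs() is meant to be consumed (the helper name is invented; it assumes obj_to_index() already takes a struct slab, as converted elsewhere in this series): look up the slab, fetch its objcg vector if one exists, and index it by the object's position, mirroring what the alloc/free hooks in the hunks below do per element.

static struct obj_cgroup *objcg_of(struct kmem_cache *s, void *p)
{
	struct slab *slab = virt_to_slab(p);
	struct obj_cgroup **objcgs;

	if (!slab)		/* e.g. a kmalloc_large() object */
		return NULL;
	objcgs = slab_objcgs(slab);
	if (!objcgs)		/* no obj_cgroup vector associated yet */
		return NULL;
	return objcgs[obj_to_index(s, slab, p)];
}
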
static inline size_t obj_full_size(struct kmem_cache *s)
gfp_t flags, size_t size,
void **p)
{
- struct page *page;
+ struct slab *slab;
unsigned long off;
size_t i;
for (i = 0; i < size; i++) {
if (likely(p[i])) {
- page = virt_to_head_page(p[i]);
+ slab = virt_to_slab(p[i]);
- if (!page_objcgs(page) &&
- memcg_alloc_page_obj_cgroups(page, s, flags,
+ if (!slab_objcgs(slab) &&
+ memcg_alloc_slab_cgroups(slab, s, flags,
false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
- off = obj_to_index(s, page, p[i]);
+ off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
- page_objcgs(page)[off] = objcg;
- mod_objcg_state(objcg, page_pgdat(page),
+ slab_objcgs(slab)[off] = objcg;
+ mod_objcg_state(objcg, slab_pgdat(slab),
cache_vmstat_idx(s), obj_full_size(s));
} else {
obj_cgroup_uncharge(objcg, obj_full_size(s));
struct kmem_cache *s;
struct obj_cgroup **objcgs;
struct obj_cgroup *objcg;
- struct page *page;
+ struct slab *slab;
unsigned int off;
int i;
if (unlikely(!p[i]))
continue;
- page = virt_to_head_page(p[i]);
- objcgs = page_objcgs_check(page);
+ slab = virt_to_slab(p[i]);
+ /* we could be given a kmalloc_large() object, skip those */
+ if (!slab)
+ continue;
+
+ objcgs = slab_objcgs(slab);
if (!objcgs)
continue;
if (!s_orig)
- s = page->slab_cache;
+ s = slab->slab_cache;
else
s = s_orig;
- off = obj_to_index(s, page, p[i]);
+ off = obj_to_index(s, slab, p[i]);
objcg = objcgs[off];
if (!objcg)
continue;
objcgs[off] = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
-obj_full_size(s));
obj_cgroup_put(objcg);
}
}
#else /* CONFIG_MEMCG_KMEM */
+static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
}
-static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+static inline int memcg_alloc_slab_cgroups(struct slab *slab,
struct kmem_cache *s, gfp_t gfp,
- bool new_page)
+ bool new_slab)
{
return 0;
}
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
}
#endif /* CONFIG_MEMCG_KMEM */
+#ifndef CONFIG_SLOB
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
- struct page *page;
+ struct slab *slab;
- page = virt_to_head_page(obj);
- if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
+ slab = virt_to_slab(obj);
+ if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n",
__func__))
return NULL;
- return page->slab_cache;
+ return slab->slab_cache;
}
-static __always_inline void account_slab_page(struct page *page, int order,
- struct kmem_cache *s,
- gfp_t gfp)
+static __always_inline void account_slab(struct slab *slab, int order,
+ struct kmem_cache *s, gfp_t gfp)
{
if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
- memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+ memcg_alloc_slab_cgroups(slab, s, gfp, true);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
-static __always_inline void unaccount_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void unaccount_slab(struct slab *slab, int order,
+ struct kmem_cache *s)
{
if (memcg_kmem_enabled())
- memcg_free_page_obj_cgroups(page);
+ memcg_free_slab_cgroups(slab);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
}
print_tracking(cachep, x);
return cachep;
}
+#endif /* CONFIG_SLOB */
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#endif
- void *slab_start(struct seq_file *m, loff_t *pos);
- void *slab_next(struct seq_file *m, void *p, loff_t *pos);
- void slab_stop(struct seq_file *m, void *p);
- int memcg_slab_show(struct seq_file *m, void *p);
-
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
void dump_unreclaimable_slab(void);
#else
#define KS_ADDRS_COUNT 16
struct kmem_obj_info {
void *kp_ptr;
- struct page *kp_page;
+ struct slab *kp_slab;
void *kp_objp;
unsigned long kp_data_offset;
struct kmem_cache *kp_slab_cache;
void *kp_stack[KS_ADDRS_COUNT];
void *kp_free_stack[KS_ADDRS_COUNT];
};
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
+#endif
+
+#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user);
+#else
+static inline
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user)
+{
+}
#endif
#endif /* MM_SLAB_H */
void kmem_cache_destroy(struct kmem_cache *s)
{
- int err;
-
- if (unlikely(!s))
+ if (unlikely(!s) || !kasan_check_byte(s))
return;
cpus_read_lock();
if (s->refcount)
goto out_unlock;
- err = shutdown_cache(s);
- if (err) {
- pr_err("%s %s: Slab cache still has objects\n",
- __func__, s->name);
- dump_stack();
- }
+ WARN(shutdown_cache(s),
+ "%s %s: Slab cache still has objects when called from %pS",
+ __func__, s->name, (void *)_RET_IP_);
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
*/
bool kmem_valid_obj(void *object)
{
- struct page *page;
+ struct folio *folio;
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
return false;
- page = virt_to_head_page(object);
- return PageSlab(page);
+ folio = virt_to_folio(object);
+ return folio_test_slab(folio);
}
EXPORT_SYMBOL_GPL(kmem_valid_obj);
{
char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
int i;
- struct page *page;
+ struct slab *slab;
unsigned long ptroffset;
struct kmem_obj_info kp = { };
if (WARN_ON_ONCE(!virt_addr_valid(object)))
return;
- page = virt_to_head_page(object);
- if (WARN_ON_ONCE(!PageSlab(page))) {
+ slab = virt_to_slab(object);
+ if (WARN_ON_ONCE(!slab)) {
pr_cont(" non-slab memory.\n");
return;
}
- kmem_obj_info(&kp, object, page);
+ kmem_obj_info(&kp, object, slab);
if (kp.kp_slab_cache)
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
else
if (KMALLOC_MIN_SIZE >= 64) {
/*
- * The 96 byte size cache is not used if the alignment
+ * The 96 byte sized cache is not used if the alignment
* is 64 byte.
*/
for (i = 64 + 8; i <= 96; i += 8)
if (type == KMALLOC_RECLAIM) {
flags |= SLAB_RECLAIM_ACCOUNT;
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
- if (cgroup_memory_nokmem) {
+ if (mem_cgroup_kmem_disabled()) {
kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
return;
}
seq_putc(m, '\n');
}
- void *slab_start(struct seq_file *m, loff_t *pos)
+ static void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
return seq_list_start(&slab_caches, *pos);
}
- void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+ static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
- void slab_stop(struct seq_file *m, void *p)
+ static void slab_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
mutex_unlock(&slab_mutex);
}
- #if defined(CONFIG_MEMCG_KMEM)
- int memcg_slab_show(struct seq_file *m, void *p)
- {
- /*
- * Deprecated.
- * Please, take a look at tools/cgroup/slabinfo.py .
- */
- return 0;
- }
- #endif
-
/*
* slabinfo_op - iterator that generates /proc/slabinfo
*
* all online CPUs so any calls of lru_cache_disabled wrapped by
* local_lock or preemption disabled would be ordered by that.
* The atomic operation doesn't need to have stronger ordering
- * requirements because that is enforeced by the scheduling
+ * requirements because that is enforced by the scheduling
* guarantees.
*/
__lru_add_drain_all(true);
}
/**
- * pagevec_remove_exceptionals - pagevec exceptionals pruning
- * @pvec: The pagevec to prune
+ * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
+ * @fbatch: The batch to prune
*
- * find_get_entries() fills both pages and XArray value entries (aka
- * exceptional entries) into the pagevec. This function prunes all
- * exceptionals from @pvec without leaving holes, so that it can be
- * passed on to page-only pagevec operations.
+ * find_get_entries() fills a batch with both folios and shadow/swap/DAX
+ * entries. This function prunes all the non-folio entries from @fbatch
+ * without leaving holes, so that it can be passed on to folio-only batch
+ * operations.
*/
-void pagevec_remove_exceptionals(struct pagevec *pvec)
+void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
- int i, j;
+ unsigned int i, j;
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!xa_is_value(page))
- pvec->pages[j++] = page;
+ for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
+ if (!xa_is_value(folio))
+ fbatch->folios[j++] = folio;
}
- pvec->nr = j;
+ fbatch->nr = j;
}
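
The expected calling pattern, as an illustrative sketch (the function name is invented, and it assumes the folio_batch-based find_get_entries() from this same series): fill a batch, prune the value entries, then work on real folios only.

static void report_cached_folios(struct address_space *mapping,
				 pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int i;

	folio_batch_init(&fbatch);
	if (!find_get_entries(mapping, start, end, &fbatch, indices))
		return;

	/* Drop shadow/swap/DAX value entries; only real folios remain. */
	folio_batch_remove_exceptionals(&fbatch);
	for (i = 0; i < folio_batch_count(&fbatch); i++)
		pr_info("index %lu: %zu-byte folio\n",
			fbatch.folios[i]->index, folio_size(fbatch.folios[i]));
	folio_batch_release(&fbatch);	/* drops the batch's references */
}
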
/**
/*
* Unconditionally remove exceptional entries. Usually called from truncate
- * path. Note that the pagevec may be altered by this function by removing
- * exceptional entries similar to what pagevec_remove_exceptionals does.
+ * path. Note that the folio_batch may be altered by this function by removing
+ * exceptional entries, similarly to what folio_batch_remove_exceptionals() does.
*/
-static void truncate_exceptional_pvec_entries(struct address_space *mapping,
- struct pagevec *pvec, pgoff_t *indices)
+static void truncate_folio_batch_exceptionals(struct address_space *mapping,
+ struct folio_batch *fbatch, pgoff_t *indices)
{
int i, j;
bool dax;
if (shmem_mapping(mapping))
return;
- for (j = 0; j < pagevec_count(pvec); j++)
- if (xa_is_value(pvec->pages[j]))
+ for (j = 0; j < folio_batch_count(fbatch); j++)
+ if (xa_is_value(fbatch->folios[j]))
break;
- if (j == pagevec_count(pvec))
+ if (j == folio_batch_count(fbatch))
return;
dax = dax_mapping(mapping);
xa_lock_irq(&mapping->i_pages);
}
- for (i = j; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
+ for (i = j; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
pgoff_t index = indices[i];
- if (!xa_is_value(page)) {
- pvec->pages[j++] = page;
+ if (!xa_is_value(folio)) {
+ fbatch->folios[j++] = folio;
continue;
}
continue;
}
- __clear_shadow_entry(mapping, index, page);
+ __clear_shadow_entry(mapping, index, folio);
}
if (!dax) {
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
- pvec->nr = j;
+ fbatch->nr = j;
}
/*
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void truncate_cleanup_page(struct page *page)
+static void truncate_cleanup_folio(struct folio *folio)
{
- if (page_mapped(page))
- unmap_mapping_page(page);
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
- if (page_has_private(page))
- do_invalidatepage(page, 0, thp_size(page));
+ if (folio_has_private(folio))
+ do_invalidatepage(&folio->page, 0, folio_size(folio));
/*
* Some filesystems seem to re-dirty the page even after
* the VM has canceled the dirty bit (eg ext3 journaling).
* Hence dirty accounting check is placed after invalidation.
*/
- cancel_dirty_page(page);
- ClearPageMappedToDisk(page);
+ folio_cancel_dirty(folio);
+ folio_clear_mappedtodisk(folio);
}
/*
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
- int ret;
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
- ret = remove_mapping(mapping, page);
-
- return ret;
+ return remove_mapping(mapping, page);
}
-int truncate_inode_page(struct address_space *mapping, struct page *page)
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
- VM_BUG_ON_PAGE(PageTail(page), page);
-
- if (page->mapping != mapping)
+ if (folio->mapping != mapping)
return -EIO;
- truncate_cleanup_page(page);
- delete_from_page_cache(page);
+ truncate_cleanup_folio(folio);
+ filemap_remove_folio(folio);
return 0;
}
+/*
+ * Handle partial folios. The folio may be entirely within the
+ * range if a split has raced with us. If not, we zero the part of the
+ * folio that's within the [start, end] range, and then split the folio if
+ * it's large. split_huge_page() will discard pages which now lie beyond
+ * i_size, and we rely on the caller to discard pages which lie within a
+ * newly created hole.
+ *
+ * Returns false if splitting failed so the caller can avoid
+ * discarding the entire folio which is stubbornly unsplit.
+ */
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
+{
+ loff_t pos = folio_pos(folio);
+ unsigned int offset, length;
+
+ if (pos < start)
+ offset = start - pos;
+ else
+ offset = 0;
+ length = folio_size(folio);
+ if (pos + length <= (u64)end)
+ length = length - offset;
+ else
+ length = end + 1 - pos - offset;
+
+ folio_wait_writeback(folio);
+ if (length == folio_size(folio)) {
+ truncate_inode_folio(folio->mapping, folio);
+ return true;
+ }
+
+ /*
+ * We may be zeroing pages we're about to discard, but it avoids
+ * doing a complex calculation here, and then doing the zeroing
+ * anyway if the page split fails.
+ */
+ folio_zero_range(folio, offset, length);
+
+ cleancache_invalidate_page(folio->mapping, &folio->page);
+ if (folio_has_private(folio))
+ do_invalidatepage(&folio->page, offset, length);
+ if (!folio_test_large(folio))
+ return true;
+ if (split_huge_page(&folio->page) == 0)
+ return true;
+ if (folio_test_dirty(folio))
+ return false;
+ truncate_inode_folio(folio->mapping, folio);
+ return true;
+}
+
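To make the offset/length arithmetic above concrete (the numbers are hypothetical): for a 64 KiB folio at pos 0, a call with start = 4096 and end = 40959 gives offset = 4096; since pos + folio_size(folio) (65536) exceeds end, length = end + 1 - pos - offset = 36864, so bytes [4096, 40960) are zeroed before the split is attempted. Had end been 131071, beyond the folio, the first branch would apply and length = 65536 - 4096 = 61440, zeroing from offset through the end of the folio.
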
/*
* Used to get rid of pages on hardware memory corruption.
*/
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
if (!mapping)
return -EINVAL;
/*
*/
if (!S_ISREG(mapping->host->i_mode))
return -EIO;
- return truncate_inode_page(mapping, page);
+ return truncate_inode_folio(mapping, page_folio(page));
}
EXPORT_SYMBOL(generic_error_remove_page);
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
- unsigned int partial_start; /* inclusive */
- unsigned int partial_end; /* exclusive */
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
+ struct folio *folio;
+ bool same_folio;
if (mapping_empty(mapping))
goto out;
- /* Offsets within partial pages */
- partial_start = lstart & (PAGE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_SIZE - 1);
-
/*
* 'start' and 'end' always covers the range of pages to be fully
* truncated. Partial pages are covered with 'partial_start' at the
else
end = (lend + 1) >> PAGE_SHIFT;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
- &pvec, indices)) {
- index = indices[pagevec_count(&pvec) - 1] + 1;
- truncate_exceptional_pvec_entries(mapping, &pvec, indices);
- for (i = 0; i < pagevec_count(&pvec); i++)
- truncate_cleanup_page(pvec.pages[i]);
- delete_from_page_cache_batch(mapping, &pvec);
- for (i = 0; i < pagevec_count(&pvec); i++)
- unlock_page(pvec.pages[i]);
- pagevec_release(&pvec);
+ &fbatch, indices)) {
+ index = indices[folio_batch_count(&fbatch) - 1] + 1;
+ truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ truncate_cleanup_folio(fbatch.folios[i]);
+ delete_from_page_cache_batch(mapping, &fbatch);
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ folio_unlock(fbatch.folios[i]);
+ folio_batch_release(&fbatch);
cond_resched();
}
- if (partial_start) {
- struct page *page = find_lock_page(mapping, start - 1);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- /* Truncation within a single page */
- top = partial_end;
- partial_end = 0;
- }
- wait_on_page_writeback(page);
- zero_user_segment(page, partial_start, top);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial_start,
- top - partial_start);
- unlock_page(page);
- put_page(page);
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
+ if (folio) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = find_lock_page(mapping, end);
- if (page) {
- wait_on_page_writeback(page);
- zero_user_segment(page, 0, partial_end);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, 0,
- partial_end);
- unlock_page(page);
- put_page(page);
- }
+
+ if (!same_folio)
+ folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
+ FGP_LOCK, 0);
+ if (folio) {
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- /*
- * If the truncation happened within a single page no pages
- * will be released, just zeroed, so we can bail out now.
- */
- if (start >= end)
- goto out;
index = start;
- for ( ; ; ) {
+ while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &pvec,
+ if (!find_get_entries(mapping, index, end - 1, &fbatch,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
continue;
}
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- lock_page(page);
- WARN_ON(page_to_index(page) != index);
- wait_on_page_writeback(page);
- truncate_inode_page(mapping, page);
- unlock_page(page);
+ folio_lock(folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ folio_wait_writeback(folio);
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
+ index = folio_index(folio) + folio_nr_pages(folio) - 1;
}
- truncate_exceptional_pvec_entries(mapping, &pvec, indices);
- pagevec_release(&pvec);
+ truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+ folio_batch_release(&fbatch);
index++;
}
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
- pagevec_init(&pvec);
- while (find_lock_entries(mapping, index, end, &pvec, indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ folio_batch_init(&fbatch);
+ while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct page *page = &fbatch.folios[i]->page;
/* We rely upon deletion not changing page->index */
index = indices[i];
}
count += ret;
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}
* shrink_page_list() has a temp ref on them, or because they're transiently
* sitting in the lru_cache_add() pagevecs.
*/
-static int
-invalidate_complete_page2(struct address_space *mapping, struct page *page)
+static int invalidate_complete_folio2(struct address_space *mapping,
+ struct folio *folio)
{
- if (page->mapping != mapping)
+ if (folio->mapping != mapping)
return 0;
- if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
return 0;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
goto failed;
- BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
+ BUG_ON(folio_has_private(folio));
+ __filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(page);
-
- put_page(page); /* pagecache ref */
+ filemap_free_folio(mapping, folio);
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
return 0;
}
-static int do_launder_page(struct address_space *mapping, struct page *page)
+static int do_launder_folio(struct address_space *mapping, struct folio *folio)
{
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
return 0;
- if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL)
return 0;
- return mapping->a_ops->launder_page(page);
+ return mapping->a_ops->launder_page(&folio->page);
}
/**
pgoff_t start, pgoff_t end)
{
pgoff_t indices[PAGEVEC_SIZE];
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index;
int i;
int ret = 0;
if (mapping_empty(mapping))
goto out;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
- while (find_get_entries(mapping, index, end, &pvec, indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ while (find_get_entries(mapping, index, end, &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
- index, page))
+ index, folio))
ret = -EBUSY;
continue;
}
- if (!did_range_unmap && page_mapped(page)) {
+ if (!did_range_unmap && folio_mapped(folio)) {
/*
- * If page is mapped, before taking its lock,
+ * If folio is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
unmap_mapping_pages(mapping, index,
did_range_unmap = 1;
}
- lock_page(page);
- WARN_ON(page_to_index(page) != index);
- if (page->mapping != mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
continue;
}
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (page_mapped(page))
- unmap_mapping_page(page);
- BUG_ON(page_mapped(page));
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
+ BUG_ON(folio_mapped(folio));
- ret2 = do_launder_page(mapping, page);
+ ret2 = do_launder_folio(mapping, folio);
if (ret2 == 0) {
- if (!invalidate_complete_page2(mapping, page))
+ if (!invalidate_complete_folio2(mapping, folio))
ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}