3.10 /proc/<pid>/timerslack_ns - Task timerslack value
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
3.12 /proc/<pid>/arch_status - Task architecture specific information
+ 3.13 /proc/<pid>/fd - List of symlinks to open files
4 Configuring procfs
4.1 Mount options
Ngid NUMA group ID (0 if none)
Pid process id
PPid process id of the parent process
- TracerPid PID of process tracing this process (0 if not)
+ TracerPid PID of process tracing this process (0 if not, or
+ the tracer is outside of the current pid namespace)
Uid Real, effective, saved set, and file system UIDs
Gid Real, effective, saved set, and file system GIDs
FDSize number of file descriptor slots currently allocated
The "pathname" shows the name associated file for this mapping. If the mapping
is not associated with a file:
- ============= ====================================
+ =================== ===========================================
[heap] the heap of the program
[stack] the stack of the main process
[vdso] the "virtual dynamic shared object",
the kernel system call handler
- [anon:<name>] an anonymous mapping that has been
+ [anon:<name>] a private anonymous mapping that has been
named by userspace
- ============= ====================================
+ [anon_shmem:<name>] an anonymous shared memory mapping that has
+ been named by userspace
+ =================== ===========================================
or if empty, the mapping is anonymous.
the task is unlikely an AVX512 user, but depends on the workload and the
scheduling scenario, it also could be a false negative mentioned above.
+3.13 /proc/<pid>/fd - List of symlinks to open files
+-------------------------------------------------------
+This directory contains symbolic links which represent open files
+the process is maintaining. Example output::
+
+ lr-x------ 1 root root 64 Sep 20 17:53 0 -> /dev/null
+ l-wx------ 1 root root 64 Sep 20 17:53 1 -> /dev/null
+ lrwx------ 1 root root 64 Sep 20 17:53 10 -> 'socket:[12539]'
+ lrwx------ 1 root root 64 Sep 20 17:53 11 -> 'socket:[12540]'
+ lrwx------ 1 root root 64 Sep 20 17:53 12 -> 'socket:[12542]'
+
+The number of open files for the process is stored in 'size' member
+of stat() output for /proc/<pid>/fd for fast access.
+-------------------------------------------------------
+
+
Chapter 4: Configuring procfs
=============================
F: Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
F: drivers/media/platform/sunxi/sun4i-csi/
+ALLWINNER A31 CSI DRIVER
+S: Maintained
+T: git git://linuxtv.org/media_tree.git
+F: Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
+F: drivers/media/platform/sunxi/sun6i-csi/
+
+ALLWINNER A31 ISP DRIVER
+S: Maintained
+T: git git://linuxtv.org/media_tree.git
+F: Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml
+F: drivers/staging/media/sunxi/sun6i-isp/
+F: drivers/staging/media/sunxi/sun6i-isp/uapi/sun6i-isp-config.h
+
ALLWINNER A31 MIPI CSI-2 BRIDGE DRIVER
F: Documentation/hid/amd-sfh*
F: drivers/hid/amd-sfh-hid/
+AMLOGIC DDR PMU DRIVER
+S: Supported
+W: http://www.amlogic.com
+F: Documentation/admin-guide/perf/meson-ddr-pmu.rst
+F: Documentation/devicetree/bindings/perf/amlogic,g12-ddr-pmu.yaml
+F: drivers/perf/amlogic/
+F: include/soc/amlogic/
+
AMPHION VPU CODEC V4L2 DRIVER
S: Maintained
-F: Documentation/devicetree/bindings/memory-controllers/arm,pl353-smc.yaml
+F: Documentation/devicetree/bindings/memory-controllers/arm,pl35x-smc.yaml
F: drivers/memory/pl353-smc.c
ARM PRIMECELL CLCD PL110 DRIVER
F: Documentation/devicetree/bindings/arm/apple.yaml
F: Documentation/devicetree/bindings/arm/apple/*
F: Documentation/devicetree/bindings/clock/apple,nco.yaml
+F: Documentation/devicetree/bindings/cpufreq/apple,cluster-cpufreq.yaml
F: Documentation/devicetree/bindings/dma/apple,admac.yaml
F: Documentation/devicetree/bindings/i2c/apple,i2c.yaml
F: Documentation/devicetree/bindings/interrupt-controller/apple,*
F: Documentation/devicetree/bindings/iommu/apple,dart.yaml
F: Documentation/devicetree/bindings/iommu/apple,sart.yaml
F: Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml
+F: Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml
F: Documentation/devicetree/bindings/nvme/apple,nvme-ans.yaml
F: Documentation/devicetree/bindings/nvmem/apple,efuses.yaml
F: Documentation/devicetree/bindings/pci/apple,pcie.yaml
F: Documentation/devicetree/bindings/power/apple*
F: Documentation/devicetree/bindings/watchdog/apple,wdt.yaml
F: arch/arm64/boot/dts/apple/
+F: drivers/bluetooth/hci_bcm4377.c
F: drivers/clk/clk-apple-nco.c
+F: drivers/cpufreq/apple-soc-cpufreq.c
F: drivers/dma/apple-admac.c
F: drivers/i2c/busses/i2c-pasemi-core.c
F: drivers/i2c/busses/i2c-pasemi-platform.c
S: Supported
W: http://www.hisilicon.com
-T: git git://github.com/hisilicon/linux-hisi.git
+T: git https://github.com/hisilicon/linux-hisi.git
F: arch/arm/boot/dts/hi3*
F: arch/arm/boot/dts/hip*
F: arch/arm/boot/dts/hisi*
F: drivers/crypto/ixp4xx_crypto.c
F: drivers/gpio/gpio-ixp4xx.c
F: drivers/irqchip/irq-ixp4xx.c
-F: include/linux/irqchip/irq-ixp4xx.h
-F: include/linux/platform_data/timer-ixp4xx.h
ARM/INTEL KEEMBAY ARCHITECTURE
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git
+F: Documentation/devicetree/bindings/arm/marvell/marvell,dove.txt
+F: Documentation/devicetree/bindings/arm/marvell/marvell,orion5x.txt
F: Documentation/devicetree/bindings/soc/dove/
F: arch/arm/boot/dts/dove*
F: arch/arm/boot/dts/orion5x*
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git
+F: Documentation/devicetree/bindings/arm/marvell/
F: arch/arm/boot/dts/armada*
F: arch/arm/boot/dts/kirkwood*
F: arch/arm/configs/mvebu_*_defconfig
S: Supported
T: git git://github.com/microchip-ung/linux-upstream.git
F: arch/arm64/boot/dts/microchip/
+F: drivers/net/ethernet/microchip/vcap/
F: drivers/pinctrl/pinctrl-microchip-sgpio.c
N: sparx5
ARM/QUALCOMM SUPPORT
-R: Konrad Dybcio <konrad.dybcio@somainline.org>
+R: Konrad Dybcio <konrad.dybcio@linaro.org>
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git
F: arch/arm/mach-realtek/
F: arch/arm64/boot/dts/realtek/
-ARM/RENESAS ARCHITECTURE
+ARM/RISC-V/RENESAS ARCHITECTURE
Q: http://patchwork.kernel.org/project/linux-renesas-soc/list/
C: irc://irc.libera.chat/renesas-soc
T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
-F: Documentation/devicetree/bindings/arm/renesas.yaml
F: Documentation/devicetree/bindings/hwinfo/renesas,prr.yaml
F: Documentation/devicetree/bindings/soc/renesas/
F: arch/arm/boot/dts/emev2*
F: arch/arm/include/debug/renesas-scif.S
F: arch/arm/mach-shmobile/
F: arch/arm64/boot/dts/renesas/
+F: arch/riscv/boot/dts/renesas/
F: drivers/soc/renesas/
F: include/linux/soc/renesas/
S: Supported
W: http://ceph.com/
-T: git git://github.com/ceph/ceph-client.git
+T: git https://github.com/ceph/ceph-client.git
F: include/linux/ceph/
F: include/linux/crush/
F: net/ceph/
S: Supported
W: http://ceph.com/
-T: git git://github.com/ceph/ceph-client.git
+T: git https://github.com/ceph/ceph-client.git
F: Documentation/filesystems/ceph.rst
F: fs/ceph/
CHROMEOS EC CODEC DRIVER
-M: Tzung-Bi Shih <tzungbi@google.com>
+M: Tzung-Bi Shih <tzungbi@kernel.org>
S: Maintained
F: drivers/platform/chrome/cros_usbpd_notify.c
F: include/linux/platform_data/cros_usbpd_notify.h
+CHROMEOS HPS DRIVER
+S: Maintained
+F: drivers/platform/chrome/cros_hps_i2c.c
+
CHRONTEL CH7322 CEC DRIVER
S: Maintained
F: sound/pci/cs5535audio/
-CSI DRIVERS FOR ALLWINNER V3s
-S: Maintained
-T: git git://linuxtv.org/media_tree.git
-F: Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
-F: drivers/media/platform/sunxi/sun6i-csi/
-
CTU CAN FD DRIVER
CXGB4 CRYPTO DRIVER (chcr)
S: Supported
W: http://www.chelsio.com
CXGB4 INLINE CRYPTO DRIVER
S: Supported
W: http://www.chelsio.com
S: Maintained
F: drivers/platform/x86/dell/dell-wmi-descriptor.c
+DELL WMI DDV DRIVER
+S: Maintained
+F: Documentation/ABI/testing/debugfs-dell-wmi-ddv
+F: Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv
+F: drivers/platform/x86/dell/dell-wmi-ddv.c
+
DELL WMI SYSMAN DRIVER
F: include/uapi/linux/devlink.h
F: net/core/devlink.c
-DH ELECTRONICS IMX6 DHCOM BOARD SUPPORT
+DH ELECTRONICS IMX6 DHCOM/DHCOR BOARD SUPPORT
S: Maintained
F: arch/arm/boot/dts/imx6*-dhcom-*
+F: arch/arm/boot/dts/imx6*-dhcor-*
DH ELECTRONICS STM32MP1 DHCOM/DHCOR BOARD SUPPORT
F: drivers/net/ethernet/freescale/dpaa2/Makefile
F: drivers/net/ethernet/freescale/dpaa2/dpaa2-eth*
F: drivers/net/ethernet/freescale/dpaa2/dpaa2-mac*
+F: drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk*
F: drivers/net/ethernet/freescale/dpaa2/dpkg.h
F: drivers/net/ethernet/freescale/dpaa2/dpmac*
F: drivers/net/ethernet/freescale/dpaa2/dpni*
F: drivers/gpu/drm/i810/
F: include/uapi/drm/i810_drm.h
+DRM DRIVER FOR JADARD JD9365DA-H3 MIPI-DSI LCD PANELS
+S: Maintained
+F: Documentation/devicetree/bindings/display/panel/jadard,jd9365da-h3.yaml
+F: drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c
+
DRM DRIVER FOR LOGICVC DISPLAY CONTROLLER
S: Supported
S: Maintained
T: git git://anongit.freedesktop.org/drm/drm-misc
F: drivers/gpu/drm/drm_aperture.c
+F: drivers/gpu/drm/tiny/ofdrm.c
F: drivers/gpu/drm/tiny/simpledrm.c
F: drivers/video/aperture.c
+F: drivers/video/nomodeset.c
F: include/drm/drm_aperture.h
F: include/linux/aperture.h
+F: include/video/nomodeset.h
DRM DRIVER FOR SIS VIDEO CARDS
S: Orphan / Obsolete
F: include/linux/vga*
F: include/uapi/drm/drm*
+DRM COMPUTE ACCELERATORS DRIVERS AND FRAMEWORK
+S: Maintained
+C: irc://irc.oftc.net/dri-devel
+T: git https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/accel.git
+F: Documentation/accel/
+F: drivers/accel/
+
DRM DRIVERS FOR ALLWINNER A10
F: include/drm/ttm/
DRM GPU SCHEDULER
-M: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+M: Luben Tuikov <luben.tuikov@amd.com>
S: Maintained
T: git git://anongit.freedesktop.org/drm/drm-misc
EDAC-CORE
S: Supported
F: drivers/edac/pnd2_edac.[ch]
EDAC-QCOM
S: Maintained
S: Maintained
+F: Documentation/networking/devlink/etas_es58x.rst
F: drivers/net/can/usb/etas_es58x/
ETHERNET BRIDGE
F: arch/*/include/asm/efi.h
F: arch/*/kernel/efi.c
F: arch/arm/boot/compressed/efi-header.S
-F: arch/arm64/kernel/efi-entry.S
F: arch/x86/platform/efi/
F: drivers/firmware/efi/
F: include/linux/efi*.h
F: drivers/i2c/busses/i2c-cpm.c
FREESCALE IMX / MXC FEC DRIVER
S: Maintained
F: Documentation/devicetree/bindings/net/fsl,fec.yaml
F: include/uapi/asm-generic/
GENERIC PHY FRAMEWORK
S: Supported
Q: https://patchwork.kernel.org/project/linux-phy/list/
S: Maintained
+F: Documentation/devicetree/bindings/leds/irled/gpio-ir-tx.yaml
F: drivers/media/rc/gpio-ir-tx.c
GPIO MOCKUP DRIVER
F: Documentation/scsi/hptiop.rst
F: drivers/scsi/hptiop.c
+HIMAX HX83112B TOUCHSCREEN SUPPORT
+S: Maintained
+F: Documentation/devicetree/bindings/input/touchscreen/himax,hx83112b.yaml
+F: drivers/input/touchscreen/himax_hx83112b.c
+
HIPPI
HISILICON PMU DRIVER
-M: Qi Liu <liuqi115@huawei.com>
+M: Jonathan Cameron <jonathan.cameron@huawei.com>
S: Supported
W: http://www.hisilicon.com
F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst
F: drivers/crypto/hisilicon/trng/trng.c
HISILICON V3XX SPI NOR FLASH Controller Driver
-M: John Garry <john.garry@huawei.com>
+M: Jay Fang <f.fangjian@huawei.com>
S: Maintained
W: http://www.hisilicon.com
F: drivers/spi/spi-hisi-sfc-v3xx.c
HP COMPAQ TC1100 TABLET WMI EXTRAS DRIVER
S: Orphan
-F: drivers/platform/x86/tc1100-wmi.c
+F: drivers/platform/x86/hp/tc1100-wmi.c
HPET: High Precision Event Timers driver
F: drivers/iio/humidity/hts221*
HUAWEI ETHERNET DRIVER
-S: Orphan
+S: Maintained
F: Documentation/networking/device_drivers/ethernet/huawei/hinic.rst
F: drivers/net/ethernet/huawei/hinic/
Hyper-V/Azure CORE AND DRIVERS
F: include/asm-generic/mshyperv.h
F: include/clocksource/hyperv_timer.h
F: include/linux/hyperv.h
+F: include/net/mana
F: include/uapi/linux/hyperv.h
F: net/vmw_vsock/hyperv_transport.c
F: tools/hv/
F: drivers/hwmon/ina2xx.c
F: include/linux/platform_data/ina2xx.h
+INDEX OF FURTHER KERNEL DOCUMENTATION
+S: Maintained
+F: Documentation/process/kernel-docs.rst
+
INDUSTRY PACK SUBSYSTEM (IPACK)
F: include/rdma/
F: include/trace/events/ib_mad.h
F: include/trace/events/ib_umad.h
+F: include/trace/misc/rdma.h
F: include/uapi/linux/if_infiniband.h
F: include/uapi/rdma/
F: samples/bpf/ibumad_kern.c
B: https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs
C: irc://irc.oftc.net/intel-gfx
T: git git://anongit.freedesktop.org/drm-intel
+F: Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon
F: Documentation/gpu/i915.rst
F: drivers/gpu/drm/i915/
F: include/drm/i915*
F: drivers/isdn/hardware/
F: drivers/isdn/mISDN/
+ISOFS FILESYSTEM
+S: Maintained
+F: Documentation/filesystems/isofs.rst
+F: fs/isofs/
+
IT87 HARDWARE MONITORING DRIVER
JFS FILESYSTEM
-S: Maintained
+S: Odd Fixes
W: http://jfs.sourceforge.net/
-T: git git://github.com/kleikamp/linux-shaggy.git
+T: git https://github.com/kleikamp/linux-shaggy.git
F: Documentation/admin-guide/jfs.rst
F: fs/jfs/
S: Maintained
+Q: https://patchwork.kernel.org/project/linux-kbuild/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kconfig
F: Documentation/kbuild/kconfig*
F: scripts/Kconfig.include
KERNEL BUILD + files below scripts/ (unless maintained elsewhere)
S: Maintained
+Q: https://patchwork.kernel.org/project/linux-kbuild/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git
F: Documentation/kbuild/
F: Makefile
S: Supported
W: http://nfs.sourceforge.net/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git
+F: fs/exportfs/
F: fs/lockd/
F: fs/nfs_common/
F: fs/nfsd/
F: include/linux/lockd/
F: include/linux/sunrpc/
+F: include/trace/events/rpcgss.h
+F: include/trace/events/rpcrdma.h
+F: include/trace/events/sunrpc.h
+F: include/trace/misc/fs.h
+F: include/trace/misc/nfs.h
+F: include/trace/misc/sunrpc.h
F: include/uapi/linux/nfsd/
F: include/uapi/linux/sunrpc/
F: net/sunrpc/
S: Maintained
F: Documentation/misc-devices/lis3lv02d.rst
F: drivers/misc/lis3lv02d/
-F: drivers/platform/x86/hp_accel.c
+F: drivers/platform/x86/hp/hp_accel.c
LIST KUNIT TEST
F: Documentation/loongarch/
F: Documentation/translations/zh_CN/loongarch/
+LOONGSON-2 SOC SERIES GUTS DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/hwinfo/loongson,ls2k-chipid.yaml
+F: drivers/soc/loongson/loongson2_guts.c
+
+LOONGSON-2 SOC SERIES PINCTRL DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/pinctrl/loongson,ls2k-pinctrl.yaml
+F: drivers/pinctrl/pinctrl-loongson2.c
+
LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
S: Maintained
W: http://linux-test-project.github.io/
-T: git git://github.com/linux-test-project/ltp.git
+T: git https://github.com/linux-test-project/ltp.git
LYNX 28G SERDES PHY DRIVER
MARVELL 88E6XXX ETHERNET SWITCH FABRIC DRIVER
S: Maintained
F: Documentation/devicetree/bindings/net/dsa/marvell.txt
S: Maintained
-F: Documentation/devicetree/bindings/net/marvell-pp2.txt
+F: Documentation/devicetree/bindings/net/marvell,pp2.yaml
F: drivers/net/ethernet/marvell/mvpp2/
MARVELL MWIFIEX WIRELESS DRIVER
F: drivers/net/ethernet/marvell/octeontx2/af/
MARVELL PRESTERA ETHERNET SWITCH DRIVER
S: Supported
W: https://github.com/Marvell-switching/switchdev-prestera
F: drivers/net/ethernet/marvell/prestera/
F: Documentation/devicetree/bindings/media/nxp,imx-mipi-csi2.yaml
F: Documentation/devicetree/bindings/media/nxp,imx7-csi.yaml
F: drivers/media/platform/nxp/imx-mipi-csis.c
-F: drivers/staging/media/imx/imx7-media-csi.c
+F: drivers/media/platform/nxp/imx7-media-csi.c
MEDIA DRIVERS FOR HELENE
S: Maintained
F: drivers/net/ethernet/mediatek/
F: include/linux/mm.h
F: include/linux/mmzone.h
F: include/linux/pagewalk.h
- F: include/linux/vmalloc.h
F: mm/
F: tools/testing/selftests/vm/
+ VMALLOC
+ S: Maintained
+ W: http://www.linux-mm.org
+ T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+ F: include/linux/vmalloc.h
+ F: mm/vmalloc.c
+
MEMORY HOT(UN)PLUG
S: Maintained
-F: Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt
+F: Documentation/devicetree/bindings/mtd/amlogic,meson-nand.yaml
F: drivers/mtd/nand/raw/meson_*
MESON VIDEO DECODER DRIVER FOR AMLOGIC SOCS
S: Supported
F: Documentation/devicetree/bindings/media/microchip,csi2dc.yaml
-F: drivers/media/platform/atmel/microchip-csi2dc.c
+F: drivers/media/platform/microchip/microchip-csi2dc.c
MICROCHIP ECC DRIVER
S: Supported
F: Documentation/devicetree/bindings/media/atmel,isc.yaml
F: Documentation/devicetree/bindings/media/microchip,xisc.yaml
-F: drivers/media/platform/atmel/atmel-isc*
-F: drivers/media/platform/atmel/atmel-sama*-isc*
+F: drivers/staging/media/deprecated/atmel/atmel-isc*
+F: drivers/staging/media/deprecated/atmel/atmel-sama*-isc*
+F: drivers/media/platform/microchip/microchip-isc*
+F: drivers/media/platform/microchip/microchip-sama*-isc*
F: include/linux/atmel-isc-media.h
MICROCHIP ISI DRIVER
F: drivers/misc/atmel-ssc.c
F: include/linux/atmel-ssc.h
+MICROCHIP SOC DRIVERS
+S: Supported
+T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/
+F: drivers/soc/microchip/
+
MICROCHIP USB251XB DRIVER
MOTORCOMM PHY DRIVER
S: Maintained
F: drivers/net/phy/motorcomm.c
NETWORKING [DSA]
S: Maintained
T: git git://linuxtv.org/media_tree.git
F: drivers/media/i2c/ov08d10.c
+OMNIVISION OV08X40 SENSOR DRIVER
+S: Maintained
+T: git git://linuxtv.org/media_tree.git
+F: drivers/media/i2c/ov08x40.c
+
OMNIVISION OV13858 SENSOR DRIVER
T: git git://linuxtv.org/media_tree.git
F: drivers/media/i2c/ov2740.c
+OMNIVISION OV4689 SENSOR DRIVER
+S: Maintained
+T: git git://linuxtv.org/media_tree.git
+F: Documentation/devicetree/bindings/media/i2c/ovti,ov4689.yaml
+F: drivers/media/i2c/ov5647.c
+
OMNIVISION OV5640 SENSOR DRIVER
F: drivers/mtd/nand/onenand/
F: include/linux/mtd/onenand*.h
+ONEXPLAYER FAN DRIVER
+S: Maintained
+F: drivers/hwmon/oxp-sensors.c
+
ONION OMEGA2+ BOARD
B: https://bugzilla.kernel.org
C: irc://irc.oftc.net/linux-pci
T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git
+F: Documentation/devicetree/bindings/pci/
F: drivers/pci/controller/
F: drivers/pci/pci-bridge-emul.c
F: drivers/pci/pci-bridge-emul.h
F: drivers/pci/controller/*microchip*
PCIE DRIVER FOR QUALCOMM MSM
S: Maintained
F: include/linux/peci.h
PENSANDO ETHERNET DRIVERS
S: Supported
S: Maintained
F: Documentation/devicetree/bindings/pinctrl/mediatek,mt65xx-pinctrl.yaml
-F: Documentation/devicetree/bindings/pinctrl/mediatek,mt6797-pinctrl.yaml
+F: Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml
F: Documentation/devicetree/bindings/pinctrl/mediatek,mt7622-pinctrl.yaml
F: Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml
F: drivers/pinctrl/mediatek/
F: Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml
F: drivers/input/keyboard/pinephone-keyboard.c
-PKTCDVD DRIVER
-S: Orphan
-F: drivers/block/pktcdvd.c
-F: include/linux/pktcdvd.h
-F: include/uapi/linux/pktcdvd.h
-
PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
S: Maintained
PSTORE FILESYSTEM
-M: Colin Cross <ccross@android.com>
-S: Maintained
+R: Guilherme G. Piccoli <gpiccoli@igalia.com>
+S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/pstore
F: Documentation/admin-guide/ramoops.rst
F: Documentation/admin-guide/pstore-blk.rst
S: Maintained
T: git git://linuxtv.org/media_tree.git
-F: Documentation/admin-guide/media/pulse8-cec.rst
F: drivers/media/cec/usb/pulse8/
PURELIFI PLFXLC DRIVER
S: Maintained
+F: Documentation/devicetree/bindings/leds/irled/pwm-ir-tx.yaml
F: drivers/media/rc/pwm-ir-tx.c
PWM SUBSYSTEM
S: Supported
-F: Documentation/devicetree/bindings/soc/qcom/qcom,apr.yaml
+F: Documentation/devicetree/bindings/soc/qcom/qcom,apr*
F: Documentation/devicetree/bindings/sound/qcom,*
F: drivers/soc/qcom/apr.c
F: include/dt-bindings/sound/qcom,wcd9335.h
F: drivers/thermal/qcom/
QUALCOMM VENUS VIDEO ACCELERATOR DRIVER
S: Maintained
S: Supported
W: http://ceph.com/
-T: git git://github.com/ceph/ceph-client.git
+T: git https://github.com/ceph/ceph-client.git
F: Documentation/ABI/testing/sysfs-bus-rbd
F: drivers/block/rbd.c
F: drivers/block/rbd_types.h
F: drivers/net/wireless/realtek/rtw89/
REDPINE WIRELESS DRIVER
-S: Maintained
+S: Orphan
F: drivers/net/wireless/rsi/
REGISTER MAP ABSTRACTION
N: riscv
K: riscv
-RISC-V/MICROCHIP POLARFIRE SOC SUPPORT
+RISC-V MICROCHIP FPGA SUPPORT
F: arch/riscv/boot/dts/microchip/
F: drivers/char/hw_random/mpfs-rng.c
F: drivers/clk/microchip/clk-mpfs.c
-F: drivers/i2c/busses/i2c-microchip-core.c
+F: drivers/i2c/busses/i2c-microchip-corei2c.c
F: drivers/mailbox/mailbox-mpfs.c
F: drivers/pci/controller/pcie-microchip-host.c
F: drivers/reset/reset-mpfs.c
F: drivers/rtc/rtc-mpfs.c
-F: drivers/soc/microchip/
+F: drivers/soc/microchip/mpfs-sys-controller.c
F: drivers/spi/spi-microchip-core-qspi.c
F: drivers/spi/spi-microchip-core.c
F: drivers/usb/musb/mpfs.c
F: include/soc/microchip/mpfs.h
+RISC-V MISC SOC SUPPORT
+S: Maintained
+Q: https://patchwork.kernel.org/project/linux-riscv/list/
+T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/
+F: Documentation/devicetree/bindings/riscv/
+F: arch/riscv/boot/dts/
+
RNBD BLOCK DRIVERS
S: Maintained
F: drivers/video/fbdev/savage/
-S390
+S390 ARCHITECTURE
S: Supported
F: drivers/s390/net/
+S390 MM
+S: Supported
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git
+F: arch/s390/include/asm/pgtable.h
+F: arch/s390/mm
+
S390 PCI SUBSYSTEM
K: \bTIF_SECCOMP\b
SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) Broadcom BRCMSTB DRIVER
S: Supported
+F: Documentation/devicetree/bindings/mmc/sdhci-common.yaml
F: drivers/mmc/host/sdhci*
SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) MICROCHIP DRIVER
S: Supported
-T: git https://github.com/sifive/riscv-linux.git
N: sifive
K: [^@]sifive
F: Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml
F: drivers/dma/sf-pdma/
+SIFIVE SOC DRIVERS
+S: Maintained
+T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/
+F: drivers/soc/sifive/
+
SILEAD TOUCHSCREEN DRIVER
F: include/video/sisfb.h
SIS I2C TOUCHSCREEN DRIVER
-M: Mika Penttilä <mika.penttila@nextfour.com>
+M: Mika Penttilä <mpenttil@redhat.com>
S: Maintained
F: Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt
S: Maintained
-F: Documentation/devicetree/bindings/net/socionext-netsec.txt
+F: Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml
F: drivers/net/ethernet/socionext/netsec.c
SOCIONEXT (SNI) Synquacer SPI DRIVER
S: Maintained
-F: Documentation/devicetree/bindings/spi/spi-synquacer.txt
+F: Documentation/devicetree/bindings/spi/socionext,synquacer-spi.yaml
F: drivers/spi/spi-synquacer.c
SOCIONEXT SYNQUACER I2C DRIVER
S: Maintained
T: git git://linuxtv.org/media_tree.git
-F: Documentation/devicetree/bindings/media/i2c/imx290.txt
+F: Documentation/devicetree/bindings/media/i2c/sony,imx290.yaml
F: drivers/media/i2c/imx290.c
SONY IMX319 SENSOR DRIVER
Q: http://patchwork.linuxtv.org/project/linux-media/list/
F: drivers/media/dvb-frontends/sp2*
+SPANISH DOCUMENTATION
+S: Maintained
+F: Documentation/translations/sp_SP/
+
SPARC + UltraSPARC (sparc/sparc64)
S: Maintained
T: git git://linuxtv.org/media_tree.git
-F: Documentation/devicetree/bindings/media/i2c/st,st-mipid02.txt
+F: Documentation/devicetree/bindings/media/i2c/st,st-mipid02.yaml
F: drivers/media/i2c/st-mipid02.c
ST STM32 I2C/SMBUS DRIVER
F: Documentation/hwmon/stpddc60.rst
F: drivers/hwmon/pmbus/stpddc60.c
+ST VGXY61 DRIVER
+S: Maintained
+T: git git://linuxtv.org/media_tree.git
+F: Documentation/devicetree/bindings/media/i2c/st,st-vgxy61.yaml
+F: Documentation/userspace-api/media/drivers/st-vgxy61.rst
+F: drivers/media/i2c/st-vgxy61.c
+
ST VL53L0X ToF RANGER(I2C) IIO DRIVER
F: Documentation/process/stable-kernel-rules.rst
STAGING - ATOMISP DRIVER
S: Odd Fixes
F: drivers/net/ethernet/adaptec/starfire*
+STARFIVE DEVICETREES
+S: Maintained
+F: arch/riscv/boot/dts/starfive/
+
STARFIVE JH7100 CLOCK DRIVERS
S: Maintained
F: Documentation/devicetree/bindings/net/sunplus,sp7021-emac.yaml
F: drivers/net/ethernet/sunplus/
+SUNPLUS MMC DRIVER
+S: Maintained
+F: Documentation/devicetree/bindings/mmc/sunplus,mmc.yaml
+F: drivers/mmc/host/sunplus-mmc.c
+
SUNPLUS OCOTP DRIVER
S: Maintained
F: drivers/cpufreq/sc[mp]i-cpufreq.c
F: drivers/firmware/arm_scmi/
F: drivers/firmware/arm_scpi.c
+F: drivers/powercap/arm_scmi_powercap.c
F: drivers/regulator/scmi-regulator.c
F: drivers/reset/reset-scmi.c
F: include/linux/sc[mp]i_protocol.h
S: Maintained
-T: git git://github.com/czankel/xtensa-linux.git
+T: git https://github.com/jcmvbkbc/linux-xtensa.git
F: arch/xtensa/
F: drivers/irqchip/irq-xtensa-*
W: https://wireless.wiki.kernel.org/en/users/Drivers/wl1251
T: git git://git.kernel.org/pub/scm/linux/kernel/git/luca/wl12xx.git
F: drivers/net/wireless/ti/
-F: include/linux/wl12xx.h
TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER
F: include/uapi/linux/virtio_*.h
F: tools/virtio/
+VISL VIRTUAL STATELESS DECODER DRIVER
+S: Supported
+F: drivers/media/test-drivers/visl
+
IFCVF VIRTIO DATA PATH ACCELERATOR
F: drivers/vdpa/ifcvf/
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
#define pgd_clear(pgdp)
- #define kern_addr_valid(addr) (1)
- /* FIXME */
/*
* PMD_SHIFT determines the size of the area a second-level page table can map
* PGDIR_SHIFT determines what a third-level page table entry can map
typedef pte_t *pte_addr_t;
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-#define ZERO_PAGE(vaddr) (virt_to_page(0))
-
/*
* Mark the prot value as uncacheable and unbufferable.
*/
#include <linux/const.h>
#include <asm/proc-fns.h>
+#ifndef __ASSEMBLY__
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern struct page *empty_zero_page;
+#define ZERO_PAGE(vaddr) (empty_zero_page)
+#endif
+
#ifndef CONFIG_MMU
#include <asm-generic/pgtable-nopud.h>
*/
#ifndef __ASSEMBLY__
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr) (empty_zero_page)
-
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
*/
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
- /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
- /* FIXME: this is not correct */
- #define kern_addr_valid(addr) (1)
-
/*
* We provide our own arch_get_unmapped_area to cope with VIPT caches.
*/
static inline phys_addr_t __pte_to_phys(pte_t pte)
{
return (pte_val(pte) & PTE_ADDR_LOW) |
- ((pte_val(pte) & PTE_ADDR_HIGH) << 36);
+ ((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
}
static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
{
- return (phys | (phys >> 36)) & PTE_ADDR_MASK;
+ return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PTE_ADDR_MASK;
}
#else
#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_MASK)
extern pgd_t init_pg_end[];
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
-extern pgd_t idmap_pg_end[];
extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
extern pgd_t reserved_pg_dir[PTRS_PER_PGD];
static inline bool pmd_user_accessible_page(pmd_t pmd)
{
- return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
+ return pmd_leaf(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
}
static inline bool pud_user_accessible_page(pud_t pud)
{
- return pud_present(pud) && pud_user(pud);
+ return pud_leaf(pud) && pud_user(pud);
}
#endif
*/
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
- extern int kern_addr_valid(unsigned long addr);
-
#ifdef CONFIG_ARM64_MTE
#define __HAVE_ARCH_PREPARE_TO_SWAP
}
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+#define ptep_modify_prot_start ptep_modify_prot_start
+extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define ptep_modify_prot_commit ptep_modify_prot_commit
+extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t new_pte);
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_PGTABLE_H */
create_idmap();
}
- /*
- * Check whether a kernel address is valid (derived from arch/x86/).
- */
- int kern_addr_valid(unsigned long addr)
- {
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp, pud;
- pmd_t *pmdp, pmd;
- pte_t *ptep, pte;
-
- addr = arch_kasan_reset_tag(addr);
- if ((((long)addr) >> VA_BITS) != -1UL)
- return 0;
-
- pgdp = pgd_offset_k(addr);
- if (pgd_none(READ_ONCE(*pgdp)))
- return 0;
-
- p4dp = p4d_offset(pgdp, addr);
- if (p4d_none(READ_ONCE(*p4dp)))
- return 0;
-
- pudp = pud_offset(p4dp, addr);
- pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return 0;
-
- if (pud_sect(pud))
- return pfn_valid(pud_pfn(pud));
-
- pmdp = pmd_offset(pudp, addr);
- pmd = READ_ONCE(*pmdp);
- if (pmd_none(pmd))
- return 0;
-
- if (pmd_sect(pmd))
- return pfn_valid(pmd_pfn(pmd));
-
- ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
- if (pte_none(pte))
- return 0;
-
- return pfn_valid(pte_pfn(pte));
- }
-
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_hotplug_page_range(struct page *page, size_t size,
struct vmem_altmap *altmap)
}
#endif
+ void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+ {
+ pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
+ }
+
+ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+ {
+ vmemmap_verify((pte_t *)pmdp, node, addr, next);
+ return 1;
+ }
+
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
- unsigned long addr = start;
- unsigned long next;
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp;
- pmd_t *pmdp;
-
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
- if (!ARM64_KERNEL_USES_PMD_MAPS)
+ if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES))
return vmemmap_populate_basepages(start, end, node, altmap);
-
- do {
- next = pmd_addr_end(addr, end);
-
- pgdp = vmemmap_pgd_populate(addr, node);
- if (!pgdp)
- return -ENOMEM;
-
- p4dp = vmemmap_p4d_populate(pgdp, addr, node);
- if (!p4dp)
- return -ENOMEM;
-
- pudp = vmemmap_pud_populate(p4dp, addr, node);
- if (!pudp)
- return -ENOMEM;
-
- pmdp = pmd_offset(pudp, addr);
- if (pmd_none(READ_ONCE(*pmdp))) {
- void *p = NULL;
-
- p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
- if (!p) {
- if (vmemmap_populate_basepages(addr, next, node, altmap))
- return -ENOMEM;
- continue;
- }
-
- pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
- } else
- vmemmap_verify((pte_t *)pmdp, node, addr, next);
- } while (addr = next, addr != end);
-
- return 0;
+ else
+ return vmemmap_populate_hugepages(start, end, node, altmap);
}
#ifdef CONFIG_MEMORY_HOTPLUG
}
early_initcall(prevent_bootmem_remove_init);
#endif
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+ if (IS_ENABLED(CONFIG_ARM64_WORKAROUND_2645198) &&
+ cpus_have_const_cap(ARM64_WORKAROUND_2645198)) {
+ /*
+ * Break-before-make (BBM) is required for all user space mappings
+ * when the permission changes from executable to non-executable
+ * in cases where cpu is affected with errata #2645198.
+ */
+ if (pte_user_exec(READ_ONCE(*ptep)))
+ return ptep_clear_flush(vma, addr, ptep);
+ }
+ return ptep_get_and_clear(vma->vm_mm, addr, ptep);
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
* mapped at page granularity, so that it is possible to
* protect/unprotect single pages.
*/
- return rodata_full || debug_pagealloc_enabled() ||
+ return (rodata_enabled && rodata_full) || debug_pagealloc_enabled() ||
IS_ENABLED(CONFIG_KFENCE);
}
* If we are manipulating read-only permissions, apply the same
* change to the linear mapping of the pages that back this VM area.
*/
- if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
+ if (rodata_enabled &&
+ rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
pgprot_val(clear_mask) == PTE_RDONLY)) {
for (i = 0; i < area->nr_pages; i++) {
__change_memory_common((u64)page_address(area->pages[i]),
/*
* This function is used to determine if a linear map page has been marked as
- * not-valid. Walk the page table and check the PTE_VALID bit. This is based
- * on kern_addr_valid(), which almost does what we need.
+ * not-valid. Walk the page table and check the PTE_VALID bit.
*
* Because this is only called on the kernel linear map, p?d_sect() implies
* p?d_present(). When debug_pagealloc is enabled, sections mappings are
select ARCH_ENABLE_MEMORY_HOTPLUG
select ARCH_ENABLE_MEMORY_HOTREMOVE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_INLINE_READ_LOCK if !PREEMPTION
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+ select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_NO_INSTR
select BUILDTIME_TABLE_SORT
config ARCH_SPARSEMEM_ENABLE
def_bool y
+ select SPARSEMEM_VMEMMAP_ENABLE
help
Say Y to support efficient handling of sparse physical memory,
for architectures which are either NUMA (Non-Uniform Memory Access)
#include <linux/compiler.h>
#include <asm/addrspace.h>
+ #include <asm/page.h>
#include <asm/pgtable-bits.h>
#if CONFIG_PGTABLE_LEVELS == 2
#include <linux/mm_types.h>
#include <linux/mmzone.h>
#include <asm/fixmap.h>
+ #include <asm/sparsemem.h>
struct mm_struct;
struct vm_area_struct;
#define VMALLOC_START MODULES_END
#define VMALLOC_END \
(vm_map_base + \
- min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE)
+ min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE)
+
+ #define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK))
+ #define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1)
#define pte_ERROR(e) \
pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot))
/*
- * Initialize a new pgd / pmd table with invalid pointers.
+ * Initialize a new pgd / pud / pmd table with invalid pointers.
*/
- extern void pgd_init(unsigned long page);
- extern void pud_init(unsigned long page, unsigned long pagetable);
- extern void pmd_init(unsigned long page, unsigned long pagetable);
+ extern void pgd_init(void *addr);
+ extern void pud_init(void *addr);
+ extern void pmd_init(void *addr);
/*
* Non-present pages: high 40 bits are offset, next 8 bits type,
static inline pte_t pte_mkdirty(pte_t pte)
{
- pte_val(pte) |= (_PAGE_DIRTY | _PAGE_MODIFIED);
+ pte_val(pte) |= _PAGE_MODIFIED;
+ if (pte_val(pte) & _PAGE_WRITE)
+ pte_val(pte) |= _PAGE_DIRTY;
return pte;
}
static inline pte_t pte_mkwrite(pte_t pte)
{
- pte_val(pte) |= (_PAGE_WRITE | _PAGE_DIRTY);
+ pte_val(pte) |= _PAGE_WRITE;
+ if (pte_val(pte) & _PAGE_MODIFIED)
+ pte_val(pte) |= _PAGE_DIRTY;
return pte;
}
__update_tlb(vma, address, (pte_t *)pmdp);
}
- #define kern_addr_valid(addr) (1)
-
static inline unsigned long pmd_pfn(pmd_t pmd)
{
return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT;
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
- pmd_val(pmd) |= (_PAGE_WRITE | _PAGE_DIRTY);
+ pmd_val(pmd) |= _PAGE_WRITE;
+ if (pmd_val(pmd) & _PAGE_MODIFIED)
+ pmd_val(pmd) |= _PAGE_DIRTY;
return pmd;
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- pmd_val(pmd) |= (_PAGE_DIRTY | _PAGE_MODIFIED);
+ pmd_val(pmd) |= _PAGE_MODIFIED;
+ if (pmd_val(pmd) & _PAGE_WRITE)
+ pmd_val(pmd) |= _PAGE_DIRTY;
return pmd;
}
return ret;
pginfo.addr = encl_page->desc & PAGE_MASK;
- pginfo.contents = (unsigned long)kmap_atomic(b.contents);
- pcmd_page = kmap_atomic(b.pcmd);
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
if (secs_page)
*/
pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
- kunmap_atomic(pcmd_page);
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
get_page(b.pcmd);
sgx_encl_put_backing(&b);
if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
- pcmd_page = kmap_atomic(b.pcmd);
+ pcmd_page = kmap_local_page(b.pcmd);
if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
pr_warn("PCMD page not empty after truncate.\n");
- kunmap_atomic(pcmd_page);
+ kunmap_local(pcmd_page);
}
put_page(b.pcmd);
unsigned long addr,
unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
void sgx_encl_release(struct kref *ref)
{
struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
struct sgx_va_page *va_page;
struct sgx_encl_page *entry;
- unsigned long index;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
- xa_for_each(&encl->page_array, index, entry) {
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
if (entry->epc_page) {
/*
* The page and its radix tree entry cannot be freed
}
kfree(entry);
- /* Invoke scheduler to prevent soft lockups. */
- cond_resched();
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
}
+ xas_unlock(&xas);
xa_destroy(&encl->page_array);
{
struct memory_initiator *ia;
struct memory_initiator *ib;
- unsigned long *p_nodes = priv;
ia = list_entry(a, struct memory_initiator, node);
ib = list_entry(b, struct memory_initiator, node);
- set_bit(ia->processor_pxm, p_nodes);
- set_bit(ib->processor_pxm, p_nodes);
-
return ia->processor_pxm - ib->processor_pxm;
}
+static int initiators_to_nodemask(unsigned long *p_nodes)
+{
+ struct memory_initiator *initiator;
+
+ if (list_empty(&initiators))
+ return -ENXIO;
+
+ list_for_each_entry(initiator, &initiators, node)
+ set_bit(initiator->processor_pxm, p_nodes);
+
+ return 0;
+}
+
static void hmat_register_target_initiators(struct memory_target *target)
{
static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
* initiators.
*/
bitmap_zero(p_nodes, MAX_NUMNODES);
- list_sort(p_nodes, &initiators, initiator_cmp);
+ list_sort(NULL, &initiators, initiator_cmp);
+ if (initiators_to_nodemask(p_nodes) < 0)
+ return;
+
if (!access0done) {
for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) {
loc = localities_types[i];
/* Access 1 ignores Generic Initiators */
bitmap_zero(p_nodes, MAX_NUMNODES);
- list_sort(p_nodes, &initiators, initiator_cmp);
- best = 0;
+ if (initiators_to_nodemask(p_nodes) < 0)
+ return;
+
for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) {
loc = localities_types[i];
if (!loc)
return NOTIFY_OK;
}
- static struct notifier_block hmat_callback_nb = {
- .notifier_call = hmat_callback,
- .priority = 2,
- };
-
static __init void hmat_free_structures(void)
{
struct memory_target *target, *tnext;
hmat_register_targets();
/* Keep the table and structures if the notifier may use them */
- if (!register_hotmemory_notifier(&hmat_callback_nb))
+ if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
return 0;
out_put:
hmat_free_structures();
#include "amdgpu.h"
#include "amdgpu_display.h"
#include "amdgpu_dma_buf.h"
+#include "amdgpu_hmm.h"
#include "amdgpu_xgmi.h"
static const struct drm_gem_object_funcs amdgpu_gem_object_funcs;
struct amdgpu_bo *robj = gem_to_amdgpu_bo(gobj);
if (robj) {
- amdgpu_mn_unregister(robj);
+ amdgpu_hmm_unregister(robj);
amdgpu_bo_unref(&robj);
}
}
bp.resv = resv;
bp.preferred_domain = initial_domain;
bp.flags = flags;
- bp.domain = initial_domain;
+ bp.domain = initial_domain | AMDGPU_GEM_DOMAIN_CPU;
bp.bo_ptr_size = sizeof(struct amdgpu_bo);
r = amdgpu_bo_create_user(adev, &bp, &ubo);
* becoming writable and makes is_cow_mapping(vm_flags) false.
*/
if (is_cow_mapping(vma->vm_flags) &&
- !(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+ !(vma->vm_flags & VM_ACCESS_FLAGS))
vma->vm_flags &= ~VM_MAYWRITE;
return drm_gem_ttm_mmap(obj, vma);
}
initial_domain = (u32)(0xffffffff & args->in.domains);
-retry:
r = amdgpu_gem_object_create(adev, size, args->in.alignment,
- initial_domain,
- flags, ttm_bo_type_device, resv, &gobj);
+ initial_domain, flags, ttm_bo_type_device,
+ resv, &gobj);
if (r && r != -ERESTARTSYS) {
- if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) {
- flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
- goto retry;
- }
-
- if (initial_domain == AMDGPU_GEM_DOMAIN_VRAM) {
- initial_domain |= AMDGPU_GEM_DOMAIN_GTT;
- goto retry;
- }
DRM_DEBUG("Failed to allocate GEM object (%llu, %d, %llu, %d)\n",
size, initial_domain, args->in.alignment, r);
}
struct amdgpu_device *adev = drm_to_adev(dev);
struct drm_amdgpu_gem_userptr *args = data;
struct drm_gem_object *gobj;
+ struct hmm_range *range;
struct amdgpu_bo *bo;
uint32_t handle;
int r;
if (r)
goto release_object;
- if (args->flags & AMDGPU_GEM_USERPTR_REGISTER) {
- r = amdgpu_mn_register(bo, args->addr);
- if (r)
- goto release_object;
- }
+ r = amdgpu_hmm_register(bo, args->addr);
+ if (r)
+ goto release_object;
if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
- r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+ r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
+ &range);
if (r)
goto release_object;
user_pages_done:
if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
- amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+ amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
release_object:
drm_gem_object_put(gobj);
{
pgprot_t vm_page_prot;
- vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
+ vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
vm_page_prot = vm_get_page_prot(vma->vm_flags);
struct vm_area_struct *vma = vmf->vma;
struct drm_gem_object *obj = vma->vm_private_data;
struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
- struct page **pages, *page;
+ struct page **pages;
+ unsigned long pfn;
pgoff_t pgoff;
int err;
/* We don't use vmf->pgoff since that has the fake offset: */
pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
- page = pages[pgoff];
+ pfn = page_to_pfn(pages[pgoff]);
VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
- page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT);
+ pfn, pfn << PAGE_SHIFT);
- return vmf_insert_page(vma, vmf->address, page);
+ return vmf_insert_pfn(vma, vmf->address, pfn);
}
int etnaviv_gem_mmap_offset(struct drm_gem_object *obj, u64 *offset)
if (mapping->use == 0) {
mutex_lock(&mmu_context->lock);
if (mapping->context == mmu_context)
- mapping->use += 1;
+ if (va && mapping->iova != va) {
+ etnaviv_iommu_reap_mapping(mapping);
+ mapping = NULL;
+ } else {
+ mapping->use += 1;
+ }
else
mapping = NULL;
mutex_unlock(&mmu_context->lock);
kfree(mapping);
}
- drm_gem_free_mmap_offset(obj);
etnaviv_obj->ops->release(etnaviv_obj);
drm_gem_object_release(obj);
struct page **pvec = NULL;
struct etnaviv_gem_userptr *userptr = &etnaviv_obj->userptr;
int ret, pinned = 0, npages = etnaviv_obj->base.size >> PAGE_SHIFT;
+ unsigned int gup_flags = FOLL_LONGTERM;
might_lock_read(¤t->mm->mmap_lock);
if (!pvec)
return -ENOMEM;
+ if (!userptr->ro)
+ gup_flags |= FOLL_WRITE;
+
do {
unsigned num_pages = npages - pinned;
uint64_t ptr = userptr->ptr + pinned * PAGE_SIZE;
struct page **pages = pvec + pinned;
- ret = pin_user_pages_fast(ptr, num_pages,
- FOLL_WRITE | FOLL_FORCE | FOLL_LONGTERM,
- pages);
+ ret = pin_user_pages_fast(ptr, num_pages, gup_flags, pages);
if (ret < 0) {
unpin_user_pages(pvec, pinned);
kvfree(pvec);
* get_vaddr_frames() - map virtual addresses to pfns
* @start: starting user address
* @nr_frames: number of pages / pfns from start to map
+ * @write: the mapped address has write permission
* @vec: structure which receives pages / pfns of the addresses mapped.
* It should have space for at least nr_frames entries.
*
*
* This function takes care of grabbing mmap_lock as necessary.
*/
-int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames, bool write,
struct frame_vector *vec)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int ret_pin_user_pages_fast = 0;
- int ret = 0;
- int err;
+ int ret;
- unsigned int gup_flags = FOLL_FORCE | FOLL_LONGTERM;
++ unsigned int gup_flags = FOLL_LONGTERM;
if (nr_frames == 0)
return 0;
start = untagged_addr(start);
- ret = pin_user_pages_fast(start, nr_frames,
- FOLL_WRITE | FOLL_LONGTERM,
- (struct page **)(vec->ptrs));
- if (ret > 0) {
- vec->got_ref = true;
- vec->is_pfns = false;
- goto out_unlocked;
- }
- ret_pin_user_pages_fast = ret;
+ if (write)
+ gup_flags |= FOLL_WRITE;
- mmap_read_lock(mm);
- vec->got_ref = false;
- vec->is_pfns = true;
- ret = 0;
- do {
- unsigned long *nums = frame_vector_pfns(vec);
+ ret = pin_user_pages_fast(start, nr_frames, gup_flags,
+ (struct page **)(vec->ptrs));
+ vec->got_ref = true;
+ vec->is_pfns = false;
+ vec->nr_frames = ret;
- vma = vma_lookup(mm, start);
- if (!vma)
- break;
+ if (likely(ret > 0))
+ return ret;
- while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
- err = follow_pfn(vma, start, &nums[ret]);
- if (err) {
- if (ret)
- goto out;
- // If follow_pfn() returns -EINVAL, then this
- // is not an IO mapping or a raw PFN mapping.
- // In that case, return the original error from
- // pin_user_pages_fast(). Otherwise this
- // function would return -EINVAL when
- // pin_user_pages_fast() returned -ENOMEM,
- // which makes debugging hard.
- if (err == -EINVAL && ret_pin_user_pages_fast)
- ret = ret_pin_user_pages_fast;
- else
- ret = err;
- goto out;
- }
- start += PAGE_SIZE;
- ret++;
- }
- /* Bail out if VMA doesn't completely cover the tail page. */
- if (start < vma->vm_end)
- break;
- } while (ret < nr_frames);
-out:
- mmap_read_unlock(mm);
-out_unlocked:
- if (!ret)
- ret = -EFAULT;
- if (ret > 0)
- vec->nr_frames = ret;
- return ret;
+ /* This used to (racily) return non-refcounted pfns. Let people know */
+ WARN_ONCE(1, "get_vaddr_frames() cannot follow VM_IO mapping");
+ vec->nr_frames = 0;
+ return ret ? ret : -EFAULT;
}
EXPORT_SYMBOL(get_vaddr_frames);
return ncpy;
}
- static int fuse_check_page(struct page *page)
+ static int fuse_check_folio(struct folio *folio)
{
- if (page_mapcount(page) ||
- page->mapping != NULL ||
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ if (folio_mapped(folio) ||
+ folio->mapping != NULL ||
+ (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
1 << PG_reclaim |
1 << PG_waiters |
LRU_GEN_MASK | LRU_REFS_MASK))) {
- dump_page(page, "fuse: trying to steal weird page");
+ dump_page(&folio->page, "fuse: trying to steal weird page");
return 1;
}
return 0;
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
{
int err;
- struct page *oldpage = *pagep;
- struct page *newpage;
+ struct folio *oldfolio = page_folio(*pagep);
+ struct folio *newfolio;
struct pipe_buffer *buf = cs->pipebufs;
- get_page(oldpage);
+ folio_get(oldfolio);
err = unlock_request(cs->req);
if (err)
goto out_put_old;
if (!pipe_buf_try_steal(cs->pipe, buf))
goto out_fallback;
- newpage = buf->page;
+ newfolio = page_folio(buf->page);
- if (!PageUptodate(newpage))
- SetPageUptodate(newpage);
+ if (!folio_test_uptodate(newfolio))
+ folio_mark_uptodate(newfolio);
- ClearPageMappedToDisk(newpage);
+ folio_clear_mappedtodisk(newfolio);
- if (fuse_check_page(newpage) != 0)
+ if (fuse_check_folio(newfolio) != 0)
goto out_fallback_unlock;
/*
* This is a new and locked page, it shouldn't be mapped or
* have any special flags on it
*/
- if (WARN_ON(page_mapped(oldpage)))
+ if (WARN_ON(folio_mapped(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(page_has_private(oldpage)))
+ if (WARN_ON(folio_has_private(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ if (WARN_ON(folio_test_dirty(oldfolio) ||
+ folio_test_writeback(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageMlocked(oldpage)))
+ if (WARN_ON(folio_test_mlocked(oldfolio)))
goto out_fallback_unlock;
- replace_page_cache_page(oldpage, newpage);
+ replace_page_cache_folio(oldfolio, newfolio);
- get_page(newpage);
+ folio_get(newfolio);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
- lru_cache_add(newpage);
+ folio_add_lru(newfolio);
/*
* Release while we have extra ref on stolen page. Otherwise
if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
- *pagep = newpage;
+ *pagep = &newfolio->page;
spin_unlock(&cs->req->waitq.lock);
if (err) {
- unlock_page(newpage);
- put_page(newpage);
+ folio_unlock(newfolio);
+ folio_put(newfolio);
goto out_put_old;
}
- unlock_page(oldpage);
+ folio_unlock(oldfolio);
/* Drop ref for ap->pages[] array */
- put_page(oldpage);
+ folio_put(oldfolio);
cs->len = 0;
err = 0;
out_put_old:
/* Drop ref obtained in this function */
- put_page(oldpage);
+ folio_put(oldfolio);
return err;
out_fallback_unlock:
- unlock_page(newpage);
+ folio_unlock(newfolio);
out_fallback:
cs->pg = buf->page;
cs->offset = buf->offset;
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
kfree(buf);
return err;
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
kfree(buf);
return err;
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (old->f_op == file->f_op &&
- old->f_cred->user_ns == file->f_cred->user_ns)
+ if (old->f_op == file->f_op)
fud = fuse_get_dev(old);
if (fud) {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfs_read_folio,
- .writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
.bmap = hfs_bmap,
.direct_IO = hfs_direct_IO,
.writepages = hfs_writepages,
+ .migrate_folio = buffer_migrate_folio,
};
/*
/* panic? */
return -EIO;
+ if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
+ return -EIO;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
/* panic? */
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfsplus_read_folio,
- .writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
.bmap = hfsplus_bmap,
.direct_IO = hfsplus_direct_IO,
.writepages = hfsplus_writepages,
+ .migrate_folio = buffer_migrate_folio,
};
const struct dentry_operations hfsplus_dentry_operations = {
mode = be16_to_cpu(perms->mode);
i_uid_write(inode, be32_to_cpu(perms->owner));
- if (!i_uid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode))
inode->i_uid = sbi->uid;
i_gid_write(inode, be32_to_cpu(perms->group));
- if (!i_gid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode))
inode->i_gid = sbi->gid;
if (dir) {
static int
xfs_setattr_nonsize(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
* Posix ACL code seems to care about this issue either.
*/
if (mask & ATTR_MODE) {
- error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
if (error)
return error;
}
STATIC int
xfs_setattr_size(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
/*
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(mnt_userns, ip, iattr);
+ return xfs_setattr_size(mnt_userns, dentry, ip, iattr);
}
STATIC int
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (!error)
- error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
return error;
}
static const struct inode_operations xfs_inode_operations = {
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
- /* Only supported on non-reflinked files. */
- if (xfs_is_reflink_inode(ip))
- return false;
-
/* Block size must match page size */
if (mp->m_sb.sb_blocksize != PAGE_SIZE)
return false;
/* Get a random number in [l, r) */
static inline unsigned long damon_rand(unsigned long l, unsigned long r)
{
- return l + prandom_u32_max(r - l);
+ return l + get_random_u32_below(r - l);
}
/**
* @after_wmarks_check: Called after each schemes' watermarks check.
* @after_sampling: Called after each sampling.
* @after_aggregation: Called after each aggregation.
+ * @before_damos_apply: Called before applying DAMOS action.
* @before_terminate: Called before terminating the monitoring.
* @private: User private data.
*
int (*after_wmarks_check)(struct damon_ctx *context);
int (*after_sampling)(struct damon_ctx *context);
int (*after_aggregation)(struct damon_ctx *context);
+ int (*before_damos_apply)(struct damon_ctx *context,
+ struct damon_target *target,
+ struct damon_region *region,
+ struct damos *scheme);
void (*before_terminate)(struct damon_ctx *context);
};
extern void * high_memory;
extern int page_cluster;
+ extern const int page_cluster_max;
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
/*
* Called by mprotect() to make driver-specific permission
* checks before mprotect() is finalised. The VMA must not
- * be modified. Returns 0 if eprotect() can proceed.
+ * be modified. Returns 0 if mprotect() can proceed.
*/
int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long newflags);
* paths in userfault.
*/
bool vma_is_shmem(struct vm_area_struct *vma);
+ bool vma_is_anon_shmem(struct vm_area_struct *vma);
#else
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
+ static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
#endif
int vma_is_stack_for_current(struct vm_area_struct *vma);
/*
* How many times the entire folio is mapped as a single unit (eg by a
* PMD or PUD entry). This is probably not what you want, except for
- * debugging purposes; look at folio_mapcount() or page_mapcount()
- * instead.
+ * debugging purposes - it does not include PTE-mapped sub-pages; look
+ * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
*/
static inline int folio_entire_mapcount(struct folio *folio)
{
/*
* Mapcount of compound page as a whole, does not include mapped sub-pages.
- *
- * Must be called only for compound pages.
+ * Must be called only on head of compound page.
+ */
+ static inline int head_compound_mapcount(struct page *head)
+ {
+ return atomic_read(compound_mapcount_ptr(head)) + 1;
+ }
+
+ /*
+ * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages,
+ * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit
+ * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
+ * leaves subpages_mapcount at 0, but avoid surprise if it participates later.
*/
- static inline int compound_mapcount(struct page *page)
+ #define COMPOUND_MAPPED 0x800000
+ #define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1)
+
+ /*
+ * Number of sub-pages mapped by PTE, does not include compound mapcount.
+ * Must be called only on head of compound page.
+ */
+ static inline int head_subpages_mapcount(struct page *head)
{
- return folio_entire_mapcount(page_folio(page));
+ return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED;
}
/*
atomic_set(&(page)->_mapcount, -1);
}
- int __page_mapcount(struct page *page);
-
/*
* Mapcount of 0-order page; when compound sub-page, includes
- * compound_mapcount().
+ * compound_mapcount of compound_head of page.
*
* Result is undefined for pages which cannot be mapped into userspace.
* For example SLAB or special types of pages. See function page_has_type().
*/
static inline int page_mapcount(struct page *page)
{
- if (unlikely(PageCompound(page)))
- return __page_mapcount(page);
- return atomic_read(&page->_mapcount) + 1;
+ int mapcount = atomic_read(&page->_mapcount) + 1;
+
+ if (likely(!PageCompound(page)))
+ return mapcount;
+ page = compound_head(page);
+ return head_compound_mapcount(page) + mapcount;
}
- int folio_mapcount(struct folio *folio);
+ int total_compound_mapcount(struct page *head);
- #ifdef CONFIG_TRANSPARENT_HUGEPAGE
- static inline int total_mapcount(struct page *page)
+ /**
+ * folio_mapcount() - Calculate the number of mappings of this folio.
+ * @folio: The folio.
+ *
+ * A large folio tracks both how many times the entire folio is mapped,
+ * and how many times each individual page in the folio is mapped.
+ * This function calculates the total number of times the folio is
+ * mapped.
+ *
+ * Return: The number of times this folio is mapped.
+ */
+ static inline int folio_mapcount(struct folio *folio)
{
- return folio_mapcount(page_folio(page));
+ if (likely(!folio_test_large(folio)))
+ return atomic_read(&folio->_mapcount) + 1;
+ return total_compound_mapcount(&folio->page);
}
- #else
static inline int total_mapcount(struct page *page)
{
- return page_mapcount(page);
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+ return total_compound_mapcount(compound_head(page));
+ }
+
+ static inline bool folio_large_is_mapped(struct folio *folio)
+ {
+ /*
+ * Reading folio_mapcount_ptr() below could be omitted if hugetlb
+ * participated in incrementing subpages_mapcount when compound mapped.
+ */
+ return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 ||
+ atomic_read(folio_mapcount_ptr(folio)) >= 0;
+ }
+
+ /**
+ * folio_mapped - Is this folio mapped into userspace?
+ * @folio: The folio.
+ *
+ * Return: True if any page in this folio is referenced by user page tables.
+ */
+ static inline bool folio_mapped(struct folio *folio)
+ {
+ if (likely(!folio_test_large(folio)))
+ return atomic_read(&folio->_mapcount) >= 0;
+ return folio_large_is_mapped(folio);
+ }
+
+ /*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any sub-page of compound page is mapped,
+ * even if this particular sub-page is not itself mapped by any PTE or PMD.
+ */
+ static inline bool page_mapped(struct page *page)
+ {
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ return folio_large_is_mapped(page_folio(page));
}
- #endif
static inline struct page *virt_to_head_page(const void *x)
{
page[1].compound_dtor = compound_dtor;
}
+ static inline void folio_set_compound_dtor(struct folio *folio,
+ enum compound_dtor_id compound_dtor)
+ {
+ VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
+ folio->_folio_dtor = compound_dtor;
+ }
+
void destroy_large_folio(struct folio *folio);
static inline int head_compound_pincount(struct page *head)
#endif
}
+ /*
+ * folio_set_compound_order is generally passed a non-zero order to
+ * initialize a large folio. However, hugetlb code abuses this by
+ * passing in zero when 'dissolving' a large folio.
+ */
+ static inline void folio_set_compound_order(struct folio *folio,
+ unsigned int order)
+ {
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+
+ folio->_folio_order = order;
+ #ifdef CONFIG_64BIT
+ folio->_folio_nr_pages = order ? 1U << order : 0;
+ #endif
+ }
+
/* Returns the number of pages in this potentially compound page. */
static inline unsigned long compound_nr(struct page *page)
{
folio_get(page_folio(page));
}
-bool __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_page(struct page *page, unsigned int flags);
static inline __must_check bool try_get_page(struct page *page)
{
__folio_put(folio);
}
- void release_pages(struct page **pages, int nr);
+ /**
+ * release_pages - release an array of pages or folios
+ *
+ * This just releases a simple array of multiple pages, and
+ * accepts various different forms of said page array: either
+ * a regular old boring array of pages, an array of folios, or
+ * an array of encoded page pointers.
+ *
+ * The transparent union syntax for this kind of "any of these
+ * argument types" is all kinds of ugly, so look away.
+ */
+ typedef union {
+ struct page **pages;
+ struct folio **folios;
+ struct encoded_page **encoded_pages;
+ } release_pages_arg __attribute__ ((__transparent_union__));
+
+ void release_pages(release_pages_arg, int nr);
/**
* folios_put - Decrement the reference count on an array of folios.
*/
static inline void folios_put(struct folio **folios, unsigned int nr)
{
- release_pages((struct page **)folios, nr);
+ release_pages(folios, nr);
}
static inline void put_page(struct page *page)
return page->index;
}
- bool page_mapped(struct page *page);
- bool folio_mapped(struct folio *folio);
-
/*
* Return true only if the page has been allocated with
* ALLOC_NO_WATERMARKS and the low watermark was not
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
+ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
+ {
+ /*
+ * We want to check manually if we can change individual PTEs writable
+ * if we can't do that automatically for all PTEs in a mapping. For
+ * private mappings, that's always the case when we have write
+ * permissions as we properly have to handle COW.
+ */
+ if (vma->vm_flags & VM_SHARED)
+ return vma_wants_writenotify(vma, vma->vm_page_prot);
+ return !!(vma->vm_flags & VM_WRITE);
+
+ }
+ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte);
extern unsigned long change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
- long val = atomic_long_read(&mm->rss_stat.count[member]);
-
- #ifdef SPLIT_RSS_COUNTING
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- val = 0;
- #endif
- return (unsigned long)val;
+ return percpu_counter_read_positive(&mm->rss_stat[member]);
}
- void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+ void mm_trace_rss_stat(struct mm_struct *mm, int member);
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
- long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+ percpu_counter_add(&mm->rss_stat[member], value);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
- long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+ percpu_counter_inc(&mm->rss_stat[member]);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
- long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+ percpu_counter_dec(&mm->rss_stat[member]);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
/* Optimized variant when page is already known not to be PageAnon */
}
#endif
- int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-
extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl);
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
#if USE_SPLIT_PMD_PTLOCKS
- static struct page *pmd_to_page(pmd_t *pmd)
+ static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
return virt_to_page((void *)((unsigned long) pmd & mask));
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_to_page(pmd));
+ return ptlock_ptr(pmd_pgtable_page(pmd));
}
static inline bool pmd_ptlock_init(struct page *page)
ptlock_free(page);
}
- #define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)
+ #define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
#else
* and return without waiting upon it */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
- #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
* PTE-mapped THP.
+ *
+ * If the vma is NULL, we're coming from the GUP-fast path and might have
+ * to fallback to the slow path just to lookup the vma.
*/
- static inline bool gup_must_unshare(unsigned int flags, struct page *page)
+ static inline bool gup_must_unshare(struct vm_area_struct *vma,
+ unsigned int flags, struct page *page)
{
/*
* FOLL_WRITE is implicitly handled correctly as the page table entry
* Note: PageAnon(page) is stable until the page is actually getting
* freed.
*/
- if (!PageAnon(page))
- return false;
+ if (!PageAnon(page)) {
+ /*
+ * We only care about R/O long-term pining: R/O short-term
+ * pinning does not have the semantics to observe successive
+ * changes through the process page tables.
+ */
+ if (!(flags & FOLL_LONGTERM))
+ return false;
+
+ /* We really need the vma ... */
+ if (!vma)
+ return true;
+
+ /*
+ * ... because we only care about writable private ("COW")
+ * mappings where we have to break COW early.
+ */
+ return is_cow_mapping(vma->vm_flags);
+ }
/* Paired with a memory barrier in page_try_share_anon_rmap(). */
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
+ void pmd_init(void *addr);
+ void pud_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
void *vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
+ void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next);
+ int vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap);
+ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
- extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
- extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+ extern void memory_failure_queue(unsigned long pfn, int flags);
+ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared);
+ void num_poisoned_pages_inc(unsigned long pfn);
+ void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
- static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+ static inline void memory_failure_queue(unsigned long pfn, int flags)
+ {
+ }
+
+ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
return 0;
}
+
+ static inline void num_poisoned_pages_inc(unsigned long pfn)
+ {
+ }
+
+ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+ {
+ }
+ #endif
+
+ #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+ extern void memblk_nr_poison_inc(unsigned long pfn);
+ extern void memblk_nr_poison_sub(unsigned long pfn, long i);
+ #else
+ static inline void memblk_nr_poison_inc(unsigned long pfn)
+ {
+ }
+
+ static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
+ {
+ }
#endif
#ifndef arch_memory_failure
{
return page_zonenum(page) == ZONE_DEVICE;
}
+
+/*
+ * Consecutive zone device pages should not be merged into the same sgl
+ * or bvec segment with other types of pages or if they belong to different
+ * pgmaps. Otherwise getting the pgmap of a given segment is not possible
+ * without scanning the entire segment. This helper returns true either if
+ * both pages are not zone device pages or both pages are zone device pages
+ * with the same pgmap.
+ */
+static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
+ const struct page *b)
+{
+ if (is_zone_device_page(a) != is_zone_device_page(b))
+ return false;
+ if (!is_zone_device_page(a))
+ return true;
+ return a->pgmap == b->pgmap;
+}
+
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
{
return false;
}
+static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
+ const struct page *b)
+{
+ return true;
+}
#endif
static inline bool folio_is_zone_device(const struct folio *folio)
/* start time in ms of current promote threshold adjustment period */
unsigned int nbp_th_start;
/*
- * number of promote candidate pages at stat time of current promote
+ * number of promote candidate pages at start time of current promote
* threshold adjustment period
*/
unsigned long nbp_th_nr_cand;
struct mm_struct *mm;
struct mm_struct *active_mm;
- #ifdef SPLIT_RSS_COUNTING
- struct task_rss_stat rss_stat;
- #endif
int exit_state;
int exit_code;
int exit_signal;
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
-#ifdef CONFIG_PSI
- unsigned sched_psi_wake_requeue:1;
-#endif
/* Force alignment to the next boundary: */
unsigned :0;
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
- struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
+ struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * if on default hierarchy.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
+static void cpuset_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
}
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
}
}
+out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
cs->attach_in_progress--;
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
return NOTIFY_OK;
}
- static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
- };
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/stackprotector.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
release_user_cpus_ptr(tsk);
scs_release(tsk);
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
+ if (likely(!x))
+ continue;
+
+ /* Making sure this is not due to race with CPU offlining. */
+ x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
*/
void __mmdrop(struct mm_struct *mm)
{
+ int i;
+
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
+ int i;
+
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
if (init_new_context(p, mm))
goto fail_nocontext;
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
+ fail_pcpu:
+ while (i > 0)
+ percpu_counter_destroy(&mm->rss_stat[--i]);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
return ERR_PTR(-EINVAL);
}
- /*
- * If the new process will be in a different time namespace
- * do not allow it to share VM or a thread group with the forking task.
- */
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
- if (nsp->time_ns != nsp->time_ns_for_children)
- return ERR_PTR(-EINVAL);
- }
-
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
spin_lock(¤t->sighand->siglock);
- /*
- * Copy seccomp details explicitly here, in case they were changed
- * before holding sighand lock.
- */
- copy_seccomp(p);
-
rv_task_fork(p);
rseq_fork(p, clone_flags);
goto bad_fork_cancel_cgroup;
}
+ /* No more failure paths after this point. */
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
ppos);
}
-static size_t proc_skip_spaces(char **buf)
+static void proc_skip_spaces(char **buf, size_t *size)
{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
+ while (*size) {
+ if (!isspace(**buf))
+ break;
+ (*size)--;
+ (*buf)++;
+ }
}
static void proc_skip_char(char **buf, size_t *size, const char v)
unsigned long *val, bool *neg,
const char *perm_tr, unsigned perm_tr_len, char *tr)
{
- int len;
char *p, tmp[TMPBUFLEN];
+ ssize_t len = *size;
- if (!*size)
+ if (len <= 0)
return -EINVAL;
- len = *size;
if (len > TMPBUFLEN - 1)
len = TMPBUFLEN - 1;
bool neg;
if (write) {
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left)
break;
if (!write && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (write && first)
return err ? : -EINVAL;
*lenp -= left;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left) {
err = -EINVAL;
goto out_free;
}
if (!err && left)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
out_free:
if (err)
if (write) {
bool neg;
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (!left)
break;
if (!write && !first && left && !err)
proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&p);
+ proc_skip_spaces(&p, &left);
if (write && first)
return err ? : -EINVAL;
*lenp -= left;
}
static struct ctl_table kern_table[] = {
-#ifdef CONFIG_NUMA_BALANCING
- {
- .procname = "numa_balancing",
- .data = NULL, /* filled in by handler */
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_numa_balancing,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_FOUR,
- },
- {
- .procname = "numa_balancing_promote_rate_limit_MBps",
- .data = &sysctl_numa_balancing_promote_rate_limit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#endif /* CONFIG_NUMA_BALANCING */
{
.procname = "panic",
.data = &panic_timeout,
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
},
{
.procname = "dirtytime_expire_seconds",
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.
+config DEBUG_CGROUP_REF
+ bool "Disable inlining of cgroup css reference count functions"
+ depends on DEBUG_KERNEL
+ depends on CGROUPS
+ depends on KPROBES
+ default n
+ help
+ Force cgroup css reference count functions to not be inlined so
+ that they can be kprobed for debugging.
+
source "kernel/trace/Kconfig"
config PROVIDE_OHCI1394_DMA_INIT
If unsure, say N.
config FUNCTION_ERROR_INJECTION
- def_bool y
+ bool "Fault-injections of functions"
depends on HAVE_FUNCTION_ERROR_INJECTION && KPROBES
+ help
+ Add fault injections into various functions that are annotated with
+ ALLOW_ERROR_INJECTION() in the kernel. BPF may also modify the return
+ value of theses functions. This is useful to test error paths of code.
+
+ If unsure, say N
config FAULT_INJECTION
bool "Fault-injection framework"
tristate "Test the XArray code at runtime"
config TEST_MAPLE_TREE
+ depends on DEBUG_KERNEL
select DEBUG_MAPLE_TREE
tristate "Test the Maple Tree code at runtime"
If unsure, say Y.
+config RUST_BUILD_ASSERT_ALLOW
+ bool "Allow unoptimized build-time assertions"
+ depends on RUST
+ help
+ Controls how are `build_error!` and `build_assert!` handled during build.
+
+ If calls to them exist in the binary, it may indicate a violated invariant
+ or that the optimizer failed to verify the invariant during compilation.
+
+ This should not happen, thus by default the build is aborted. However,
+ as an escape hatch, you can choose Y here to ignore them during build
+ and let the check be carried at runtime (with `panic!` being called if
+ the check fails).
+
+ If unsure, say N.
+
endmenu # "Rust"
source "Documentation/Kconfig"
(HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \
CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \
HAVE_ARCH_KASAN_HW_TAGS
- depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
+ depends on (SLUB && SYSFS && !SLUB_TINY) || (SLAB && !DEBUG_SLAB)
select STACKDEPOT_ALWAYS_INIT
help
Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety
config KASAN_KUNIT_TEST
tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
- depends on KASAN && KUNIT
+ depends on KASAN && KUNIT && TRACEPOINTS
default KUNIT_ALL_TESTS
help
A KUnit-based KASAN test suite. Triggers different kinds of
and has enhanced diagnostics. SLUB is the default choice for
a slab allocator.
-config SLOB
+config SLOB_DEPRECATED
depends on EXPERT
- bool "SLOB (Simple Allocator)"
+ bool "SLOB (Simple Allocator - DEPRECATED)"
depends on !PREEMPT_RT
help
+ Deprecated and scheduled for removal in a few cycles. SLUB
+ recommended as replacement. CONFIG_SLUB_TINY can be considered
+ on systems with 16MB or less RAM.
+
+ people listed in the SLAB ALLOCATOR section of MAINTAINERS file,
+ with your use case.
+
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but
does not perform as well on large systems.
endchoice
+config SLOB
+ bool
+ default y
+ depends on SLOB_DEPRECATED
+
+config SLUB_TINY
+ bool "Configure SLUB for minimal memory footprint"
+ depends on SLUB && EXPERT
+ select SLAB_MERGE_DEFAULT
+ help
+ Configures the SLUB allocator in a way to achieve minimal memory
+ footprint, sacrificing scalability, debugging and other features.
+ This is intended only for the smallest system that had used the
+ SLOB allocator and is not recommended for systems with more than
+ 16MB RAM.
+
+ If unsure, say N.
+
config SLAB_MERGE_DEFAULT
bool "Allow slab caches to be merged"
default y
config SLAB_FREELIST_RANDOM
bool "Randomize slab freelist"
- depends on SLAB || SLUB
+ depends on SLAB || (SLUB && !SLUB_TINY)
help
Randomizes the freelist order used on creating new pages. This
security feature reduces the predictability of the kernel slab
config SLAB_FREELIST_HARDENED
bool "Harden slab freelist metadata"
- depends on SLAB || SLUB
+ depends on SLAB || (SLUB && !SLUB_TINY)
help
Many kernel heap attacks try to target slab cache metadata and
other infrastructure. This options makes minor performance
config SLUB_STATS
default n
bool "Enable SLUB performance statistics"
- depends on SLUB && SYSFS
+ depends on SLUB && SYSFS && !SLUB_TINY
help
SLUB statistics are useful to debug SLUBs allocation behavior in
order find ways to optimize the allocator. This should never be
config SLUB_CPU_PARTIAL
default y
- depends on SLUB && SMP
+ depends on SLUB && SMP && !SLUB_TINY
bool "SLUB per cpu partial cache"
help
Per cpu partial caches accelerate objects allocation and freeing
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now, swap cluster backing transparent huge page
bool
config SECRETMEM
- def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+ default y
+ bool "Enable memfd_secret() system call" if EXPERT
+ depends on ARCH_HAS_SET_DIRECT_MAP
+ help
+ Enable the memfd_secret() system call with the ability to create
+ memory areas visible only in the context of the owning process and
+ not mapped to other processes and other kernel page tables.
config ANON_VMA_NAME
bool "Anonymous VMA name support"
help
Arch has userfaultfd minor fault support
- config PTE_MARKER
- bool
-
- help
- Allows to create marker PTEs for file-backed memory.
-
config PTE_MARKER_UFFD_WP
bool "Userfaultfd write protection support for shmem/hugetlbfs"
default y
depends on HAVE_ARCH_USERFAULTFD_WP
- select PTE_MARKER
help
Allows to create marker PTEs for userfaultfd write protection
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
if (flags & FOLL_GET)
return try_get_folio(page, refs);
else if (flags & FOLL_PIN) {
* time. Cases: please see the try_grab_folio() documentation, with
* "refs=1".
*
- * Return: true for success, or if no action was required (if neither FOLL_PIN
- * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
- * FOLL_PIN was set, but the page could not be grabbed.
+ * Return: 0 for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). A negative error code for failure:
+ *
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * be grabbed.
*/
-bool __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_page(struct page *page, unsigned int flags)
{
struct folio *folio = page_folio(page);
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
- return false;
+ return -ENOMEM;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return -EREMOTEIO;
if (flags & FOLL_GET)
folio_ref_inc(folio);
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
}
- return true;
+ return 0;
}
/**
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
-
- /*
- * Considering PTE level hugetlb, like continuous-PTE hugetlb on
- * ARM64 architecture.
- */
- if (is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd_pte(vma, address, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
- retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto retry;
- }
+ if (!pte_present(pte))
+ goto no_page;
if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto no_page;
}
}
- if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
page = ERR_PTR(-EMLINK);
goto out;
}
!PageAnonExclusive(page), page);
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- if (unlikely(!try_grab_page(page, flags))) {
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (unlikely(ret)) {
+ page = ERR_PTR(ret);
goto out;
}
+
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd_pte(vma, address, flags);
- if (page)
- return page;
+ if (!pmd_present(pmdval))
return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(pmdval)), flags,
- PMD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- retry:
- if (!pmd_present(pmdval)) {
- /*
- * Should never reach here, if thp migration is not supported;
- * Otherwise, it must be a thp migration entry.
- */
- VM_BUG_ON(!thp_migration_supported() ||
- !is_pmd_migration_entry(pmdval));
-
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
-
- pmd_migration_entry_wait(mm, pmd);
- pmdval = READ_ONCE(*pmd);
- /*
- * MADV_DONTNEED may convert the pmd to null because
- * mmap_lock is held in read mode
- */
- if (pmd_none(pmdval))
- return no_page_table(vma, flags);
- goto retry;
- }
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
return no_page_table(vma, flags);
- retry_locked:
ptl = pmd_lock(mm, pmd);
- if (unlikely(pmd_none(*pmd))) {
- spin_unlock(ptl);
- return no_page_table(vma, flags);
- }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
- pmd_migration_entry_wait(mm, pmd);
- goto retry_locked;
+ return no_page_table(vma, flags);
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pud(mm, address, pud, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pud_val(*pud)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pud_val(*pud)), flags,
- PUD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
struct follow_page_context *ctx)
{
p4d_t *p4d;
- struct page *page;
p4d = p4d_offset(pgdp, address);
if (p4d_none(*p4d))
if (unlikely(p4d_bad(*p4d)))
return no_page_table(vma, flags);
- if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(p4d_val(*p4d)), flags,
- P4D_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
return follow_pud_mask(vma, address, p4d, flags, ctx);
}
ctx->page_mask = 0;
- /* make this handle hugepd */
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+ /*
+ * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+ * special hugetlb page table walking code. This eliminates the
+ * need to check for hugetlb entries in the general walking code.
+ *
+ * hugetlb_follow_page_mask is only for follow_page() handling here.
+ * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = hugetlb_follow_page_mask(vma, address, flags);
+ if (!page)
+ page = no_page_table(vma, flags);
return page;
}
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
- if (pgd_huge(*pgd)) {
- page = follow_huge_pgd(mm, address, pgd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pgd_val(*pgd)), flags,
- PGDIR_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
return follow_p4d_mask(vma, address, pgd, flags, ctx);
}
goto unmap;
*page = pte_page(*pte);
}
- if (unlikely(!try_grab_page(*page, gup_flags))) {
- ret = -ENOMEM;
+ ret = try_grab_page(*page, gup_flags);
+ if (unlikely(ret))
goto unmap;
- }
out:
ret = 0;
unmap:
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
+ /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
+ if (is_vm_hugetlb_page(vma))
+ return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
+ int *locked,
unsigned int gup_flags)
{
+ bool must_unlock = false;
unsigned int flags;
long rc, nr_pinned_pages;
+ if (locked && WARN_ON_ONCE(!*locked))
+ return -EINVAL;
+
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
+ locked, gup_flags);
/*
* If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
return -EINVAL;
flags = memalloc_pin_save();
do {
+ if (locked && !*locked) {
+ mmap_read_lock(mm);
+ must_unlock = true;
+ *locked = 1;
+ }
nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, NULL,
+ pages, vmas, locked,
gup_flags);
if (nr_pinned_pages <= 0) {
rc = nr_pinned_pages;
} while (rc == -EAGAIN);
memalloc_pin_restore(flags);
+ if (locked && *locked && must_unlock) {
+ mmap_read_unlock(mm);
+ *locked = 0;
+ }
return rc ? rc : nr_pinned_pages;
}
}
#ifdef CONFIG_MMU
- static long __get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
- {
- /*
- * Parts of FOLL_LONGTERM behavior are incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. However, this only comes up if locked is set, and there are
- * callers that do request FOLL_LONGTERM, but do not set locked. So,
- * allow what we can.
- */
- if (gup_flags & FOLL_LONGTERM) {
- if (WARN_ON_ONCE(locked))
- return -EINVAL;
- /*
- * This will check the vmas (even if our vmas arg is NULL)
- * and return -ENOTSUPP if DAX isn't allowed in this case:
- */
- return __gup_longterm_locked(mm, start, nr_pages, pages,
- vmas, gup_flags | FOLL_TOUCH |
- FOLL_REMOTE);
- }
-
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
- }
-
/**
* get_user_pages_remote() - pin user pages in memory
* @mm: mm_struct of target mm
if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
- return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
- pages, vmas, locked);
+ return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
{
return 0;
}
-
- static long __get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
- {
- return 0;
- }
#endif /* !CONFIG_MMU */
/**
return -EINVAL;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, gup_flags | FOLL_TOUCH);
+ pages, vmas, NULL, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
int locked = 1;
long ret;
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
mmap_read_lock(mm);
- ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
- &locked, gup_flags | FOLL_TOUCH);
+ ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked,
+ gup_flags | FOLL_TOUCH);
if (locked)
mmap_read_unlock(mm);
return ret;
goto pte_unmap;
}
- if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
+
+ if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ break;
+ }
+
SetPageReferenced(page);
pages[*nr] = page;
- if (unlikely(!try_grab_page(page, flags))) {
+ if (unlikely(try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
return 0;
}
- if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
return 0;
}
- if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
return 0;
}
- if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
}
#endif
- static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
- {
- int ret;
-
- /*
- * FIXME: FOLL_LONGTERM does not work with
- * get_user_pages_unlocked() (see comments in that function)
- */
- if (gup_flags & FOLL_LONGTERM) {
- mmap_read_lock(current->mm);
- ret = __gup_longterm_locked(current->mm,
- start, nr_pages,
- pages, NULL, gup_flags);
- mmap_read_unlock(current->mm);
- } else {
- ret = get_user_pages_unlocked(start, nr_pages,
- pages, gup_flags);
- }
-
- return ret;
- }
-
static unsigned long lockless_pages_from_mm(unsigned long start,
unsigned long end,
unsigned int gup_flags,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY | FOLL_NOFAULT)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT |
+ FOLL_PCI_P2PDMA)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
/* Slow path: try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
- ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
- pages);
+ ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages,
+ gup_flags);
if (ret < 0) {
/*
* The caller has to unpin the pages we already pinned so
if (WARN_ON_ONCE(!pages))
return -EINVAL;
- gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
- pages, vmas, locked);
+ return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
+ gup_flags | FOLL_PIN | FOLL_TOUCH |
+ FOLL_REMOTE);
}
EXPORT_SYMBOL(pin_user_pages_remote);
gup_flags |= FOLL_PIN;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, gup_flags);
+ pages, vmas, NULL, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pud_lockptr(mm, pud));
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
-
if (is_huge_zero_pmd(orig_pmd))
goto fallback;
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- return VM_FAULT_WRITE;
+ return 0;
}
unlock_fallback:
return VM_FAULT_FALLBACK;
}
+ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+ {
+ struct page *page;
+
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
+ return false;
+
+ /* Don't touch entries that are not even readable (NUMA hinting). */
+ if (pmd_protnone(pmd))
+ return false;
+
+ /* Do we need write faults for softdirty tracking? */
+ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ return false;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_huge_pmd_wp(vma, pmd))
+ return false;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ /* See can_change_pte_writable(). */
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+ }
+
+ /* See can_change_pte_writable(). */
+ return pmd_dirty(pmd);
+ }
+
/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
return NULL;
- if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
+ if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- if (!try_grab_page(page, flags))
- return ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ return ERR_PTR(ret);
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = NUMA_NO_NODE;
int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
- bool migrated = false;
- bool was_writable = pmd_savedwrite(oldpmd);
+ bool migrated = false, writable = false;
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
}
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+
+ /*
+ * Detect now whether the PMD could be writable; this information
+ * is only valid while holding the PT lock.
+ */
+ writable = pmd_write(pmd);
+ if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pmd_writable(vma, vmf->address, pmd))
+ writable = true;
+
page = vm_normal_page_pmd(vma, haddr, pmd);
if (!page)
goto out_map;
/* See similar comment in do_numa_page for explanation */
- if (!was_writable)
+ if (!writable)
flags |= TNF_NO_GROUP;
page_nid = page_to_nid(page);
}
spin_unlock(vmf->ptl);
+ writable = false;
migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
/* Restore the PMD */
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
- if (was_writable)
+ if (writable)
pmd = pmd_mkwrite(pmd);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t oldpmd, entry;
- bool preserve_write;
- int ret;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ int ret = 1;
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
if (!ptl)
return 0;
- preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
entry = pmd_modify(oldpmd, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
if (uffd_wp) {
entry = pmd_wrprotect(entry);
entry = pmd_mkuffd_wp(entry);
*/
entry = pmd_clear_uffd_wp(entry);
}
+
+ /* See change_pte_range(). */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
+ can_change_pmd_writable(vma, addr, entry))
+ entry = pmd_mkwrite(entry);
+
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
if (huge_pmd_needs_flush(oldpmd, entry))
tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
-
- BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
unlock:
spin_unlock(ptl);
return ret;
uffd_wp = pmd_uffd_wp(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- page_ref_add(page, HPAGE_PMD_NR - 1);
/*
* Without "freeze", we'll simply split the PMD, propagating the
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
freeze = false;
+ if (!freeze)
+ page_ref_add(page, HPAGE_PMD_NR - 1);
}
/*
entry = maybe_mkwrite(entry, vma);
if (anon_exclusive)
SetPageAnonExclusive(page + i);
- if (!write)
- entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
/*
- * NOTE: we don't do pte_mkdirty when dirty==true
- * because it breaks sparc64 which can sigsegv
- * random process. Need to revisit when we figure
- * out what is special with sparc64.
+ * NOTE: this needs to happen after pte_mkdirty,
+ * because some archs (sparc64, loongarch) could
+ * set hw write bit when mkdirty.
*/
+ if (!write)
+ entry = pte_wrprotect(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_mkuffd_wp(entry);
+ page_add_anon_rmap(page + i, vma, addr, false);
}
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
- if (!pmd_migration)
- atomic_inc(&page[i]._mapcount);
pte_unmap(pte);
}
- if (!pmd_migration) {
- /*
- * Set PG_double_map before dropping compound_mapcount to avoid
- * false-negative page_mapped().
- */
- if (compound_mapcount(page) > 1 &&
- !TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_inc(&page[i]._mapcount);
- }
-
- lock_page_memcg(page);
- if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
- /* Last compound_mapcount is gone. */
- __mod_lruvec_page_state(page, NR_ANON_THPS,
- -HPAGE_PMD_NR);
- if (TestClearPageDoubleMap(page)) {
- /* No need in mapcount reference anymore */
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
- }
- }
- unlock_page_memcg(page);
-
- /* Above is effectively page_remove_rmap(page, vma, true) */
- munlock_vma_page(page, vma, true);
- }
+ if (!pmd_migration)
+ page_remove_rmap(page, vma, true);
+ if (freeze)
+ put_page(page);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
-
- if (freeze) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- page_remove_rmap(page + i, vma, false);
- put_page(page + i);
- }
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first tail page is compound_mapcount */
+ /* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
page_tail->mapping = head->mapping;
* page->private should not be set in tail pages with the exception
* of swap cache pages that store the swp_entry_t in tail pages.
* Fix up and warn once if private is unexpectedly set.
+ *
+ * What of 32-bit systems, on which head[1].compound_pincount overlays
+ * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and
+ * compound_pincount must be 0 for folio_ref_freeze() to have succeeded.
*/
if (!folio_test_swapcache(page_folio(head))) {
VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
* split PMDs
*/
if (!can_split_folio(folio, &extra_pins)) {
- ret = -EBUSY;
+ ret = -EAGAIN;
goto out_unlock;
}
xas_unlock(&xas);
local_irq_enable();
remap_page(folio, folio_nr_pages(folio));
- ret = -EBUSY;
+ ret = -EAGAIN;
}
out_unlock:
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct page *fpage = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index,
+ FGP_ENTRY, 0);
nr_pages = 1;
- if (xa_is_value(fpage) || !fpage)
+ if (xa_is_value(folio) || !folio)
continue;
- if (!is_transparent_hugepage(fpage))
+ if (!folio_test_large(folio))
goto next;
total++;
- nr_pages = thp_nr_pages(fpage);
+ nr_pages = folio_nr_pages(folio);
- if (!trylock_page(fpage))
+ if (!folio_trylock(folio))
goto next;
- if (!split_huge_page(fpage))
+ if (!split_folio(folio))
split++;
- unlock_page(fpage);
+ folio_unlock(folio);
next:
- put_page(fpage);
+ folio_put(folio);
cond_resched();
}
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
- static bool hugetlb_cma_page(struct page *page, unsigned int order)
+ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
- return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
+ return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
1 << order);
}
#else
- static bool hugetlb_cma_page(struct page *page, unsigned int order)
+ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
return false;
}
return false;
}
- static void enqueue_huge_page(struct hstate *h, struct page *page)
+ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
lockdep_assert_held(&hugetlb_lock);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
- list_move(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&folio->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
- SetHPageFreed(page);
+ folio_set_hugetlb_freed(folio);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
nr_nodes--)
/* used to demote non-gigantic_huge pages as well */
- static void __destroy_compound_gigantic_page(struct page *page,
+ static void __destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
struct page *p;
- atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(folio_mapcount_ptr(folio), 0);
+ atomic_set(folio_subpages_mapcount_ptr(folio), 0);
+ atomic_set(folio_pincount_ptr(folio), 0);
for (i = 1; i < nr_pages; i++) {
- p = nth_page(page, i);
+ p = folio_page(folio, i);
p->mapping = NULL;
clear_compound_head(p);
if (!demote)
set_page_refcounted(p);
}
- set_compound_order(page, 0);
- #ifdef CONFIG_64BIT
- page[1].compound_nr = 0;
- #endif
- __ClearPageHead(page);
+ folio_set_compound_order(folio, 0);
+ __folio_clear_head(folio);
}
- static void destroy_compound_hugetlb_page_for_demote(struct page *page,
+ static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
unsigned int order)
{
- __destroy_compound_gigantic_page(page, order, true);
+ __destroy_compound_gigantic_folio(folio, order, true);
}
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
- static void destroy_compound_gigantic_page(struct page *page,
+ static void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order)
{
- __destroy_compound_gigantic_page(page, order, false);
+ __destroy_compound_gigantic_folio(folio, order, false);
}
- static void free_gigantic_page(struct page *page, unsigned int order)
+ static void free_gigantic_folio(struct folio *folio, unsigned int order)
{
/*
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
#ifdef CONFIG_CMA
- if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ int nid = folio_nid(folio);
+
+ if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
return;
#endif
- free_contig_range(page_to_pfn(page), 1 << order);
+ free_contig_range(folio_pfn(folio), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
- static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
+ struct page *page;
unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
#ifdef CONFIG_CMA
{
- struct page *page;
int node;
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
if (!(gfp_mask & __GFP_THISNODE)) {
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
}
}
#endif
- return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ return page ? page_folio(page) : NULL;
}
#else /* !CONFIG_CONTIG_ALLOC */
- static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
- static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
- static inline void free_gigantic_page(struct page *page, unsigned int order) { }
- static inline void destroy_compound_gigantic_page(struct page *page,
+ static inline void free_gigantic_folio(struct folio *folio,
+ unsigned int order) { }
+ static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
/*
- * Remove hugetlb page from lists, and update dtor so that page appears
+ * Remove hugetlb folio from lists, and update dtor so that the folio appears
* as just a compound page.
*
- * A reference is held on the page, except in the case of demote.
+ * A reference is held on the folio, except in the case of demote.
*
* Must be called with hugetlb lock held.
*/
- static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus,
bool demote)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- list_del(&page->lru);
+ list_del(&folio->lru);
- if (HPageFreed(page)) {
+ if (folio_test_hugetlb_freed(folio)) {
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
}
*
* For gigantic pages set the destructor to the null dtor. This
* destructor will never be called. Before freeing the gigantic
- * page destroy_compound_gigantic_page will turn the compound page
- * into a simple group of pages. After this the destructor does not
+ * page destroy_compound_gigantic_folio will turn the folio into a
+ * simple group of pages. After this the destructor does not
* apply.
*
* This handles the case where more than one ref is held when and
- * after update_and_free_page is called.
+ * after update_and_free_hugetlb_folio is called.
*
* In the case of demote we do not ref count the page as it will soon
* be turned into a page of smaller size.
*/
if (!demote)
- set_page_refcounted(page);
+ folio_ref_unfreeze(folio, 1);
if (hstate_is_gigantic(h))
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
else
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
}
- static void remove_hugetlb_page(struct hstate *h, struct page *page,
+ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- __remove_hugetlb_page(h, page, adjust_surplus, false);
+ __remove_hugetlb_folio(h, folio, adjust_surplus, false);
}
- static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+ static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- __remove_hugetlb_page(h, page, adjust_surplus, true);
+ __remove_hugetlb_folio(h, folio, adjust_surplus, true);
}
- static void add_hugetlb_page(struct hstate *h, struct page *page,
+ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
int zeroed;
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
lockdep_assert_held(&hugetlb_lock);
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&folio->lru);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
h->surplus_huge_pages_node[nid]++;
}
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_page_private(page, 0);
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ folio_change_private(folio, NULL);
/*
- * We have to set HPageVmemmapOptimized again as above
- * set_page_private(page, 0) cleared it.
+ * We have to set hugetlb_vmemmap_optimized again as above
+ * folio_change_private(folio, NULL) cleared it.
*/
- SetHPageVmemmapOptimized(page);
+ folio_set_hugetlb_vmemmap_optimized(folio);
/*
- * This page is about to be managed by the hugetlb allocator and
+ * This folio is about to be managed by the hugetlb allocator and
* should have no users. Drop our reference, and check for others
* just in case.
*/
- zeroed = put_page_testzero(page);
- if (!zeroed)
+ zeroed = folio_put_testzero(folio);
+ if (unlikely(!zeroed))
/*
* It is VERY unlikely soneone else has taken a ref on
* the page. In this case, we simply return as the
*/
return;
- arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ arch_clear_hugepage_flags(&folio->page);
+ enqueue_hugetlb_folio(h, folio);
}
static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct folio *folio = page_folio(page);
struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
* If we don't know which subpages are hwpoisoned, we can't free
* the hugepage, so it's leaked intentionally.
*/
- if (HPageRawHwpUnreliable(page))
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
if (hugetlb_vmemmap_restore(h, page)) {
* page and put the page back on the hugetlb free list and treat
* as a surplus page.
*/
- add_hugetlb_page(h, page, true);
+ add_hugetlb_folio(h, folio, true);
spin_unlock_irq(&hugetlb_lock);
return;
}
* Move PageHWPoison flag from head page to the raw error pages,
* which makes any healthy subpages reusable.
*/
- if (unlikely(PageHWPoison(page)))
- hugetlb_clear_page_hwpoison(page);
+ if (unlikely(folio_test_hwpoison(folio)))
+ hugetlb_clear_page_hwpoison(&folio->page);
for (i = 0; i < pages_per_huge_page(h); i++) {
- subpage = nth_page(page, i);
+ subpage = folio_page(folio, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
/*
* Non-gigantic pages demoted from CMA allocated gigantic pages
- * need to be given back to CMA in free_gigantic_page.
+ * need to be given back to CMA in free_gigantic_folio.
*/
if (hstate_is_gigantic(h) ||
- hugetlb_cma_page(page, huge_page_order(h))) {
- destroy_compound_gigantic_page(page, huge_page_order(h));
- free_gigantic_page(page, huge_page_order(h));
+ hugetlb_cma_folio(folio, huge_page_order(h))) {
+ destroy_compound_gigantic_folio(folio, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
} else {
__free_pages(page, huge_page_order(h));
}
}
/*
- * As update_and_free_page() can be called under any context, so we cannot
+ * As update_and_free_hugetlb_folio() can be called under any context, so we cannot
* use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
* actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
* the vmemmap pages.
/*
* The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
* is going to trigger because a previous call to
- * remove_hugetlb_page() will set_compound_page_dtor(page,
- * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ * remove_hugetlb_folio() will call folio_set_compound_dtor
+ * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
+ * directly.
*/
h = size_to_hstate(page_size(page));
flush_work(&free_hpage_work);
}
- static void update_and_free_page(struct hstate *h, struct page *page,
+ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
bool atomic)
{
- if (!HPageVmemmapOptimized(page) || !atomic) {
- __update_and_free_page(h, page);
+ if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
+ __update_and_free_page(h, &folio->page);
return;
}
* empty. Otherwise, schedule_work() had been called but the workfn
* hasn't retrieved the list yet.
*/
- if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
schedule_work(&free_hpage_work);
}
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
+ struct folio *folio;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page, false);
+ folio = page_folio(page);
+ update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
}
}
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- struct hugepage_subpool *spool = hugetlb_page_subpool(page);
+ struct folio *folio = page_folio(page);
+ struct hstate *h = folio_hstate(folio);
+ int nid = folio_nid(folio);
+ struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
bool restore_reserve;
unsigned long flags;
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
- hugetlb_set_page_subpool(page, NULL);
- if (PageAnon(page))
- __ClearPageAnonExclusive(page);
- page->mapping = NULL;
- restore_reserve = HPageRestoreReserve(page);
- ClearHPageRestoreReserve(page);
+ hugetlb_set_folio_subpool(folio, NULL);
+ if (folio_test_anon(folio))
+ __ClearPageAnonExclusive(&folio->page);
+ folio->mapping = NULL;
+ restore_reserve = folio_test_hugetlb_restore_reserve(folio);
+ folio_clear_hugetlb_restore_reserve(folio);
/*
* If HPageRestoreReserve was set on page, page allocation consumed a
}
spin_lock_irqsave(&hugetlb_lock, flags);
- ClearHPageMigratable(page);
- hugetlb_cgroup_uncharge_page(hstate_index(h),
- pages_per_huge_page(h), page);
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ folio_clear_hugetlb_migratable(folio);
+ hugetlb_cgroup_uncharge_folio(hstate_index(h),
+ pages_per_huge_page(h), folio);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
if (restore_reserve)
h->resv_huge_pages++;
- if (HPageTemporary(page)) {
- remove_hugetlb_page(h, page, false);
+ if (folio_test_hugetlb_temporary(folio)) {
+ remove_hugetlb_folio(h, folio, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page, true);
+ update_and_free_hugetlb_folio(h, folio, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
- remove_hugetlb_page(h, page, true);
+ remove_hugetlb_folio(h, folio, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page, true);
+ update_and_free_hugetlb_folio(h, folio, true);
} else {
arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
}
h->nr_huge_pages_node[nid]++;
}
- static void __prep_new_huge_page(struct hstate *h, struct page *page)
+ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- hugetlb_vmemmap_optimize(h, page);
- INIT_LIST_HEAD(&page->lru);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- hugetlb_set_page_subpool(page, NULL);
- set_hugetlb_cgroup(page, NULL);
- set_hugetlb_cgroup_rsvd(page, NULL);
+ hugetlb_vmemmap_optimize(h, &folio->page);
+ INIT_LIST_HEAD(&folio->lru);
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ hugetlb_set_folio_subpool(folio, NULL);
+ set_hugetlb_cgroup(folio, NULL);
+ set_hugetlb_cgroup_rsvd(folio, NULL);
}
- static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
- __prep_new_huge_page(h, page);
+ __prep_new_hugetlb_folio(h, folio);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
- static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
- bool demote)
+ static bool __prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order, bool demote)
{
int i, j;
int nr_pages = 1 << order;
struct page *p;
- /* we rely on prep_new_huge_page to set the destructor */
- set_compound_order(page, order);
- __ClearPageReserved(page);
- __SetPageHead(page);
+ __folio_clear_reserved(folio);
+ __folio_set_head(folio);
+ /* we rely on prep_new_hugetlb_folio to set the destructor */
+ folio_set_compound_order(folio, order);
for (i = 0; i < nr_pages; i++) {
- p = nth_page(page, i);
+ p = folio_page(folio, i);
/*
* For gigantic hugepages allocated through bootmem at
VM_BUG_ON_PAGE(page_count(p), p);
}
if (i != 0)
- set_compound_head(p, page);
+ set_compound_head(p, &folio->page);
}
- atomic_set(compound_mapcount_ptr(page), -1);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(folio_mapcount_ptr(folio), -1);
+ atomic_set(folio_subpages_mapcount_ptr(folio), 0);
+ atomic_set(folio_pincount_ptr(folio), 0);
return true;
out_error:
/* undo page modifications made above */
for (j = 0; j < i; j++) {
- p = nth_page(page, j);
+ p = folio_page(folio, j);
if (j != 0)
clear_compound_head(p);
set_page_refcounted(p);
}
/* need to clear PG_reserved on remaining tail pages */
for (; j < nr_pages; j++) {
- p = nth_page(page, j);
+ p = folio_page(folio, j);
__ClearPageReserved(p);
}
- set_compound_order(page, 0);
- #ifdef CONFIG_64BIT
- page[1].compound_nr = 0;
- #endif
- __ClearPageHead(page);
+ folio_set_compound_order(folio, 0);
+ __folio_clear_head(folio);
return false;
}
- static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+ static bool prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order)
{
- return __prep_compound_gigantic_page(page, order, false);
+ return __prep_compound_gigantic_folio(folio, order, false);
}
- static bool prep_compound_gigantic_page_for_demote(struct page *page,
+ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
unsigned int order)
{
- return __prep_compound_gigantic_page(page, order, true);
+ return __prep_compound_gigantic_folio(folio, order, true);
}
/*
return (index << compound_order(page_head)) + compound_idx;
}
- static struct page *alloc_buddy_huge_page(struct hstate *h,
+ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
page = NULL;
}
- if (page)
- __count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
* indicates an overall state change. Clear bit so that we resume
if (node_alloc_noretry && !page && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
- return page;
+ if (!page) {
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ return NULL;
+ }
+
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ return page_folio(page);
}
/*
* Note that returned page is 'frozen': ref count of head page and all tail
* pages is zero.
*/
- static struct page *alloc_fresh_huge_page(struct hstate *h,
+ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
bool retry = false;
retry:
if (hstate_is_gigantic(h))
- page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
else
- page = alloc_buddy_huge_page(h, gfp_mask,
+ folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
nid, nmask, node_alloc_noretry);
- if (!page)
+ if (!folio)
return NULL;
-
if (hstate_is_gigantic(h)) {
- if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
/*
* Rare failure to convert pages to compound page.
* Free pages and try again - ONCE!
*/
- free_gigantic_page(page, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
if (!retry) {
retry = true;
goto retry;
return NULL;
}
}
- prep_new_huge_page(h, page, page_to_nid(page));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- return page;
+ return folio;
}
/*
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
- node_alloc_noretry);
- if (page)
- break;
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
+ nodes_allowed, node_alloc_noretry);
+ if (folio) {
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ return 1;
+ }
}
- if (!page)
- return 0;
-
- free_huge_page(page); /* free it into the hugepage allocator */
-
- return 1;
+ return 0;
}
/*
{
int nr_nodes, node;
struct page *page = NULL;
+ struct folio *folio;
lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
!list_empty(&h->hugepage_freelists[node])) {
page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
- remove_hugetlb_page(h, page, acct_surplus);
+ folio = page_folio(page);
+ remove_hugetlb_folio(h, folio, acct_surplus);
break;
}
}
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+ struct folio *folio = page_folio(page);
retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
- if (!PageHuge(page))
+ if (!folio_test_hugetlb(folio))
return 0;
spin_lock_irq(&hugetlb_lock);
- if (!PageHuge(page)) {
+ if (!folio_test_hugetlb(folio)) {
rc = 0;
goto out;
}
- if (!page_count(page)) {
- struct page *head = compound_head(page);
- struct hstate *h = page_hstate(head);
+ if (!folio_ref_count(folio)) {
+ struct hstate *h = folio_hstate(folio);
if (!available_huge_pages(h))
goto out;
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
- if (unlikely(!HPageFreed(head))) {
+ if (unlikely(!folio_test_hugetlb_freed(folio))) {
spin_unlock_irq(&hugetlb_lock);
cond_resched();
goto retry;
}
- remove_hugetlb_page(h, head, false);
+ remove_hugetlb_folio(h, folio, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
/*
- * Normally update_and_free_page will allocate required vmemmmap
- * before freeing the page. update_and_free_page will fail to
+ * Normally update_and_free_hugtlb_folio will allocate required vmemmmap
+ * before freeing the page. update_and_free_hugtlb_folio will fail to
* free the page if it can not allocate required vmemmap. We
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
*/
- rc = hugetlb_vmemmap_restore(h, head);
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
if (!rc) {
- update_and_free_page(h, head, false);
+ update_and_free_hugetlb_folio(h, folio, false);
} else {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_page(h, head, false);
+ add_hugetlb_folio(h, folio, false);
h->max_huge_pages++;
spin_unlock_irq(&hugetlb_lock);
}
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
if (hstate_is_gigantic(h))
return NULL;
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
spin_lock_irq(&hugetlb_lock);
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- SetHPageTemporary(page);
+ folio_set_hugetlb_temporary(folio);
spin_unlock_irq(&hugetlb_lock);
- free_huge_page(page);
+ free_huge_page(&folio->page);
return NULL;
}
h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
+ h->surplus_huge_pages_node[folio_nid(folio)]++;
out_unlock:
spin_unlock_irq(&hugetlb_lock);
- return page;
+ return &folio->page;
}
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
+ struct folio *folio;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
/* fresh huge pages are frozen */
- set_page_refcounted(page);
-
+ folio_ref_unfreeze(folio, 1);
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- SetHPageTemporary(page);
+ folio_set_hugetlb_temporary(folio);
- return page;
+ return &folio->page;
}
/*
if ((--needed) < 0)
break;
/* Add the page to the hugetlb allocator */
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, page_folio(page));
}
free:
spin_unlock_irq(&hugetlb_lock);
}
/*
- * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
+ * the old one
* @h: struct hstate old page belongs to
- * @old_page: Old page to dissolve
+ * @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
- static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
- struct list_head *list)
+ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
+ struct folio *old_folio, struct list_head *list)
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- int nid = page_to_nid(old_page);
- struct page *new_page;
+ int nid = folio_nid(old_folio);
+ struct folio *new_folio;
int ret = 0;
/*
- * Before dissolving the page, we need to allocate a new one for the
- * pool to remain stable. Here, we allocate the page and 'prep' it
+ * Before dissolving the folio, we need to allocate a new one for the
+ * pool to remain stable. Here, we allocate the folio and 'prep' it
* by doing everything but actually updating counters and adding to
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
- new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
- if (!new_page)
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
+ if (!new_folio)
return -ENOMEM;
- __prep_new_huge_page(h, new_page);
+ __prep_new_hugetlb_folio(h, new_folio);
retry:
spin_lock_irq(&hugetlb_lock);
- if (!PageHuge(old_page)) {
+ if (!folio_test_hugetlb(old_folio)) {
/*
- * Freed from under us. Drop new_page too.
+ * Freed from under us. Drop new_folio too.
*/
goto free_new;
- } else if (page_count(old_page)) {
+ } else if (folio_ref_count(old_folio)) {
/*
- * Someone has grabbed the page, try to isolate it here.
+ * Someone has grabbed the folio, try to isolate it here.
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- ret = isolate_hugetlb(old_page, list);
+ ret = isolate_hugetlb(&old_folio->page, list);
spin_lock_irq(&hugetlb_lock);
goto free_new;
- } else if (!HPageFreed(old_page)) {
+ } else if (!folio_test_hugetlb_freed(old_folio)) {
/*
- * Page's refcount is 0 but it has not been enqueued in the
+ * Folio's refcount is 0 but it has not been enqueued in the
* freelist yet. Race window is small, so we can succeed here if
* we retry.
*/
goto retry;
} else {
/*
- * Ok, old_page is still a genuine free hugepage. Remove it from
+ * Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
* incremented again when calling __prep_account_new_huge_page()
- * and enqueue_huge_page() for new_page. The counters will remain
- * stable since this happens under the lock.
+ * and enqueue_hugetlb_folio() for new_folio. The counters will
+ * remain stable since this happens under the lock.
*/
- remove_hugetlb_page(h, old_page, false);
+ remove_hugetlb_folio(h, old_folio, false);
/*
- * Ref count on new page is already zero as it was dropped
+ * Ref count on new_folio is already zero as it was dropped
* earlier. It can be directly added to the pool free list.
*/
__prep_account_new_huge_page(h, nid);
- enqueue_huge_page(h, new_page);
+ enqueue_hugetlb_folio(h, new_folio);
/*
- * Pages have been replaced, we can safely free the old one.
+ * Folio has been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page, false);
+ update_and_free_hugetlb_folio(h, old_folio, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
- /* Page has a zero ref count, but needs a ref to be freed */
- set_page_refcounted(new_page);
- update_and_free_page(h, new_page, false);
+ /* Folio has a zero ref count, but needs a ref to be freed */
+ folio_ref_unfreeze(new_folio, 1);
+ update_and_free_hugetlb_folio(h, new_folio, false);
return ret;
}
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
{
struct hstate *h;
- struct page *head;
+ struct folio *folio = page_folio(page);
int ret = -EBUSY;
/*
* Return success when racing as if we dissolved the page ourselves.
*/
spin_lock_irq(&hugetlb_lock);
- if (PageHuge(page)) {
- head = compound_head(page);
- h = page_hstate(head);
+ if (folio_test_hugetlb(folio)) {
+ h = folio_hstate(folio);
} else {
spin_unlock_irq(&hugetlb_lock);
return 0;
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (page_count(head) && !isolate_hugetlb(head, list))
+ if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
ret = 0;
- else if (!page_count(head))
- ret = alloc_and_dissolve_huge_page(h, head, list);
+ else if (!folio_ref_count(folio))
+ ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
return ret;
}
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
+ struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
set_page_refcounted(page);
/* Fall through */
}
+ folio = page_folio(page);
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
if (deferred_reserve)
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
}
return page;
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
+ struct folio *folio = page_folio(page);
struct hstate *h = m->hstate;
VM_BUG_ON(!hstate_is_gigantic(h));
- WARN_ON(page_count(page) != 1);
- if (prep_compound_gigantic_page(page, huge_page_order(h))) {
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
+ WARN_ON(folio_ref_count(folio) != 1);
+ if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ WARN_ON(folio_test_reserved(folio));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
free_huge_page(page); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
- free_gigantic_page(page, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
}
/*
if (!alloc_bootmem_huge_page(h, nid))
break;
} else {
- struct page *page;
+ struct folio *folio;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- page = alloc_fresh_huge_page(h, gfp_mask, nid,
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
&node_states[N_MEMORY], NULL);
- if (!page)
+ if (!folio)
break;
- free_huge_page(page); /* free it into the hugepage allocator */
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
}
cond_resched();
}
goto out;
if (PageHighMem(page))
continue;
- remove_hugetlb_page(h, page, false);
+ remove_hugetlb_folio(h, page_folio(page), false);
list_add(&page->lru, &page_list);
}
}
{
int i, nid = page_to_nid(page);
struct hstate *target_hstate;
+ struct folio *folio = page_folio(page);
struct page *subpage;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
- remove_hugetlb_page_for_demote(h, page, false);
+ remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
rc = hugetlb_vmemmap_restore(h, page);
/* Allocation of vmemmmap failed, we can not demote page */
spin_lock_irq(&hugetlb_lock);
set_page_refcounted(page);
- add_hugetlb_page(h, page, false);
+ add_hugetlb_folio(h, page_folio(page), false);
return rc;
}
/*
- * Use destroy_compound_hugetlb_page_for_demote for all huge page
+ * Use destroy_compound_hugetlb_folio_for_demote for all huge page
* sizes as it will not ref count pages.
*/
- destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
+ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
/*
* Taking target hstate mutex synchronizes with set_max_huge_pages.
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
subpage = nth_page(page, i);
+ folio = page_folio(subpage);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_page_for_demote(subpage,
+ prep_compound_gigantic_folio_for_demote(folio,
target_hstate->order);
else
prep_compound_page(subpage, target_hstate->order);
set_page_private(subpage, 0);
- prep_new_huge_page(target_hstate, subpage, nid);
+ prep_new_hugetlb_folio(target_hstate, folio, nid);
free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
hugepage_add_new_anon_rmap(new_page, vma, addr);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
- ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool force_flush = false;
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
- /*
- * If sharing possible, alert mmu notifiers of worst case.
- */
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
- end);
- adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
- mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
- #ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
- #endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
- #ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
- #endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ /* mmu notification performed in caller */
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
+
__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+
+ mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
- VM_BUG_ON(unshare && (flags & FOLL_WRITE));
- VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
-
/*
* hugetlb does not support FOLL_FORCE-style write faults that keep the
* PTE mapped R/O such as maybe_mkwrite() would do.
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
- if (unlikely(unshare))
- return 0;
set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearHPageRestoreReserve(new_page);
-
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
- if (anon_rmap) {
- ClearHPageRestoreReserve(page);
+ if (anon_rmap)
hugepage_add_new_anon_rmap(page, vma, haddr);
- } else
+ else
page_dup_file_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (page_in_pagecache) {
+ if (page_in_pagecache)
page_dup_file_rmap(page, true);
- } else {
- ClearHPageRestoreReserve(page);
+ else
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
- }
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
}
}
- static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+ static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
+ unsigned int flags, pte_t *pte,
bool *unshare)
{
pte_t pteval = huge_ptep_get(pte);
return false;
if (flags & FOLL_WRITE)
return true;
- if (gup_must_unshare(flags, pte_page(pteval))) {
+ if (gup_must_unshare(vma, flags, pte_page(pteval))) {
*unshare = true;
return true;
}
return false;
}
- * try_grab_page() should always succeed here, because we hold
- * the ptl lock and have verified pte_present().
+ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+ {
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & huge_page_mask(h);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte, entry;
+
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
+ return NULL;
+
+ retry:
+ pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (!pte)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, pte);
+ entry = huge_ptep_get(pte);
+ if (pte_present(entry)) {
+ page = pte_page(entry) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ /*
+ * Note that page may be a sub-page, and with vmemmap
+ * optimizations the page struct may be read only.
+ * try_grab_page() will increase the ref count on the
+ * head page, so this will be OK.
+ *
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
++ * try_grab_page() should always be able to get the page here,
++ * because we hold the ptl lock and have verified pte_present().
+ */
++ if (try_grab_page(page, flags)) {
+ page = NULL;
+ goto out;
+ }
+ } else {
+ if (is_hugetlb_entry_migration(entry)) {
+ spin_unlock(ptl);
+ __migration_entry_wait_huge(pte, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+ out:
+ spin_unlock(ptl);
+ return page;
+ }
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
* directly from any kind of swap entries.
*/
if (absent ||
- __follow_hugetlb_must_fault(flags, pte, &unshare)) {
+ __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;
* tables. If the huge page is present, then the tail
* pages must also be present. The ptl prevents the
* head page and tail pages from being rearranged in
- * any way. So this page must be available at this
- * point, unless the page refcount overflowed:
+ * any way. As this is hugetlb, the pages will never
+ * be p2pdma or not longterm pinable. So this page
+ * must be available at this point, unless the page
+ * refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
* These functions are overwritable if your architecture needs its own
* behavior.
*/
- struct page * __weak
- follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write)
- {
- return ERR_PTR(-EINVAL);
- }
-
- struct page * __weak
- follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags, int pdshift)
- {
- WARN(1, "hugepd follow called with no support for hugepage directory format\n");
- return NULL;
- }
-
- struct page * __weak
- follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
- {
- struct hstate *h = hstate_vma(vma);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t *ptep, pte;
-
- /*
- * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
- * follow_hugetlb_page().
- */
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
- retry:
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
- if (!ptep)
- return NULL;
-
- ptl = huge_pte_lock(h, mm, ptep);
- pte = huge_ptep_get(ptep);
- if (pte_present(pte)) {
- page = pte_page(pte) +
- ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
- /*
- * try_grab_page() should always be able to get the page here,
- * because: a) we hold the pmd (ptl) lock, and b) we've just
- * checked that the huge pmd (head) page is present in the
- * page tables. The ptl prevents the head page and tail pages
- * from being rearranged in any way. So this page must be
- * available at this point, unless the page refcount
- * overflowed:
- */
- if (try_grab_page(page, flags)) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(ptep, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
- out:
- spin_unlock(ptl);
- return page;
- }
-
- struct page * __weak
- follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
- {
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t pte;
-
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
- retry:
- ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
- if (!pud_huge(*pud))
- goto out;
- pte = huge_ptep_get((pte_t *)pud);
- if (pte_present(pte)) {
- page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
- if (try_grab_page(page, flags)) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pud, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
- out:
- spin_unlock(ptl);
- return page;
- }
-
- struct page * __weak
- follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
- {
- if (flags & (FOLL_GET | FOLL_PIN))
- return NULL;
-
- return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
- }
-
int isolate_hugetlb(struct page *page, struct list_head *list)
{
int ret = 0;
return ret;
}
- int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+ int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
{
int ret = 0;
*hugetlb = true;
if (HPageFreed(page))
ret = 0;
- else if (HPageMigratable(page))
+ else if (HPageMigratable(page) || unpoison)
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
return ret;
}
- int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
int ret;
spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
spin_unlock_irq(&hugetlb_lock);
return ret;
}
put_page(page);
}
- void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
- struct hstate *h = page_hstate(oldpage);
+ struct hstate *h = folio_hstate(old_folio);
- hugetlb_cgroup_migrate(oldpage, newpage);
- set_page_owner_migrate_reason(newpage, reason);
+ hugetlb_cgroup_migrate(old_folio, new_folio);
+ set_page_owner_migrate_reason(&new_folio->page, reason);
/*
- * transfer temporary state of the new huge page. This is
+ * transfer temporary state of the new hugetlb folio. This is
* reverse to other transitions because the newpage is going to
* be final while the old one will be freed so it takes over
* the temporary status.
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
- if (HPageTemporary(newpage)) {
- int old_nid = page_to_nid(oldpage);
- int new_nid = page_to_nid(newpage);
+ if (folio_test_hugetlb_temporary(new_folio)) {
+ int old_nid = folio_nid(old_folio);
+ int new_nid = folio_nid(new_folio);
+
+ folio_set_hugetlb_temporary(old_folio);
+ folio_clear_hugetlb_temporary(new_folio);
- SetHPageTemporary(oldpage);
- ClearHPageTemporary(newpage);
/*
* There is no need to transfer the per-node surplus state
*/
+ #define pr_fmt(fmt) "kasan_test: " fmt
+
+ #include <kunit/test.h>
#include <linux/bitops.h>
#include <linux/delay.h>
+ #include <linux/io.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/random.h>
+ #include <linux/set_memory.h>
#include <linux/slab.h>
#include <linux/string.h>
+ #include <linux/tracepoint.h>
#include <linux/uaccess.h>
- #include <linux/io.h>
#include <linux/vmalloc.h>
- #include <linux/set_memory.h>
+ #include <trace/events/printk.h>
#include <asm/page.h>
- #include <kunit/test.h>
-
#include "kasan.h"
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+ static bool multishot;
+
+ /* Fields set based on lines observed in the console. */
+ static struct {
+ bool report_found;
+ bool async_fault;
+ } test_status;
+
/*
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
void *kasan_ptr_result;
int kasan_int_result;
- static struct kunit_resource resource;
- static struct kunit_kasan_status test_status;
- static bool multishot;
+ /* Probe for console output: obtains test_status lines of interest. */
+ static void probe_console(void *ignore, const char *buf, size_t len)
+ {
+ if (strnstr(buf, "BUG: KASAN: ", len))
+ WRITE_ONCE(test_status.report_found, true);
+ else if (strnstr(buf, "Asynchronous fault: ", len))
+ WRITE_ONCE(test_status.async_fault, true);
+ }
- /*
- * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the
- * first detected bug and panic the kernel if panic_on_warn is enabled. For
- * hardware tag-based KASAN also allow tag checking to be reenabled for each
- * test, see the comment for KUNIT_EXPECT_KASAN_FAIL().
- */
- static int kasan_test_init(struct kunit *test)
+ static void register_tracepoints(struct tracepoint *tp, void *ignore)
+ {
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+ }
+
+ static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+ {
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+ }
+
+ static int kasan_suite_init(struct kunit_suite *suite)
{
if (!kasan_enabled()) {
- kunit_err(test, "can't run KASAN tests with KASAN disabled");
+ pr_err("Can't run KASAN tests with KASAN disabled");
return -1;
}
+ /* Stop failing KUnit tests on KASAN reports. */
+ kasan_kunit_test_suite_start();
+
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
multishot = kasan_save_enable_multi_shot();
- test_status.report_found = false;
- test_status.sync_fault = false;
- kunit_add_named_resource(test, NULL, NULL, &resource,
- "kasan_status", &test_status);
+
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
return 0;
}
- static void kasan_test_exit(struct kunit *test)
+ static void kasan_suite_exit(struct kunit_suite *suite)
{
+ kasan_kunit_test_suite_end();
kasan_restore_multi_shot(multishot);
- KUNIT_EXPECT_FALSE(test, test_status.report_found);
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+ }
+
+ static void kasan_test_exit(struct kunit *test)
+ {
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found));
}
/**
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) { \
if (READ_ONCE(test_status.report_found) && \
- READ_ONCE(test_status.sync_fault)) \
+ !READ_ONCE(test_status.async_fault)) \
kasan_enable_tagging(); \
migrate_enable(); \
} \
WRITE_ONCE(test_status.report_found, false); \
+ WRITE_ONCE(test_status.async_fault, false); \
} while (0)
#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
}
+ /*
+ * The two tests below check that Generic KASAN prints auxiliary stack traces
+ * for RCU callbacks and workqueues. The reports need to be inspected manually.
+ *
+ * These tests are still enabled for other KASAN modes to make sure that all
+ * modes report bad accesses in tested scenarios.
+ */
+
+ static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+ } *global_rcu_ptr;
+
+ static void rcu_uaf_reclaim(struct rcu_head *rp)
+ {
+ struct kasan_rcu_info *fp =
+ container_of(rp, struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ ((volatile struct kasan_rcu_info *)fp)->i;
+ }
+
+ static void rcu_uaf(struct kunit *test)
+ {
+ struct kasan_rcu_info *ptr;
+
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ global_rcu_ptr = rcu_dereference_protected(
+ (struct kasan_rcu_info __rcu *)ptr, NULL);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
+ }
+
+ static void workqueue_uaf_work(struct work_struct *work)
+ {
+ kfree(work);
+ }
+
+ static void workqueue_uaf(struct kunit *test)
+ {
+ struct workqueue_struct *workqueue;
+ struct work_struct *work;
+
+ workqueue = create_workqueue("kasan_workqueue_test");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue);
+
+ work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work);
+
+ INIT_WORK(work, workqueue_uaf_work);
+ queue_work(workqueue, work);
+ destroy_workqueue(workqueue);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ((volatile struct work_struct *)work)->data);
+ }
+
static void vmalloc_helpers_tags(struct kunit *test)
{
void *ptr;
KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
for (i = 0; i < 256; i++) {
- size = prandom_u32_max(1024) + 1;
+ size = get_random_u32_inclusive(1, 1024);
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
}
for (i = 0; i < 256; i++) {
- order = prandom_u32_max(4) + 1;
+ order = get_random_u32_inclusive(1, 4);
pages = alloc_pages(GFP_KERNEL, order);
ptr = page_address(pages);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
return;
for (i = 0; i < 256; i++) {
- size = prandom_u32_max(1024) + 1;
+ size = get_random_u32_inclusive(1, 1024);
ptr = vmalloc(size);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(rcu_uaf),
+ KUNIT_CASE(workqueue_uaf),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
static struct kunit_suite kasan_kunit_test_suite = {
.name = "kasan",
- .init = kasan_test_init,
.test_cases = kasan_kunit_test_cases,
.exit = kasan_test_exit,
+ .suite_init = kasan_suite_init,
+ .suite_exit = kasan_suite_exit,
};
kunit_test_suite(kasan_kunit_test_suite);
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
- #include <linux/sched/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
unsigned long flags;
struct slab *slab;
void *addr;
- const bool random_right_allocate = prandom_u32_max(2);
+ const bool random_right_allocate = get_random_u32_below(2);
const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
- !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS);
+ !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS);
/* Try to obtain a free object. */
raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
/* Enable static key, and await allocation to happen. */
static_branch_enable(&kfence_allocation_key);
- if (sysctl_hung_task_timeout_secs) {
- /*
- * During low activity with no allocations we might wait a
- * while; let's avoid the hung task warning.
- */
- wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
- sysctl_hung_task_timeout_secs * HZ / 2);
- } else {
- wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
- }
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
{
mmap_assert_locked(vma->vm_mm);
- if (vma->vm_file)
- return NULL;
-
return vma->anon_name;
}
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
- if (!vma->vm_file) {
+ if (!vma->vm_file || vma_is_anon_shmem(vma)) {
error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
put_page(page);
}
swap_read_unplug(splug);
+ cond_resched();
return 0;
}
return 0;
}
+ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+ {
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(&init_user_ns,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
+ }
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
spinlock_t *ptl;
struct page *page = NULL;
LIST_HEAD(page_list);
+ bool pageout_anon_only_filter;
if (fatal_signal_pending(current))
return -EINTR;
+ pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
+ !can_do_file_pageout(vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
if (page_mapcount(page) != 1)
goto huge_unlock;
+ if (pageout_anon_only_filter && !PageAnon(page))
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;
if (PageTransCompound(page)) {
if (page_mapcount(page) != 1)
break;
+ if (pageout_anon_only_filter && !PageAnon(page))
+ break;
get_page(page);
if (!trylock_page(page)) {
put_page(page);
if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
+ if (pageout_anon_only_filter && !PageAnon(page))
+ continue;
+
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
tlb_end_vma(tlb, vma);
}
- static inline bool can_do_pageout(struct vm_area_struct *vma)
- {
- if (vma_is_anonymous(vma))
- return true;
- if (!vma->vm_file)
- return false;
- /*
- * paging out pagecache only for non-anonymous mappings that correspond
- * to the files the calling process could (if tried) open for writing;
- * otherwise we'd be including shared non-exclusive mappings, which
- * opens a side channel.
- */
- return inode_owner_or_capable(&init_user_ns,
- file_inode(vma->vm_file)) ||
- file_permission(vma->vm_file, MAY_WRITE) == 0;
- }
-
static long madvise_pageout(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
if (!can_madv_lru_vma(vma))
return -EINVAL;
- if (!can_do_pageout(vma))
+ /*
+ * If the VMA belongs to a private file mapping, there can be private
+ * dirty pages which can be paged out if even this process is neither
+ * owner nor write capable of the file. We allow private file mappings
+ * further to pageout dirty anon pages.
+ */
+ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
+ (vma->vm_flags & VM_MAYSHARE)))
return 0;
lru_add_drain();
int error;
/* Only anonymous mappings can be named */
- if (vma->vm_file)
+ if (vma->vm_file && !vma_is_anon_shmem(vma))
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
goto out;
}
- ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret < 0)
goto out;
if (unlikely(!get_page_unless_zero(page)))
goto out;
+ if (unlikely(PageSlab(page)))
+ goto out_putpage;
+ /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
+ smp_rmb();
/*
- * Check PageMovable before holding a PG_lock because page's owner
- * assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grabbing the lock ruins page's owner side.
+ * Check movable flag before taking the page lock because
+ * we use non-atomic bitops on newly allocated page flags so
+ * unconditionally grabbing the lock ruins page's owner side.
*/
if (unlikely(!__PageMovable(page)))
goto out_putpage;
+ /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
+ smp_rmb();
+ if (unlikely(PageSlab(page)))
+ goto out_putpage;
+
/*
* As movable pages are not isolated from LRU lists, concurrent
* compaction threads can race against page migration functions
{
return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
+EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
#endif
int filemap_migrate_folio(struct address_space *mapping,
}
/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
+ * Obtain the lock on folio, remove all ptes and migrate the folio
+ * to the newly allocated folio in dst.
*/
static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
- unsigned long private, struct page *page,
+ unsigned long private, struct folio *src,
int force, enum migrate_mode mode,
enum migrate_reason reason,
struct list_head *ret)
{
- struct folio *dst, *src = page_folio(page);
+ struct folio *dst;
int rc = MIGRATEPAGE_SUCCESS;
struct page *newpage = NULL;
- if (!thp_migration_supported() && PageTransHuge(page))
+ if (!thp_migration_supported() && folio_test_transhuge(src))
return -ENOSYS;
- if (page_count(page) == 1) {
- /* Page was freed from under us. So we are done. */
- ClearPageActive(page);
- ClearPageUnevictable(page);
+ if (folio_ref_count(src) == 1) {
+ /* Folio was freed from under us. So we are done. */
+ folio_clear_active(src);
+ folio_clear_unevictable(src);
/* free_pages_prepare() will clear PG_isolated. */
goto out;
}
- newpage = get_new_page(page, private);
+ newpage = get_new_page(&src->page, private);
if (!newpage)
return -ENOMEM;
dst = page_folio(newpage);
- newpage->private = 0;
+ dst->private = NULL;
rc = __unmap_and_move(src, dst, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
- set_page_owner_migrate_reason(newpage, reason);
+ set_page_owner_migrate_reason(&dst->page, reason);
out:
if (rc != -EAGAIN) {
/*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
+ * A folio that has been migrated has all references
+ * removed and will be freed. A folio that has not been
* migrated will have kept its references and be restored.
*/
- list_del(&page->lru);
+ list_del(&src->lru);
}
/*
* If migration is successful, releases reference grabbed during
- * isolation. Otherwise, restore the page to right list unless
+ * isolation. Otherwise, restore the folio to right list unless
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
/*
- * Compaction can migrate also non-LRU pages which are
+ * Compaction can migrate also non-LRU folios which are
* not accounted to NR_ISOLATED_*. They can be recognized
- * as __PageMovable
+ * as __folio_test_movable
*/
- if (likely(!__PageMovable(page)))
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -thp_nr_pages(page));
+ if (likely(!__folio_test_movable(src)))
+ mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+ folio_is_file_lru(src), -folio_nr_pages(src));
if (reason != MR_MEMORY_FAILURE)
/*
- * We release the page in page_handle_poison.
+ * We release the folio in page_handle_poison.
*/
- put_page(page);
+ folio_put(src);
} else {
if (rc != -EAGAIN)
- list_add_tail(&page->lru, ret);
+ list_add_tail(&src->lru, ret);
if (put_new_page)
- put_new_page(newpage, private);
+ put_new_page(&dst->page, private);
else
- put_page(newpage);
+ folio_put(dst);
}
return rc;
* folio_mapping() set, hugetlbfs specific move page routine will not
* be called and we could leak usage counts for subpools.
*/
- if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) {
+ if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
rc = -EBUSY;
goto out_unlock;
}
put_anon_vma(anon_vma);
if (rc == MIGRATEPAGE_SUCCESS) {
- move_hugetlb_state(hpage, new_hpage, reason);
+ move_hugetlb_state(src, dst, reason);
put_new_page = NULL;
}
return rc;
}
- static inline int try_split_thp(struct page *page, struct list_head *split_pages)
+ static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
{
int rc;
- lock_page(page);
- rc = split_huge_page_to_list(page, split_pages);
- unlock_page(page);
+ folio_lock(folio);
+ rc = split_folio_to_list(folio, split_folios);
+ folio_unlock(folio);
if (!rc)
- list_move_tail(&page->lru, split_pages);
+ list_move_tail(&folio->lru, split_folios);
return rc;
}
/*
- * migrate_pages - migrate the pages specified in a list, to the free pages
+ * migrate_pages - migrate the folios specified in a list, to the free folios
* supplied as the target for the page migration
*
- * @from: The list of pages to be migrated.
- * @get_new_page: The function used to allocate free pages to be used
- * as the target of the page migration.
- * @put_new_page: The function used to free target pages if migration
+ * @from: The list of folios to be migrated.
+ * @get_new_page: The function used to allocate free folios to be used
+ * as the target of the folio migration.
+ * @put_new_page: The function used to free target folios if migration
* fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
- * page migration, if any.
- * @reason: The reason for page migration.
- * @ret_succeeded: Set to the number of normal pages migrated successfully if
+ * folio migration, if any.
+ * @reason: The reason for folio migration.
+ * @ret_succeeded: Set to the number of folios migrated successfully if
* the caller passes a non-NULL pointer.
*
- * The function returns after 10 attempts or if no pages are movable any more
- * because the list has become empty or no retryable pages exist any more.
- * It is caller's responsibility to call putback_movable_pages() to return pages
+ * The function returns after 10 attempts or if no folios are movable any more
+ * because the list has become empty or no retryable folios exist any more.
+ * It is caller's responsibility to call putback_movable_pages() to return folios
* to the LRU or free list only if ret != 0.
*
- * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
- * an error code. The number of THP splits will be considered as the number of
- * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
+ * Returns the number of {normal folio, large folio, hugetlb} that were not
+ * migrated, or an error code. The number of large folio splits will be
+ * considered as the number of non-migrated large folio, no matter how many
+ * split folios of the large folio are migrated successfully.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
int retry = 1;
+ int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
int nr_failed_pages = 0;
int nr_retry_pages = 0;
int nr_succeeded = 0;
int nr_thp_succeeded = 0;
+ int nr_large_failed = 0;
int nr_thp_failed = 0;
int nr_thp_split = 0;
int pass = 0;
+ bool is_large = false;
bool is_thp = false;
- struct page *page;
- struct page *page2;
- int rc, nr_subpages;
- LIST_HEAD(ret_pages);
- LIST_HEAD(thp_split_pages);
+ struct folio *folio, *folio2;
+ int rc, nr_pages;
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(split_folios);
bool nosplit = (reason == MR_NUMA_MISPLACED);
- bool no_subpage_counting = false;
+ bool no_split_folio_counting = false;
trace_mm_migrate_pages_start(mode, reason);
- thp_subpage_migration:
- for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
+ split_folio_migration:
+ for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
retry = 0;
+ large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
- list_for_each_entry_safe(page, page2, from, lru) {
+ list_for_each_entry_safe(folio, folio2, from, lru) {
/*
- * THP statistics is based on the source huge page.
- * Capture required information that might get lost
- * during migration.
+ * Large folio statistics is based on the source large
+ * folio. Capture required information that might get
+ * lost during migration.
*/
- is_thp = PageTransHuge(page) && !PageHuge(page);
- nr_subpages = compound_nr(page);
+ is_large = folio_test_large(folio) && !folio_test_hugetlb(folio);
+ is_thp = is_large && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
cond_resched();
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private, page,
- pass > 2, mode, reason,
- &ret_pages);
+ put_new_page, private,
+ &folio->page, pass > 2, mode,
+ reason,
+ &ret_folios);
else
rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode,
- reason, &ret_pages);
+ private, folio, pass > 2, mode,
+ reason, &ret_folios);
/*
* The rules are:
- * Success: non hugetlb page will be freed, hugetlb
- * page will be put back
+ * Success: non hugetlb folio will be freed, hugetlb
+ * folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* -ENOSYS: stay on the from list
- * Other errno: put on ret_pages list then splice to
+ * Other errno: put on ret_folios list then splice to
* from list
*/
switch(rc) {
/*
- * THP migration might be unsupported or the
- * allocation could've failed so we should
- * retry on the same page with the THP split
- * to base pages.
+ * Large folio migration might be unsupported or
+ * the allocation could've failed so we should retry
+ * on the same folio with the large folio split
+ * to normal folios.
*
- * Sub-pages are put in thp_split_pages, and
+ * Split folios are put in split_folios, and
* we will migrate them after the rest of the
* list is processed.
*/
case -ENOSYS:
- /* THP migration is unsupported */
- if (is_thp) {
- nr_thp_failed++;
- if (!try_split_thp(page, &thp_split_pages)) {
- nr_thp_split++;
+ /* Large folio migration is unsupported */
+ if (is_large) {
+ nr_large_failed++;
+ nr_thp_failed += is_thp;
+ if (!try_split_folio(folio, &split_folios)) {
+ nr_thp_split += is_thp;
break;
}
/* Hugetlb migration is unsupported */
- } else if (!no_subpage_counting) {
+ } else if (!no_split_folio_counting) {
nr_failed++;
}
- nr_failed_pages += nr_subpages;
- list_move_tail(&page->lru, &ret_pages);
+ nr_failed_pages += nr_pages;
+ list_move_tail(&folio->lru, &ret_folios);
break;
case -ENOMEM:
/*
* When memory is low, don't bother to try to migrate
- * other pages, just exit.
+ * other folios, just exit.
*/
- if (is_thp) {
- nr_thp_failed++;
- /* THP NUMA faulting doesn't split THP to retry. */
- if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
- nr_thp_split++;
- break;
+ if (is_large) {
+ nr_large_failed++;
+ nr_thp_failed += is_thp;
+ /* Large folio NUMA faulting doesn't split to retry. */
+ if (!nosplit) {
+ int ret = try_split_folio(folio, &split_folios);
+
+ if (!ret) {
+ nr_thp_split += is_thp;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split large folio to
+ * mitigate the failure of longterm pinning.
+ */
+ large_retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ break;
+ }
}
- } else if (!no_subpage_counting) {
+ } else if (!no_split_folio_counting) {
nr_failed++;
}
- nr_failed_pages += nr_subpages + nr_retry_pages;
+ nr_failed_pages += nr_pages + nr_retry_pages;
/*
- * There might be some subpages of fail-to-migrate THPs
- * left in thp_split_pages list. Move them back to migration
+ * There might be some split folios of fail-to-migrate large
+ * folios left in split_folios list. Move them back to migration
* list so that they could be put back to the right list by
- * the caller otherwise the page refcnt will be leaked.
+ * the caller otherwise the folio refcnt will be leaked.
*/
- list_splice_init(&thp_split_pages, from);
+ list_splice_init(&split_folios, from);
/* nr_failed isn't updated for not used */
+ nr_large_failed += large_retry;
nr_thp_failed += thp_retry;
goto out;
case -EAGAIN:
- if (is_thp)
- thp_retry++;
- else if (!no_subpage_counting)
+ if (is_large) {
+ large_retry++;
+ thp_retry += is_thp;
+ } else if (!no_split_folio_counting) {
retry++;
- nr_retry_pages += nr_subpages;
+ }
+ nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
- nr_succeeded += nr_subpages;
- if (is_thp)
- nr_thp_succeeded++;
+ nr_succeeded += nr_pages;
+ nr_thp_succeeded += is_thp;
break;
default:
/*
* Permanent failure (-EBUSY, etc.):
- * unlike -EAGAIN case, the failed page is
- * removed from migration page list and not
+ * unlike -EAGAIN case, the failed folio is
+ * removed from migration folio list and not
* retried in the next outer loop.
*/
- if (is_thp)
- nr_thp_failed++;
- else if (!no_subpage_counting)
+ if (is_large) {
+ nr_large_failed++;
+ nr_thp_failed += is_thp;
+ } else if (!no_split_folio_counting) {
nr_failed++;
+ }
- nr_failed_pages += nr_subpages;
+ nr_failed_pages += nr_pages;
break;
}
}
}
nr_failed += retry;
+ nr_large_failed += large_retry;
nr_thp_failed += thp_retry;
nr_failed_pages += nr_retry_pages;
/*
- * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
- * counting in this round, since all subpages of a THP is counted
- * as 1 failure in the first round.
+ * Try to migrate split folios of fail-to-migrate large folios, no
+ * nr_failed counting in this round, since all split folios of a
+ * large folio is counted as 1 failure in the first round.
*/
- if (!list_empty(&thp_split_pages)) {
+ if (!list_empty(&split_folios)) {
/*
- * Move non-migrated pages (after 10 retries) to ret_pages
+ * Move non-migrated folios (after 10 retries) to ret_folios
* to avoid migrating them again.
*/
- list_splice_init(from, &ret_pages);
- list_splice_init(&thp_split_pages, from);
- no_subpage_counting = true;
+ list_splice_init(from, &ret_folios);
+ list_splice_init(&split_folios, from);
+ no_split_folio_counting = true;
retry = 1;
- goto thp_subpage_migration;
+ goto split_folio_migration;
}
- rc = nr_failed + nr_thp_failed;
+ rc = nr_failed + nr_large_failed;
out:
/*
- * Put the permanent failure page back to migration list, they
+ * Put the permanent failure folio back to migration list, they
* will be put back to the right list by the caller.
*/
- list_splice(&ret_pages, from);
+ list_splice(&ret_folios, from);
/*
- * Return 0 in case all subpages of fail-to-migrate THPs are
- * migrated successfully.
+ * Return 0 in case all split folios of fail-to-migrate large folios
+ * are migrated successfully.
*/
if (list_empty(from))
rc = 0;
nid = folio_nid(folio);
if (folio_test_hugetlb(folio)) {
- struct hstate *h = page_hstate(&folio->page);
+ struct hstate *h = folio_hstate(folio);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
- unsigned int foll_flags = FOLL_DUMP;
struct vm_area_struct *vma;
struct page *page;
int err = -EFAULT;
if (!vma)
goto set_status;
- /* Not all huge page follow APIs support 'FOLL_GET' */
- if (!is_vm_hugetlb_page(vma))
- foll_flags |= FOLL_GET;
-
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, foll_flags);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
if (!is_zone_device_page(page))
err = page_to_nid(page);
- if (foll_flags & FOLL_GET)
- put_page(page);
+ put_page(page);
set_status:
*status = err;
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- /* Ensures that larger anonymous mappings are THP aligned. */
- get_area = thp_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
mas_set_range(mas, vma->vm_start, addr + len - 1);
if (mas_preallocate(mas, vma, GFP_KERNEL))
- return -ENOMEM;
+ goto unacct_fail;
vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
if (vma->anon_vma) {
/* create a vma struct for an anonymous mapping */
vma = vm_area_alloc(mm);
if (!vma)
- goto vma_alloc_fail;
+ goto unacct_fail;
vma_set_anonymous(vma);
vma->vm_start = addr;
mas_store_fail:
vm_area_free(vma);
- vma_alloc_fail:
+ unacct_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
return NOTIFY_OK;
}
- static struct notifier_block reserve_mem_nb = {
- .notifier_call = reserve_mem_notifier,
- };
-
static int __meminit init_reserve_notifier(void)
{
- if (register_hotmemory_notifier(&reserve_mem_nb))
+ if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
+ static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
+ bool vma_is_anon_shmem(struct vm_area_struct *vma)
+ {
+ return vma->vm_ops == &shmem_anon_vm_ops;
+ }
+
bool vma_is_shmem(struct vm_area_struct *vma)
{
- return vma->vm_ops == &shmem_vm_ops;
+ return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}
static LIST_HEAD(shmem_swaplist);
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
-
if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
if (!unfalloc || !folio_test_uptodate(folio))
truncate_inode_folio(mapping, folio);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
/*
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, folio)) {
+ if (shmem_free_swap(mapping, indices[i], folio)) {
/* Swap was replaced by page: retry */
- index--;
+ index = indices[i];
break;
}
nr_swaps_freed++;
if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
folio_unlock(folio);
- index--;
+ index = indices[i];
break;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
truncate_inode_folio(mapping, folio);
}
- index = folio->index + folio_nr_pages(folio) - 1;
folio_unlock(folio);
}
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
- index++;
}
spin_lock_irq(&info->lock);
setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
if (!error && update_ctime) {
inode->i_ctime = current_time(inode);
if (update_mtime)
swp_entry_t swapin_error;
void *old;
- swapin_error = make_swapin_error_entry(&folio->page);
+ swapin_error = make_swapin_error_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct folio *folio;
- pgoff_t hindex = index;
+ pgoff_t hindex;
gfp_t huge_gfp;
int error;
int once = 0;
}
if (folio) {
- hindex = folio->index;
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
int ret;
ret = seal_check_future_write(info->seals, vma);
vma->vm_flags |= VM_MTE_ALLOWED;
file_accessed(file);
- vma->vm_ops = &shmem_vm_ops;
+ /* This is anonymous shared memory if it is unlinked at the time of mmap */
+ if (inode->i_nlink)
+ vma->vm_ops = &shmem_vm_ops;
+ else
+ vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
xattr->name, len);
- simple_xattr_list_add(&info->xattrs, new_xattr);
+ simple_xattr_add(&info->xattrs, new_xattr);
}
return 0;
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
+ .open = generic_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
#endif
};
+ static const struct vm_operations_struct shmem_anon_vm_ops = {
+ .fault = shmem_fault,
+ .map_pages = filemap_map_pages,
+ #ifdef CONFIG_NUMA
+ .set_policy = shmem_set_policy,
+ .get_policy = shmem_get_policy,
+ #endif
+ };
+
int shmem_init_fs_context(struct fs_context *fc)
{
struct shmem_options *ctx;
EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
+ #define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>
+#include <kunit/test-bug.h>
#include <linux/sort.h>
#include <linux/debugfs.h>
#define USE_LOCKLESS_FAST_PATH() (false)
#endif
+#ifndef CONFIG_SLUB_TINY
+#define __fastpath_inline __always_inline
+#else
+#define __fastpath_inline
+#endif
+
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG
+#ifndef CONFIG_SLUB_TINY
/*
* Minimum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
* sort the partial list by the number of objects in use.
*/
#define MAX_PARTIAL 10
+#else
+#define MIN_PARTIAL 0
+#define MAX_PARTIAL 0
+#endif
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
*/
static nodemask_t slab_nodes;
+#ifndef CONFIG_SLUB_TINY
/*
* Workqueue used for flush_cpu_slab().
*/
static struct workqueue_struct *flushwq;
+#endif
/********************************************************************
* Core slab cache functions
return freelist_dereference(s, object + s->offset);
}
+#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
prefetchw(object + s->offset);
}
+#endif
/*
* When running under KMSAN, get_freepointer_safe() may return an uninitialized
{
struct kunit_resource *resource;
- if (likely(!current->kunit_test))
+ if (!kunit_get_current_test())
return false;
resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
if (!slub_debug_orig_size(s))
return;
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+ * KASAN could save its free meta data in object's data area at
+ * offset 0, if the size is larger than 'orig_size', it will
+ * overlap the data redzone in [orig_size+1, object_size], and
+ * the check should be skipped.
+ */
+ if (kasan_metadata_size(s, true) > orig_size)
+ orig_size = s->object_size;
+#endif
+
p += get_info_end(s);
p += sizeof(struct track) * 2;
return *(unsigned int *)p;
}
+void skip_orig_size_check(struct kmem_cache *s, const void *object)
+{
+ set_orig_size(s, (void *)object, s->object_size);
+}
+
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
if (slub_debug_orig_size(s))
off += sizeof(unsigned int);
- off += kasan_metadata_size(s);
+ off += kasan_metadata_size(s, false);
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = kasan_reset_tag(object);
+ unsigned int poison_size = s->object_size;
- if (s->flags & SLAB_RED_ZONE)
+ if (s->flags & SLAB_RED_ZONE) {
memset(p - s->red_left_pad, val, s->red_left_pad);
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
+ /*
+ * Redzone the extra allocated space by kmalloc than
+ * requested, and the poison size will be limited to
+ * the original request size accordingly.
+ */
+ poison_size = get_orig_size(s, object);
+ }
+ }
+
if (s->flags & __OBJECT_POISON) {
- memset(p, POISON_FREE, s->object_size - 1);
- p[s->object_size - 1] = POISON_END;
+ memset(p, POISON_FREE, poison_size - 1);
+ p[poison_size - 1] = POISON_END;
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->object_size, val, s->inuse - s->object_size);
+ memset(p + poison_size, val, s->inuse - poison_size);
}
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
off += sizeof(unsigned int);
}
- off += kasan_metadata_size(s);
+ off += kasan_metadata_size(s, false);
if (size_from_object(s) == off)
return 1;
{
u8 *p = object;
u8 *endobject = object + s->object_size;
+ unsigned int orig_size;
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
+
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
+ orig_size = get_orig_size(s, object);
+
+ if (s->object_size > orig_size &&
+ !check_bytes_and_report(s, slab, object,
+ "kmalloc Redzone", p + orig_size,
+ val, s->object_size - orig_size)) {
+ return 0;
+ }
+ }
} else {
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
check_bytes_and_report(s, slab, p, "Alignment padding",
return 1;
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
+static noinline bool alloc_debug_processing(struct kmem_cache *s,
struct slab *slab, void *object, int orig_size)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
trace(s, slab, object, 1);
set_orig_size(s, object, orig_size);
init_object(s, object, SLUB_RED_ACTIVE);
- return 1;
+ return true;
bad:
if (folio_test_slab(slab_folio(slab))) {
slab->inuse = slab->objects;
slab->freelist = NULL;
}
- return 0;
+ return false;
}
static inline int free_consistency_checks(struct kmem_cache *s,
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
-static inline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab, void *object, int orig_size) { return 0; }
+static inline bool alloc_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *object, int orig_size) { return true; }
-static inline void free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr) {}
+static inline bool free_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail, int *bulk_cnt,
+ unsigned long addr, depot_stack_handle_t handle) { return true; }
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
+static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+#ifndef CONFIG_SLUB_TINY
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
void **freelist, void *nextfree)
{
return false;
}
+#endif
#endif /* CONFIG_SLUB_DEBUG */
/*
slab = folio_slab(folio);
__folio_set_slab(folio);
+ /* Make the flag visible before any changes to folio->mapping */
+ smp_wmb();
if (page_is_pfmemalloc(folio_page(folio, 0)))
slab_set_pfmemalloc(slab);
return false;
freelist_count = oo_objects(s->oo);
- pos = prandom_u32_max(freelist_count);
+ pos = get_random_u32_below(freelist_count);
page_limit = slab->objects * s->size;
start = fixup_red_left(s, slab_address(slab));
int order = folio_order(folio);
int pages = 1 << order;
- if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
- void *p;
-
- slab_pad_check(s, slab);
- for_each_object(p, s, slab_address(slab), slab->objects)
- check_object(s, slab, p, SLUB_RED_INACTIVE);
- }
-
__slab_clear_pfmemalloc(slab);
- __folio_clear_slab(folio);
folio->mapping = NULL;
+ /* Make the mapping reset visible before clearing the flag */
+ smp_wmb();
+ __folio_clear_slab(folio);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
unaccount_slab(slab, order, s);
static void free_slab(struct kmem_cache *s, struct slab *slab)
{
- if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
+ void *p;
+
+ slab_pad_check(s, slab);
+ for_each_object(p, s, slab_address(slab), slab->objects)
+ check_object(s, slab, p, SLUB_RED_INACTIVE);
+ }
+
+ if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
call_rcu(&slab->rcu_head, rcu_free_slab);
- } else
+ else
__free_slab(s, slab);
}
if (!pfmemalloc_match(slab, pc->flags))
continue;
- if (kmem_cache_debug(s)) {
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
object = alloc_single_from_partial(s, n, slab,
pc->orig_size);
if (object)
return get_any_partial(s, pc);
}
+#ifndef CONFIG_SLUB_TINY
+
#ifdef CONFIG_PREEMPTION
/*
* Calculate the next globally unique transaction for disambiguation
* different cpus.
*/
#define TID_STEP 1
-#endif
+#endif /* CONFIG_PREEMPTION */
static inline unsigned long next_tid(unsigned long tid)
{
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *freelist)
{
- enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST };
+ enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
struct kmem_cache_node *n = get_node(s, slab_nid(slab));
int free_delta = 0;
enum slab_modes mode = M_NONE;
* acquire_slab() will see a slab that is frozen
*/
spin_lock_irqsave(&n->list_lock, flags);
- } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
- mode = M_FULL;
- /*
- * This also ensures that the scanning of full
- * slabs from diagnostic functions will not see
- * any frozen slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
} else {
mode = M_FULL_NOLIST;
}
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
- if (mode == M_PARTIAL || mode == M_FULL)
+ if (mode == M_PARTIAL)
spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
}
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, slab);
stat(s, FREE_SLAB);
- } else if (mode == M_FULL) {
- add_full(s, n, slab);
- spin_unlock_irqrestore(&n->list_lock, flags);
- stat(s, DEACTIVATE_FULL);
} else if (mode == M_FULL_NOLIST) {
stat(s, DEACTIVATE_FULL);
}
return 0;
}
+#else /* CONFIG_SLUB_TINY */
+static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
+static inline void flush_all(struct kmem_cache *s) { }
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
+static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
+#endif /* CONFIG_SLUB_TINY */
+
/*
* Check if the objects in a per cpu structure fit numa
* locality expectations.
}
/* Supports checking bulk free of a constructed freelist */
-static noinline void free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr)
+static inline bool free_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail, int *bulk_cnt,
+ unsigned long addr, depot_stack_handle_t handle)
{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- struct slab *slab_free = NULL;
+ bool checks_ok = false;
void *object = head;
int cnt = 0;
- unsigned long flags;
- bool checks_ok = false;
- depot_stack_handle_t handle = 0;
-
- if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
-
- spin_lock_irqsave(&n->list_lock, flags);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!check_slab(s, slab))
goto out;
}
- if (slab->inuse < bulk_cnt) {
+ if (slab->inuse < *bulk_cnt) {
slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
- slab->inuse, bulk_cnt);
+ slab->inuse, *bulk_cnt);
goto out;
}
next_object:
- if (++cnt > bulk_cnt)
+ if (++cnt > *bulk_cnt)
goto out_cnt;
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
checks_ok = true;
out_cnt:
- if (cnt != bulk_cnt)
+ if (cnt != *bulk_cnt) {
slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
- bulk_cnt, cnt);
-
-out:
- if (checks_ok) {
- void *prior = slab->freelist;
-
- /* Perform the actual freeing while we still hold the locks */
- slab->inuse -= cnt;
- set_freepointer(s, tail, prior);
- slab->freelist = head;
-
- /*
- * If the slab is empty, and node's partial list is full,
- * it should be discarded anyway no matter it's on full or
- * partial list.
- */
- if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
- slab_free = slab;
-
- if (!prior) {
- /* was on full list */
- remove_full(s, n, slab);
- if (!slab_free) {
- add_partial(n, slab, DEACTIVATE_TO_TAIL);
- stat(s, FREE_ADD_PARTIAL);
- }
- } else if (slab_free) {
- remove_partial(n, slab);
- stat(s, FREE_REMOVE_PARTIAL);
- }
+ *bulk_cnt, cnt);
+ *bulk_cnt = cnt;
}
- if (slab_free) {
- /*
- * Update the counters while still holding n->list_lock to
- * prevent spurious validation warnings
- */
- dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
- }
-
- spin_unlock_irqrestore(&n->list_lock, flags);
+out:
if (!checks_ok)
slab_fix(s, "Object at 0x%p not freed", object);
- if (slab_free) {
- stat(s, FREE_SLAB);
- free_slab(s, slab_free);
- }
+ return checks_ok;
}
#endif /* CONFIG_SLUB_DEBUG */
-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct slab *))
{
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
-#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
+#ifdef CONFIG_SLUB_DEBUG
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
-#ifdef CONFIG_SLUB_DEBUG
static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
int node;
pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
node, nr_slabs, nr_objs, nr_free);
}
-#endif
}
+#else /* CONFIG_SLUB_DEBUG */
+static inline void
+slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
+#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
{
return true;
}
+#ifndef CONFIG_SLUB_TINY
/*
* Check the slab->freelist and either transfer the freelist to the
* per cpu freelist or deactivate the slab.
return p;
}
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
- void *obj)
-{
- if (unlikely(slab_want_init_on_free(s)) && obj)
- memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
- 0, sizeof(void *));
-}
-
-/*
- * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
- * have the fastpath folded into their functions. So no function call
- * overhead for requests that can be satisfied on the fastpath.
- *
- * The fastpath works by first checking if the lockless freelist can be used.
- * If not then __slab_alloc is called for slow processing.
- *
- * Otherwise we can simply pick the next object from the lockless free list.
- */
-static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
+static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
- void *object;
struct kmem_cache_cpu *c;
struct slab *slab;
unsigned long tid;
- struct obj_cgroup *objcg = NULL;
- bool init = false;
-
- s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
- if (!s)
- return NULL;
-
- object = kfence_alloc(s, orig_size, gfpflags);
- if (unlikely(object))
- goto out;
+ void *object;
redo:
/*
stat(s, ALLOC_FASTPATH);
}
+ return object;
+}
+#else /* CONFIG_SLUB_TINY */
+static void *__slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+ struct partial_context pc;
+ struct slab *slab;
+ void *object;
+
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ object = get_partial(s, node, &pc);
+
+ if (object)
+ return object;
+
+ slab = new_slab(s, gfpflags, node);
+ if (unlikely(!slab)) {
+ slab_out_of_memory(s, gfpflags, node);
+ return NULL;
+ }
+
+ object = alloc_single_from_new_slab(s, slab, orig_size);
+
+ return object;
+}
+#endif /* CONFIG_SLUB_TINY */
+
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+ void *object;
+ struct obj_cgroup *objcg = NULL;
+ bool init = false;
+
+ s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
+ if (!s)
+ return NULL;
+
+ object = kfence_alloc(s, orig_size, gfpflags);
+ if (unlikely(object))
+ goto out;
+
+ object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
+
maybe_wipe_obj_freeptr(s, object);
init = slab_want_init_on_alloc(gfpflags, s);
out:
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
+ /*
+ * When init equals 'true', like for kzalloc() family, only
+ * @orig_size bytes might be zeroed instead of s->object_size
+ */
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
return object;
}
-static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
+static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
-static __always_inline
+static __fastpath_inline
void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags)
{
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
+static noinline void free_to_partial_list(
+ struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ struct slab *slab_free = NULL;
+ int cnt = bulk_cnt;
+ unsigned long flags;
+ depot_stack_handle_t handle = 0;
+
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
+ void *prior = slab->freelist;
+
+ /* Perform the actual freeing while we still hold the locks */
+ slab->inuse -= cnt;
+ set_freepointer(s, tail, prior);
+ slab->freelist = head;
+
+ /*
+ * If the slab is empty, and node's partial list is full,
+ * it should be discarded anyway no matter it's on full or
+ * partial list.
+ */
+ if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
+ slab_free = slab;
+
+ if (!prior) {
+ /* was on full list */
+ remove_full(s, n, slab);
+ if (!slab_free) {
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ } else if (slab_free) {
+ remove_partial(n, slab);
+ stat(s, FREE_REMOVE_PARTIAL);
+ }
+ }
+
+ if (slab_free) {
+ /*
+ * Update the counters while still holding n->list_lock to
+ * prevent spurious validation warnings
+ */
+ dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (slab_free) {
+ stat(s, FREE_SLAB);
+ free_slab(s, slab_free);
+ }
+}
+
/*
* Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
if (kfence_free(head))
return;
- if (kmem_cache_debug(s)) {
- free_debug_processing(s, slab, head, tail, cnt, addr);
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ free_to_partial_list(s, slab, head, tail, cnt, addr);
return;
}
discard_slab(s, slab);
}
+#ifndef CONFIG_SLUB_TINY
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
* can perform fastpath freeing without additional function calls.
}
stat(s, FREE_FASTPATH);
}
+#else /* CONFIG_SLUB_TINY */
+static void do_slab_free(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail,
+ int cnt, unsigned long addr)
+{
+ void *tail_obj = tail ? : head;
+
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
+}
+#endif /* CONFIG_SLUB_TINY */
-static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
+static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, void **p, int cnt,
unsigned long addr)
{
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-/* Note that interrupts must be enabled when calling this function. */
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+#ifndef CONFIG_SLUB_TINY
+static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, struct obj_cgroup *objcg)
{
struct kmem_cache_cpu *c;
int i;
- struct obj_cgroup *objcg = NULL;
- /* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
- if (unlikely(!s))
- return false;
/*
* Drain objects in the per cpu slab, while disabling local
* IRQs, which protects against PREEMPT and interrupts
local_unlock_irq(&s->cpu_slab->lock);
slub_put_cpu_ptr(s->cpu_slab);
- /*
- * memcg and kmem_cache debug support and memory initialization.
- * Done outside of the IRQ disabled fastpath loop.
- */
- slab_post_alloc_hook(s, objcg, flags, size, p,
- slab_want_init_on_alloc(flags, s));
return i;
+
error:
slub_put_cpu_ptr(s->cpu_slab);
- slab_post_alloc_hook(s, objcg, flags, i, p, false);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
+ kmem_cache_free_bulk(s, i, p);
+ return 0;
+
+}
+#else /* CONFIG_SLUB_TINY */
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, struct obj_cgroup *objcg)
+{
+ int i;
+
+ for (i = 0; i < size; i++) {
+ void *object = kfence_alloc(s, s->object_size, flags);
+
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
+
+ p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
+ _RET_IP_, s->object_size);
+ if (unlikely(!p[i]))
+ goto error;
+
+ maybe_wipe_obj_freeptr(s, p[i]);
+ }
+
+ return i;
+
+error:
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
kmem_cache_free_bulk(s, i, p);
return 0;
}
+#endif /* CONFIG_SLUB_TINY */
+
+/* Note that interrupts must be enabled when calling this function. */
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ int i;
+ struct obj_cgroup *objcg = NULL;
+
+ if (!size)
+ return 0;
+
+ /* memcg and kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
+ if (unlikely(!s))
+ return 0;
+
+ i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg);
+
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ if (i != 0)
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s), s->object_size);
+ return i;
+}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
* take the list_lock.
*/
static unsigned int slub_min_order;
-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static unsigned int slub_max_order =
+ IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;
/*
#endif
}
+#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
- KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
+ NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
+ sizeof(struct kmem_cache_cpu));
/*
* Must align to double word boundary for the double cmpxchg
return 1;
}
+#else
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+ return 1;
+}
+#endif /* CONFIG_SLUB_TINY */
static struct kmem_cache *kmem_cache_node;
void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
+#ifndef CONFIG_SLUB_TINY
free_percpu(s->cpu_slab);
+#endif
free_kmem_cache_nodes(s);
}
*/
s->inuse = size;
- if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ if (slub_debug_orig_size(s) ||
+ (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
s->ctor) {
/*
return ret;
}
- static struct notifier_block slab_memory_callback_nb = {
- .notifier_call = slab_memory_callback,
- .priority = SLAB_CALLBACK_PRI,
- };
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
- register_hotmemory_notifier(&slab_memory_callback_nb);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
void __init kmem_cache_init_late(void)
{
+#ifndef CONFIG_SLUB_TINY
flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
WARN_ON(!flushwq);
+#endif
}
struct kmem_cache *
return 0;
}
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
static int count_inuse(struct slab *slab)
{
return slab->inuse;
#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
SL_PARTIAL, /* Only partially allocated slabs */
SLAB_ATTR_RO(cache_dma);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
return sysfs_emit(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
+#endif
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
{
return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-SLAB_ATTR_RO(failslab);
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ if (s->refcount > 1)
+ return -EINVAL;
+
+ if (buf[0] == '1')
+ WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
+ else
+ WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
+
+ return length;
+}
+SLAB_ATTR(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
&usersize_attr.attr,
+#endif
#ifdef CONFIG_KFENCE
&skip_kfence_attr.attr,
#endif
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
- if (!kset) {
- kobject_init(&s->kobj, &slab_ktype);
- return 0;
- }
-
if (!unmergeable && disable_higher_order_debug &&
(slub_debug & DEBUG_METADATA_FLAGS))
unmergeable = 1;
mutex_unlock(&slab_mutex);
return 0;
}
-
-__initcall(slab_sysfs_init);
-#endif /* CONFIG_SYSFS */
+late_initcall(slab_sysfs_init);
+#endif /* SLAB_SUPPORTS_SYSFS */
#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
static int slab_debugfs_show(struct seq_file *seq, void *v)
/* No free swap slots available */
if (si->highest_bit <= si->lowest_bit)
return;
- next = si->lowest_bit +
- prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
next = max_t(unsigned int, next, si->lowest_bit);
}
pte_t pteval;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+ pteval = swp_entry_to_pte(make_swapin_error_entry());
set_pte_at(vma->vm_mm, addr, pte, pteval);
swap_free(entry);
ret = 0;
*/
for_each_possible_cpu(cpu) {
per_cpu(*p->cluster_next_cpu, cpu) =
- 1 + prandom_u32_max(p->highest_bit);
+ get_random_u32_inclusive(1, p->highest_bit);
}
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
# SPDX-License-Identifier: GPL-2.0-only
+ cow
hugepage-mmap
hugepage-mremap
hugepage-shm
soft-dirty
split_huge_page_test
ksm_tests
+local_config.h
+local_config.mk