#include <linux/console.h>
#include <linux/seq_file.h>
#include <linux/screen_info.h>
-#include <linux/of_platform.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/libfdt.h>
+#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/cpu.h>
#include <linux/interrupt.h>
total_mem = get_total_mem();
ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
+ &crash_size, &crash_base,
+ NULL, NULL);
/* invalid value specified or crashkernel=0 */
if (ret || !crash_size)
return;
If unsure, say Y.
+config ARM64_ERRATUM_2966298
+ bool "Cortex-A520: 2966298: workaround for speculatively executed unprivileged load"
+ default y
+ help
+ This option adds the workaround for ARM Cortex-A520 erratum 2966298.
+
+ On an affected Cortex-A520 core, a speculatively executed unprivileged
+ load might leak data from a privileged level via a cache side channel.
+
+ Work around this problem by executing a TLBI before returning to EL0.
+
+ If unsure, say Y.
+
config CAVIUM_ERRATUM_22375
bool "Cavium erratum 22375, 24313"
default y
config CPU_BIG_ENDIAN
bool "Build big-endian kernel"
depends on !LD_IS_LLD || LLD_VERSION >= 130000
+ # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
+ depends on AS_IS_GNU || AS_VERSION >= 150000
help
Say Y if you plan on running a kernel with a big-endian userspace.
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ def_bool CRASH_CORE
+
config TRANS_TABLE
def_bool y
depends on HIBERNATION || KEXEC_CORE
#include <linux/nodemask.h>
#include <linux/initrd.h>
#include <linux/gfp.h>
+#include <linux/math.h>
#include <linux/memblock.h>
#include <linux/sort.h>
#include <linux/of.h>
*/
phys_addr_t __ro_after_init arm64_dma_phys_limit;
- /* Current arm64 boot protocol requires 2MB alignment */
- #define CRASH_ALIGN SZ_2M
-
- #define CRASH_ADDR_LOW_MAX arm64_dma_phys_limit
- #define CRASH_ADDR_HIGH_MAX (PHYS_MASK + 1)
- #define CRASH_HIGH_SEARCH_BASE SZ_4G
-
- #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
-
/*
* To make optimal use of block mappings when laying out the linear
* mapping, round down the base of physical memory to a size that can
#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT)
#endif
- static int __init reserve_crashkernel_low(unsigned long long low_size)
- {
- unsigned long long low_base;
-
- low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
- if (!low_base) {
- pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
- return -ENOMEM;
- }
-
- pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
- low_base, low_base + low_size, low_size >> 20);
-
- crashk_low_res.start = low_base;
- crashk_low_res.end = low_base + low_size - 1;
- insert_resource(&iomem_resource, &crashk_low_res);
-
- return 0;
- }
-
- /*
- * reserve_crashkernel() - reserves memory for crash kernel
- *
- * This function reserves memory area given in "crashkernel=" kernel command
- * line parameter. The memory reserved is used by dump capture kernel when
- * primary kernel is crashing.
- */
- static void __init reserve_crashkernel(void)
+ static void __init arch_reserve_crashkernel(void)
{
- unsigned long long crash_low_size = 0, search_base = 0;
- unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
+ unsigned long long low_size = 0;
unsigned long long crash_base, crash_size;
char *cmdline = boot_command_line;
- bool fixed_base = false;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
- /* crashkernel=X[@offset] */
ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
- &crash_size, &crash_base);
- if (ret == -ENOENT) {
- ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
- if (ret || !crash_size)
- return;
-
- /*
- * crashkernel=Y,low can be specified or not, but invalid value
- * is not allowed.
- */
- ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
- if (ret == -ENOENT)
- crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
- else if (ret)
- return;
-
- search_base = CRASH_HIGH_SEARCH_BASE;
- crash_max = CRASH_ADDR_HIGH_MAX;
- high = true;
- } else if (ret || !crash_size) {
- /* The specified value is invalid */
+ &crash_size, &crash_base,
+ &low_size, &high);
+ if (ret)
return;
- }
-
- crash_size = PAGE_ALIGN(crash_size);
-
- /* User specifies base address explicitly. */
- if (crash_base) {
- fixed_base = true;
- search_base = crash_base;
- crash_max = crash_base + crash_size;
- }
-
- retry:
- crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
- search_base, crash_max);
- if (!crash_base) {
- /*
- * For crashkernel=size[KMG]@offset[KMG], print out failure
- * message if can't reserve the specified region.
- */
- if (fixed_base) {
- pr_warn("crashkernel reservation failed - memory is in use.\n");
- return;
- }
-
- /*
- * For crashkernel=size[KMG], if the first attempt was for
- * low memory, fall back to high memory, the minimum required
- * low memory will be reserved later.
- */
- if (!high && crash_max == CRASH_ADDR_LOW_MAX) {
- crash_max = CRASH_ADDR_HIGH_MAX;
- search_base = CRASH_ADDR_LOW_MAX;
- crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
- goto retry;
- }
-
- /*
- * For crashkernel=size[KMG],high, if the first attempt was
- * for high memory, fall back to low memory.
- */
- if (high && crash_max == CRASH_ADDR_HIGH_MAX) {
- crash_max = CRASH_ADDR_LOW_MAX;
- search_base = 0;
- goto retry;
- }
- pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
- crash_size);
- return;
- }
-
- if ((crash_base >= CRASH_ADDR_LOW_MAX) && crash_low_size &&
- reserve_crashkernel_low(crash_low_size)) {
- memblock_phys_free(crash_base, crash_size);
- return;
- }
-
- pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
- crash_base, crash_base + crash_size, crash_size >> 20);
-
- /*
- * The crashkernel memory will be removed from the kernel linear
- * map. Inform kmemleak so that it won't try to access it.
- */
- kmemleak_ignore_phys(crash_base);
- if (crashk_low_res.end)
- kmemleak_ignore_phys(crashk_low_res.start);
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
+ reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+ low_size, high);
}
/*
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- reserve_crashkernel();
+ arch_reserve_crashkernel();
memblock_dump_all();
}
{
bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
- if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC))
+ if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
+ /*
+ * If no bouncing needed for ZONE_DMA, reduce the swiotlb
+ * buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
+ */
+ unsigned long size =
+ DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
+ swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
swiotlb = true;
+ }
swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
}
#ifdef CONFIG_ARCH_WRITECOMBINE
-pgprot_t pgprot_wc = PAGE_KERNEL_WUC;
+bool wc_enabled = true;
#else
-pgprot_t pgprot_wc = PAGE_KERNEL_SUC;
+bool wc_enabled = false;
#endif
-EXPORT_SYMBOL(pgprot_wc);
+EXPORT_SYMBOL(wc_enabled);
static int __init setup_writecombine(char *p)
{
if (!strcmp(p, "on"))
- pgprot_wc = PAGE_KERNEL_WUC;
+ wc_enabled = true;
else if (!strcmp(p, "off"))
- pgprot_wc = PAGE_KERNEL_SUC;
+ wc_enabled = false;
else
pr_warn("Unknown writecombine setting \"%s\".\n", p);
unsigned long long crash_base, crash_size;
total_mem = memblock_phys_mem_size();
- ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base,
+ NULL, NULL);
if (ret < 0 || crash_size <= 0)
return;
select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB
- select DMA_DIRECT_REMAP if MMU
config RISCV_NONSTANDARD_CACHE_OPS
bool
- depends on RISCV_DMA_NONCOHERENT
help
This enables function pointer support for non-standard noncoherent
systems to handle cache management.
depends on RISCV_ALTERNATIVE
default y
select RISCV_DMA_NONCOHERENT
+ select DMA_DIRECT_REMAP
help
Adds support to dynamically detect the presence of the ZICBOM
extension (Cache Block Management Operations) and enable its
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ def_bool CRASH_CORE
+
config COMPAT
bool "Kernel support for 32-bit U-mode"
default 64BIT
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_PER_VMA_LOCK
- select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
default y if X86_BIGSMP
select USE_PERCPU_NUMA_NODE_ID
+ select OF_NUMA if OF
help
Enable NUMA (Non-Uniform Memory Access) support.
If unsure, say N.
+config INTEL_TDX_HOST
+ bool "Intel Trust Domain Extensions (TDX) host support"
+ depends on CPU_SUP_INTEL
+ depends on X86_64
+ depends on KVM_INTEL
+ help
+ Intel Trust Domain Extensions (TDX) protects guest VMs from a malicious
+ host and certain physical attacks. This option enables necessary TDX
+ support in the host kernel to run confidential VMs.
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
config ARCH_SUPPORTS_CRASH_HOTPLUG
def_bool y
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ def_bool CRASH_CORE
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
default "0x1000000"
64-bit kernel. You should likely turn this on, unless you're
100% sure that you don't have any 32-bit programs left.
+config IA32_EMULATION_DEFAULT_DISABLED
+ bool "IA32 emulation disabled by default"
+ default n
+ depends on IA32_EMULATION
+ help
+ Disable IA32 emulation by default. This prevents loading 32-bit
+ processes and access to 32-bit syscalls. If unsure, leave this option
+ at its default value.
+
config X86_X32_ABI
bool "x32 ABI for 64-bit mode"
depends on X86_64
}
}
- /*
- * --------- Crashkernel reservation ------------------------------
- */
-
- /* 16M alignment for crash kernel regions */
- #define CRASH_ALIGN SZ_16M
-
- /*
- * Keep the crash kernel below this limit.
- *
- * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
- * due to mapping restrictions.
- *
- * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
- * the upper limit of system RAM in 4-level paging mode. Since the kdump
- * jump could be from 5-level paging to 4-level paging, the jump will fail if
- * the kernel is put above 64 TB, and during the 1st kernel bootup there's
- * no good way to detect the paging mode of the target kernel which will be
- * loaded for dumping.
- */
- #ifdef CONFIG_X86_32
- # define CRASH_ADDR_LOW_MAX SZ_512M
- # define CRASH_ADDR_HIGH_MAX SZ_512M
- #else
- # define CRASH_ADDR_LOW_MAX SZ_4G
- # define CRASH_ADDR_HIGH_MAX SZ_64T
- #endif
-
- static int __init reserve_crashkernel_low(void)
+ static void __init arch_reserve_crashkernel(void)
{
- #ifdef CONFIG_X86_64
- unsigned long long base, low_base = 0, low_size = 0;
- unsigned long low_mem_limit;
- int ret;
-
- low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
-
- /* crashkernel=Y,low */
- ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
- if (ret) {
- /*
- * two parts from kernel/dma/swiotlb.c:
- * -swiotlb size: user-specified with swiotlb= or default.
- *
- * -swiotlb overflow buffer: now hardcoded to 32k. We round it
- * to 8M for other buffers that may need to stay low too. Also
- * make sure we allocate enough extra low memory so that we
- * don't run out of DMA buffers for 32-bit devices.
- */
- low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
- } else {
- /* passed with crashkernel=0,low ? */
- if (!low_size)
- return 0;
- }
-
- low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
- if (!low_base) {
- pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
- (unsigned long)(low_size >> 20));
- return -ENOMEM;
- }
-
- pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
- (unsigned long)(low_size >> 20),
- (unsigned long)(low_base >> 20),
- (unsigned long)(low_mem_limit >> 20));
-
- crashk_low_res.start = low_base;
- crashk_low_res.end = low_base + low_size - 1;
- insert_resource(&iomem_resource, &crashk_low_res);
- #endif
- return 0;
- }
-
- static void __init reserve_crashkernel(void)
- {
- unsigned long long crash_size, crash_base, total_mem;
+ unsigned long long crash_base, crash_size, low_size = 0;
+ char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
- total_mem = memblock_phys_mem_size();
-
- /* crashkernel=XM */
- ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0) {
- /* crashkernel=X,high */
- ret = parse_crashkernel_high(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0)
- return;
- high = true;
- }
+ ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ &crash_size, &crash_base,
+ &low_size, &high);
+ if (ret)
+ return;
if (xen_pv_domain()) {
pr_info("Ignoring crashkernel for a Xen PV domain\n");
return;
}
- /* 0 means: find the address automatically */
- if (!crash_base) {
- /*
- * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
- * crashkernel=x,high reserves memory over 4G, also allocates
- * 256M extra low memory for DMA buffers and swiotlb.
- * But the extra memory is not required for all machines.
- * So try low memory first and fall back to high memory
- * unless "crashkernel=size[KMG],high" is specified.
- */
- if (!high)
- crash_base = memblock_phys_alloc_range(crash_size,
- CRASH_ALIGN, CRASH_ALIGN,
- CRASH_ADDR_LOW_MAX);
- if (!crash_base)
- crash_base = memblock_phys_alloc_range(crash_size,
- CRASH_ALIGN, CRASH_ALIGN,
- CRASH_ADDR_HIGH_MAX);
- if (!crash_base) {
- pr_info("crashkernel reservation failed - No suitable area found.\n");
- return;
- }
- } else {
- unsigned long long start;
-
- start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
- crash_base + crash_size);
- if (start != crash_base) {
- pr_info("crashkernel reservation failed - memory is in use.\n");
- return;
- }
- }
-
- if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
- memblock_phys_free(crash_base, crash_size);
- return;
- }
-
- pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
+ reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+ low_size, high);
}
static struct resource standard_io_resources[] = {
* Needs to run after memblock setup because it needs the physical
* memory size.
*/
- sev_setup_arch();
+ mem_encrypt_setup_arch();
efi_fake_memmap();
efi_find_mirror();
early_acpi_boot_init();
+ x86_flattree_get_config();
+
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
* Reserve memory for crash kernel after SRAT is parsed so that it
* won't consume hotpluggable memory.
*/
- reserve_crashkernel();
+ arch_reserve_crashkernel();
memblock_find_dma_reserve();
*/
static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
- static struct kmem_cache * bdev_cachep __read_mostly;
+ static struct kmem_cache *bdev_cachep __ro_after_init;
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
.kill_sb = kill_anon_super,
};
- struct super_block *blockdev_superblock __read_mostly;
+ struct super_block *blockdev_superblock __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
int err;
- static struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt __ro_after_init;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
}
EXPORT_SYMBOL(blkdev_get_by_dev);
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ const struct blk_holder_ops *hops)
+{ /* Wrap blkdev_get_by_dev() in a kmalloc'ed handle; ERR_PTR on failure. */
+ struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
+ struct block_device *bdev;
+
+ if (!handle)
+ return ERR_PTR(-ENOMEM); /* handle allocation failed; device not opened */
+ bdev = blkdev_get_by_dev(dev, mode, holder, hops);
+ if (IS_ERR(bdev)) {
+ kfree(handle); /* open failed: drop the unused handle */
+ return ERR_CAST(bdev); /* propagate the open error as a handle pointer */
+ }
+ handle->bdev = bdev;
+ handle->holder = holder; /* kept so bdev_release() can pass it to blkdev_put() */
+ if (holder)
+ mode |= BLK_OPEN_EXCL; /* a non-NULL holder is recorded as an exclusive open */
+ handle->mode = mode; /* stored mode includes BLK_OPEN_EXCL when set above */
+ return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_dev);
+
/**
* blkdev_get_by_path - open a block device by name
* @path: path to the block device to open
}
EXPORT_SYMBOL(blkdev_get_by_path);
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+ void *holder, const struct blk_holder_ops *hops)
+{ /* Resolve @path with lookup_bdev(), then open it via bdev_open_by_dev(). */
+ struct bdev_handle *handle;
+ dev_t dev;
+ int error;
+
+ error = lookup_bdev(path, &dev);
+ if (error)
+ return ERR_PTR(error); /* path lookup failed; nothing was opened */
+
+ handle = bdev_open_by_dev(dev, mode, holder, hops);
+ if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+ bdev_read_only(handle->bdev)) { /* write requested on a read-only device */
+ bdev_release(handle); /* undo the successful open before failing */
+ return ERR_PTR(-EACCES);
+ }
+
+ return handle; /* valid handle, or the ERR_PTR from bdev_open_by_dev() */
+}
+EXPORT_SYMBOL(bdev_open_by_path);
+
void blkdev_put(struct block_device *bdev, void *holder)
{
struct gendisk *disk = bdev->bd_disk;
}
EXPORT_SYMBOL(blkdev_put);
+void bdev_release(struct bdev_handle *handle)
+{ /* Put the block device with its recorded holder, then free the handle. */
+ blkdev_put(handle->bdev, handle->holder);
+ kfree(handle); /* @handle must not be dereferenced after this point */
+}
+EXPORT_SYMBOL(bdev_release);
+
/**
* lookup_bdev() - Look up a struct block_device by name.
* @pathname: Name of the block device in the filesystem.
mutex_lock(&bdev->bd_holder_lock);
if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
bdev->bd_holder_ops->mark_dead(bdev, surprise);
- else
+ else {
+ mutex_unlock(&bdev->bd_holder_lock);
sync_blockdev(bdev);
- mutex_unlock(&bdev->bd_holder_lock);
+ }
invalidate_bdev(bdev);
}
-#ifdef CONFIG_DASD_MODULE
/*
- * Drivers should not use this directly, but the DASD driver has historically
- * had a shutdown to offline mode that doesn't actually remove the gendisk
- * that otherwise looks a lot like a safe device removal.
+ * New drivers should not use this directly. There are some drivers, however,
+ * that need this for historical reasons. For example, the DASD driver has
+ * historically had a shutdown to offline mode that doesn't actually remove the
+ * gendisk that otherwise looks a lot like a safe device removal.
*/
EXPORT_SYMBOL_GPL(bdev_mark_dead);
-#endif
void sync_bdevs(bool wait)
{
goto cmdq_free;
cmdq->db_id = file_priv->ctx.id + engine * ivpu_get_context_count(vdev);
- cmdq->entry_count = (u32)((cmdq->mem->base.size - sizeof(struct vpu_job_queue_header)) /
+ cmdq->entry_count = (u32)((ivpu_bo_size(cmdq->mem) - sizeof(struct vpu_job_queue_header)) /
sizeof(struct vpu_job_queue_entry));
- cmdq->jobq = (struct vpu_job_queue *)cmdq->mem->kvaddr;
+ cmdq->jobq = (struct vpu_job_queue *)ivpu_bo_vaddr(cmdq->mem);
jobq_header = &cmdq->jobq->header;
jobq_header->engine_idx = engine;
jobq_header->head = 0;
return cmdq;
ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id,
- cmdq->mem->vpu_addr, cmdq->mem->base.size);
+ cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem));
if (ret)
return NULL;
return -EBUSY;
}
- if (commands_offset >= bo->base.size) {
+ if (commands_offset >= ivpu_bo_size(bo)) {
ivpu_warn(vdev, "Invalid command buffer offset %u\n", commands_offset);
return -EINVAL;
}
void ivpu_job_done_thread_fini(struct ivpu_device *vdev)
{
- kthread_stop(vdev->job_done_thread);
- put_task_struct(vdev->job_done_thread);
+ kthread_stop_put(vdev->job_done_thread); /* replaces the kthread_stop() + put_task_struct() pair */
}
thread[i].tsk = tsk;
}
- msleep(10); /* start all threads before we kthread_stop() */
+ msleep(10 * n_cpus); /* start all threads before we kthread_stop() */
for (i = 0; i < n_cpus; ++i) {
struct task_struct *tsk = thread[i].tsk;
if (IS_ERR_OR_NULL(tsk))
continue;
- status = kthread_stop(tsk);
+ status = kthread_stop_put(tsk);
if (status && !err)
err = status;
-
- put_task_struct(tsk);
}
kfree(thread);
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
-#define XENVIF_QUEUE_LENGTH 32
-
/* Number of bytes allowed on the internal guest Rx queue. */
#define XENVIF_RX_QUEUE_BYTES (XEN_NETIF_RX_RING_SIZE/2 * PAGE_SIZE)
if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
skb_clear_hash(skb);
+ /* timestamp packet in software */
+ skb_tx_timestamp(skb);
+
if (!xenvif_rx_queue_tail(queue, skb))
goto drop;
static const struct ethtool_ops xenvif_ethtool_ops = {
.get_link = ethtool_op_get_link,
-
+ .get_ts_info = ethtool_op_get_ts_info,
.get_sset_count = xenvif_get_sset_count,
.get_ethtool_stats = xenvif_get_ethtool_stats,
.get_strings = xenvif_get_strings,
dev->features = dev->hw_features | NETIF_F_RXCSUM;
dev->ethtool_ops = &xenvif_ethtool_ops;
- dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
-
dev->min_mtu = ETH_MIN_MTU;
dev->max_mtu = ETH_MAX_MTU - VLAN_ETH_HLEN;
static void xenvif_disconnect_queue(struct xenvif_queue *queue)
{
if (queue->task) {
- kthread_stop(queue->task);
- put_task_struct(queue->task);
+ kthread_stop_put(queue->task);
queue->task = NULL;
}
module_param(experimental_iopoll_q_cnt, int, 0444);
MODULE_PARM_DESC(experimental_iopoll_q_cnt, "number of queues to be used as poll mode, def=0");
-static void debugfs_work_handler_v3_hw(struct work_struct *work);
-static void debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba);
+static int debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba);
static u32 hisi_sas_read32(struct hisi_hba *hisi_hba, u32 off)
{
hisi_hba = shost_priv(shost);
INIT_WORK(&hisi_hba->rst_work, hisi_sas_rst_work_handler);
- INIT_WORK(&hisi_hba->debugfs_work, debugfs_work_handler_v3_hw);
hisi_hba->hw = &hisi_sas_v3_hw;
hisi_hba->pci_dev = pdev;
hisi_hba->dev = dev;
&debugfs_ras_v3_hw_fops);
}
-static void debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba)
-{
- int debugfs_dump_index = hisi_hba->debugfs_dump_index;
- struct device *dev = hisi_hba->dev;
- u64 timestamp = local_clock();
-
- if (debugfs_dump_index >= hisi_sas_debugfs_dump_count) {
- dev_warn(dev, "dump count exceeded!\n");
- return;
- }
-
- do_div(timestamp, NSEC_PER_MSEC);
- hisi_hba->debugfs_timestamp[debugfs_dump_index] = timestamp;
-
- debugfs_snapshot_prepare_v3_hw(hisi_hba);
-
- debugfs_snapshot_global_reg_v3_hw(hisi_hba);
- debugfs_snapshot_port_reg_v3_hw(hisi_hba);
- debugfs_snapshot_axi_reg_v3_hw(hisi_hba);
- debugfs_snapshot_ras_reg_v3_hw(hisi_hba);
- debugfs_snapshot_cq_reg_v3_hw(hisi_hba);
- debugfs_snapshot_dq_reg_v3_hw(hisi_hba);
- debugfs_snapshot_itct_reg_v3_hw(hisi_hba);
- debugfs_snapshot_iost_reg_v3_hw(hisi_hba);
-
- debugfs_create_files_v3_hw(hisi_hba);
-
- debugfs_snapshot_restore_v3_hw(hisi_hba);
- hisi_hba->debugfs_dump_index++;
-}
-
static ssize_t debugfs_trigger_dump_v3_hw_write(struct file *file,
const char __user *user_buf,
size_t count, loff_t *ppos)
struct hisi_hba *hisi_hba = file->f_inode->i_private;
char buf[8];
- if (hisi_hba->debugfs_dump_index >= hisi_sas_debugfs_dump_count)
- return -EFAULT;
-
if (count > 8)
return -EFAULT;
if (buf[0] != '1')
return -EFAULT;
- queue_work(hisi_hba->wq, &hisi_hba->debugfs_work);
+ down(&hisi_hba->sem);
+ if (debugfs_snapshot_regs_v3_hw(hisi_hba)) {
+ up(&hisi_hba->sem);
+ return -EFAULT;
+ }
+ up(&hisi_hba->sem);
return count;
}
return count;
}
-
- static int debugfs_bist_linkrate_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_bist_linkrate_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_bist_linkrate_v3_hw_fops = {
- .open = debugfs_bist_linkrate_v3_hw_open,
- .read = seq_read,
- .write = debugfs_bist_linkrate_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_linkrate_v3_hw);
static const struct {
int value;
return count;
}
-
- static int debugfs_bist_code_mode_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_bist_code_mode_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_bist_code_mode_v3_hw_fops = {
- .open = debugfs_bist_code_mode_v3_hw_open,
- .read = seq_read,
- .write = debugfs_bist_code_mode_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_code_mode_v3_hw);
static ssize_t debugfs_bist_phy_v3_hw_write(struct file *filp,
const char __user *buf,
return 0;
}
-
- static int debugfs_bist_phy_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_bist_phy_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_bist_phy_v3_hw_fops = {
- .open = debugfs_bist_phy_v3_hw_open,
- .read = seq_read,
- .write = debugfs_bist_phy_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_phy_v3_hw);
static ssize_t debugfs_bist_cnt_v3_hw_write(struct file *filp,
const char __user *buf,
return 0;
}
-
- static int debugfs_bist_cnt_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_bist_cnt_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_bist_cnt_v3_hw_ops = {
- .open = debugfs_bist_cnt_v3_hw_open,
- .read = seq_read,
- .write = debugfs_bist_cnt_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_cnt_v3_hw);
static const struct {
int value;
return count;
}
-
- static int debugfs_bist_mode_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_bist_mode_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_bist_mode_v3_hw_fops = {
- .open = debugfs_bist_mode_v3_hw_open,
- .read = seq_read,
- .write = debugfs_bist_mode_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_mode_v3_hw);
static ssize_t debugfs_bist_enable_v3_hw_write(struct file *filp,
const char __user *buf,
return 0;
}
-
- static int debugfs_bist_enable_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_bist_enable_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_bist_enable_v3_hw_fops = {
- .open = debugfs_bist_enable_v3_hw_open,
- .read = seq_read,
- .write = debugfs_bist_enable_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_enable_v3_hw);
static const struct {
char *name;
return 0;
}
-
- static int debugfs_v3_hw_open(struct inode *inode, struct file *filp)
- {
- return single_open(filp, debugfs_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_v3_hw_fops = {
- .open = debugfs_v3_hw_open,
- .read = seq_read,
- .write = debugfs_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_v3_hw);
static ssize_t debugfs_phy_down_cnt_v3_hw_write(struct file *filp,
const char __user *buf,
return 0;
}
-
- static int debugfs_phy_down_cnt_v3_hw_open(struct inode *inode,
- struct file *filp)
- {
- return single_open(filp, debugfs_phy_down_cnt_v3_hw_show,
- inode->i_private);
- }
-
- static const struct file_operations debugfs_phy_down_cnt_v3_hw_fops = {
- .open = debugfs_phy_down_cnt_v3_hw_open,
- .read = seq_read,
- .write = debugfs_phy_down_cnt_v3_hw_write,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_phy_down_cnt_v3_hw);
enum fifo_dump_mode_v3_hw {
FIFO_DUMP_FORVER = (1U << 0),
}
}
-static void debugfs_work_handler_v3_hw(struct work_struct *work)
-{
- struct hisi_hba *hisi_hba =
- container_of(work, struct hisi_hba, debugfs_work);
-
- debugfs_snapshot_regs_v3_hw(hisi_hba);
-}
-
static void debugfs_release_v3_hw(struct hisi_hba *hisi_hba, int dump_index)
{
struct device *dev = hisi_hba->dev;
{
const struct hisi_sas_hw *hw = hisi_hba->hw;
struct device *dev = hisi_hba->dev;
- int p, c, d, r, i;
+ int p, c, d, r;
size_t sz;
for (r = 0; r < DEBUGFS_REGS_NUM; r++) {
return 0;
fail:
- for (i = 0; i < hisi_sas_debugfs_dump_count; i++)
- debugfs_release_v3_hw(hisi_hba, i);
+ debugfs_release_v3_hw(hisi_hba, dump_index);
return -ENOMEM;
}
+static int debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba)
+{ /* Take one register snapshot into the next dump slot; returns 0 or -errno. */
+ int debugfs_dump_index = hisi_hba->debugfs_dump_index;
+ struct device *dev = hisi_hba->dev;
+ u64 timestamp = local_clock();
+
+ if (debugfs_dump_index >= hisi_sas_debugfs_dump_count) {
+ dev_warn(dev, "dump count exceeded!\n");
+ return -EINVAL; /* every one of the hisi_sas_debugfs_dump_count slots is used */
+ }
+
+ if (debugfs_alloc_v3_hw(hisi_hba, debugfs_dump_index)) { /* allocate this slot's buffers at snapshot time */
+ dev_warn(dev, "failed to alloc memory\n");
+ return -ENOMEM;
+ }
+
+ do_div(timestamp, NSEC_PER_MSEC); /* scale the clock reading down by NSEC_PER_MSEC */
+ hisi_hba->debugfs_timestamp[debugfs_dump_index] = timestamp;
+
+ debugfs_snapshot_prepare_v3_hw(hisi_hba); /* paired with debugfs_snapshot_restore_v3_hw() below */
+
+ debugfs_snapshot_global_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_port_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_axi_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_ras_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_cq_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_dq_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_itct_reg_v3_hw(hisi_hba);
+ debugfs_snapshot_iost_reg_v3_hw(hisi_hba);
+
+ debugfs_create_files_v3_hw(hisi_hba);
+
+ debugfs_snapshot_restore_v3_hw(hisi_hba);
+ hisi_hba->debugfs_dump_index++; /* the next trigger uses the following slot */
+
+ return 0;
+}
+
+
static void debugfs_phy_down_cnt_init_v3_hw(struct hisi_hba *hisi_hba)
{
struct dentry *dir = debugfs_create_dir("phy_down_cnt",
hisi_hba, &debugfs_bist_phy_v3_hw_fops);
debugfs_create_file("cnt", 0600, hisi_hba->debugfs_bist_dentry,
- hisi_hba, &debugfs_bist_cnt_v3_hw_ops);
+ hisi_hba, &debugfs_bist_cnt_v3_hw_fops);
debugfs_create_file("loopback_mode", 0600,
hisi_hba->debugfs_bist_dentry,
hisi_hba->debugfs_bist_linkrate = SAS_LINK_RATE_1_5_GBPS;
}
+static void debugfs_exit_v3_hw(struct hisi_hba *hisi_hba)
+{ /* Remove this HBA's debugfs directory tree. */
+ debugfs_remove_recursive(hisi_hba->debugfs_dir);
+ hisi_hba->debugfs_dir = NULL; /* do not leave a stale dentry pointer behind */
+}
+
static void debugfs_init_v3_hw(struct hisi_hba *hisi_hba)
{
struct device *dev = hisi_hba->dev;
- int i;
hisi_hba->debugfs_dir = debugfs_create_dir(dev_name(dev),
hisi_sas_debugfs_dir);
debugfs_phy_down_cnt_init_v3_hw(hisi_hba);
debugfs_fifo_init_v3_hw(hisi_hba);
-
- for (i = 0; i < hisi_sas_debugfs_dump_count; i++) {
- if (debugfs_alloc_v3_hw(hisi_hba, i)) {
- debugfs_remove_recursive(hisi_hba->debugfs_dir);
- dev_dbg(dev, "failed to init debugfs!\n");
- break;
- }
- }
-}
-
-static void debugfs_exit_v3_hw(struct hisi_hba *hisi_hba)
-{
- debugfs_remove_recursive(hisi_hba->debugfs_dir);
}
static int
} while (tmp != bh);
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- /*
- * If all of the buffers are uptodate then we can set the page
- * uptodate.
- */
- if (folio_uptodate)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, folio_uptodate);
return;
still_busy:
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
- bool retry)
+ gfp_t gfp)
{
struct buffer_head *bh, *head;
- gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
struct mem_cgroup *memcg, *old_memcg;
- if (retry)
- gfp |= __GFP_NOFAIL;
-
/* The folio lock pins the memcg */
memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
- return folio_alloc_buffers(page_folio(page), size, retry);
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; /* base flags folio_alloc_buffers() previously set itself */
+ if (retry)
+ gfp |= __GFP_NOFAIL; /* @retry still maps onto __GFP_NOFAIL, now chosen by this wrapper */
+
+ return folio_alloc_buffers(page_folio(page), size, gfp);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
struct buffer_head *bh;
sector_t end_block;
int ret = 0;
- gfp_t gfp_mask;
-
- gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
-
- /*
- * XXX: __getblk_slow() can not really deal with failure and
- * will endlessly loop on improvised global reclaim. Prefer
- * looping in the allocator rather than here, at least that
- * code knows what it's doing.
- */
- gfp_mask |= __GFP_NOFAIL;
folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
bh = folio_buffers(folio);
if (bh) {
goto failed;
}
- bh = folio_alloc_buffers(folio, size, true);
+ ret = -ENOMEM;
+ bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
+ if (!bh)
+ goto failed;
/*
* Link the folio to the buffers and initialise them. Take the
}
EXPORT_SYMBOL(__find_get_block);
-/*
- * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
- * which corresponds to the passed block_device, block and size. The
- * returned buffer has its reference count incremented.
+/**
+ * bdev_getblk - Get a buffer_head in a block device's buffer cache.
+ * @bdev: The block device.
+ * @block: The block number.
+ * @size: The size of buffer_heads for this @bdev.
+ * @gfp: The memory allocation flags to use.
*
- * __getblk_gfp() will lock up the machine if grow_dev_page's
- * try_to_free_buffers() attempt is failing. FIXME, perhaps?
+ * Return: The buffer head, or NULL if memory could not be allocated.
*/
-struct buffer_head *
-__getblk_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp)
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
/* First try to find an existing buffer_head for (bdev, block, size). */
struct buffer_head *bh = __find_get_block(bdev, block, size);
- might_sleep();
- if (bh == NULL)
- bh = __getblk_slow(bdev, block, size, gfp);
- return bh;
+ might_alloc(gfp);
/* Lookup succeeded: hand back the buffer __find_get_block() returned. */
+ if (bh)
+ return bh;
+
/* Lookup failed: defer to __getblk_slow() with the caller's gfp flags. */
+ return __getblk_slow(bdev, block, size, gfp);
}
-EXPORT_SYMBOL(__getblk_gfp);
+EXPORT_SYMBOL(bdev_getblk);
/*
* Do async read-ahead on a buffer.
*/
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ struct buffer_head *bh = bdev_getblk(bdev, block, size,
+ GFP_NOWAIT | __GFP_MOVABLE);
+
if (likely(bh)) {
bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
__bread_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ struct buffer_head *bh;
+
+ gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+
+ /*
+ * Prefer looping in the allocator rather than here, at least that
+ * code knows what it's doing.
+ */
+ gfp |= __GFP_NOFAIL;
+
+ bh = bdev_getblk(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
* block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the folio lock.
*/
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
- unsigned long b_state)
+struct buffer_head *create_empty_buffers(struct folio *folio,
+ unsigned long blocksize, unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
- head = folio_alloc_buffers(folio, blocksize, true);
+ head = folio_alloc_buffers(folio, blocksize, gfp);
bh = head;
do {
bh->b_state |= b_state;
}
folio_attach_private(folio, head);
spin_unlock(&folio->mapping->private_lock);
-}
-EXPORT_SYMBOL(folio_create_empty_buffers);
-void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
-{
- folio_create_empty_buffers(page_folio(page), blocksize, b_state);
+ return head;
}
EXPORT_SYMBOL(create_empty_buffers);
struct inode *inode,
unsigned int b_state)
{
+ struct buffer_head *bh;
+
BUG_ON(!folio_test_locked(folio));
- if (!folio_buffers(folio))
- folio_create_empty_buffers(folio,
- 1 << READ_ONCE(inode->i_blkbits),
- b_state);
- return folio_buffers(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ 1 << READ_ONCE(inode->i_blkbits), b_state);
+ return bh;
}
/*
if (!nr) {
/*
- * All buffers are uptodate - we can set the folio uptodate
- * as well. But not if get_block() returned an error.
+ * All buffers are uptodate or get_block() returned an
+ * error when trying to map them - we can finish the read.
*/
- if (!page_error)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, !page_error);
return 0;
}
return PTR_ERR(folio);
bh = folio_buffers(folio);
- if (!bh) {
- folio_create_empty_buffers(folio, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
offset = offset_in_folio(folio, from);
/*
* Buffer-head allocation
*/
- static struct kmem_cache *bh_cachep __read_mostly;
+ static struct kmem_cache *bh_cachep __ro_after_init;
/*
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
- static unsigned long max_buffer_heads;
+ static unsigned long max_buffer_heads __ro_after_init;
int buffer_heads_over_limit;
#include "internal.h"
- static struct kobj_map *cdev_map;
+ static struct kobj_map *cdev_map __ro_after_init;
static DEFINE_MUTEX(chrdevs_lock);
struct module *owner = p->owner;
struct kobject *kobj;
- if (owner && !try_module_get(owner))
+ if (!try_module_get(owner))
return NULL;
kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
EXPORT_SYMBOL(rename_lock);
- static struct kmem_cache *dentry_cache __read_mostly;
+ static struct kmem_cache *dentry_cache __ro_after_init;
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
* information, yet avoid using a prime hash-size or similar.
*/
- static unsigned int d_hash_shift __read_mostly;
+ static unsigned int d_hash_shift __ro_after_init;
- static struct hlist_bl_head *dentry_hashtable __read_mostly;
+ static struct hlist_bl_head *dentry_hashtable __ro_after_init;
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
d_walk(parent, parent, d_genocide_kill);
}
-void d_tmpfile(struct file *file, struct inode *inode)
+void d_mark_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
- inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
+}
+EXPORT_SYMBOL(d_mark_tmpfile);
+
/*
 * Decrements @inode's link count, calls d_mark_tmpfile() for @file and
 * @inode, and finally instantiates the file's dentry with @inode.
 */
+void d_tmpfile(struct file *file, struct inode *inode)
+{
+ struct dentry *dentry = file->f_path.dentry;
+
+ inode_dec_link_count(inode);
+ d_mark_tmpfile(file, inode);
d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);
}
/* SLAB cache for __getname() consumers */
- struct kmem_cache *names_cachep __read_mostly;
+ struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);
void __init vfs_caches_init_early(void)
};
/* SLAB cache for file structures */
- static struct kmem_cache *filp_cachep __read_mostly;
+ static struct kmem_cache *filp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path real_path;
+ struct path user_path;
};
static inline struct backing_file *backing_file(struct file *f)
return container_of(f, struct backing_file, file);
}
/*
 * Returns the address of the user_path member of the struct backing_file
 * that embeds @f (looked up through backing_file()).
 */
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
{
- return &backing_file(f)->real_path;
+ return &backing_file(f)->user_path;
}
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
/*
 * Free a struct file. After this change the work is done directly here
 * instead of in an RCU callback: call security_file_free(), decrement
 * nr_files unless FMODE_NOACCOUNT is set, drop the credential reference,
 * then release the memory.
 */
-static void file_free_rcu(struct rcu_head *head)
+static inline void file_free(struct file *f)
{
- struct file *f = container_of(head, struct file, f_rcuhead);
-
+ security_file_free(f);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
put_cred(f->f_cred);
/*
 * FMODE_BACKING files are embedded in a struct backing_file: put the
 * user path and kfree() the container. All other files are returned to
 * filp_cachep.
 */
- if (unlikely(f->f_mode & FMODE_BACKING))
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ path_put(backing_file_user_path(f));
kfree(backing_file(f));
- else
+ } else {
kmem_cache_free(filp_cachep, f);
+ }
}
/*
 * Drop one reference on @f. Warns once if @f has FMODE_BACKING or
 * FMODE_OPENED set. When the f_count decrement reaches zero, the file is
 * torn down here and returned to filp_cachep.
 */
-static inline void file_free(struct file *f)
+void release_empty_file(struct file *f)
{
- security_file_free(f);
- if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
/* Only the holder of the last reference performs the teardown. */
+ if (atomic_long_dec_and_test(&f->f_count)) {
+ security_file_free(f);
+ put_cred(f->f_cred);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
+ }
}
/*
return error;
}
- atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
+ /*
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * fget-rcu pattern users need to be able to handle spurious
+ * refcount bumps we should reinitialize the reused file first.
+ */
+ atomic_long_set(&f->f_count, 1);
return 0;
}
/*
 * Creates the "filp" slab cache for struct file (after this change with
 * SLAB_TYPESAFE_BY_RCU added to the flags) and initialises the nr_files
 * per-CPU counter to 0.
 */
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
* inode_hash_lock
*/
- static unsigned int i_hash_mask __read_mostly;
- static unsigned int i_hash_shift __read_mostly;
- static struct hlist_head *inode_hashtable __read_mostly;
+ static unsigned int i_hash_mask __ro_after_init;
+ static unsigned int i_hash_shift __ro_after_init;
+ static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
/*
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
- static struct kmem_cache *inode_cachep __read_mostly;
+ static struct kmem_cache *inode_cachep __ro_after_init;
static long get_nr_inodes(void)
{
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
- struct timespec64 ctime;
+ struct timespec64 atime, mtime, ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
/*
* Is mtime younger than or equal to atime? If yes, update atime:
*/
- if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+ atime = inode_get_atime(inode);
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &atime) >= 0)
return 1;
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
ctime = inode_get_ctime(inode);
- if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
+ if (timespec64_compare(&ctime, &atime) >= 0)
return 1;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
- if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
return 1;
/*
* Good, we can skip the atime update:
if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
now = inode_set_ctime_current(inode);
if (!timespec64_equal(&now, &ctime))
updated |= S_CTIME;
- if (!timespec64_equal(&now, &inode->i_mtime)) {
- inode->i_mtime = now;
+ if (!timespec64_equal(&now, &mtime)) {
+ inode_set_mtime_to_ts(inode, now);
updated |= S_MTIME;
}
if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
}
if (flags & S_ATIME) {
- if (!timespec64_equal(&now, &inode->i_atime)) {
- inode->i_atime = now;
+ struct timespec64 atime = inode_get_atime(inode);
+
+ if (!timespec64_equal(&now, &atime)) {
+ inode_set_atime_to_ts(inode, now);
updated |= S_ATIME;
}
}
bool atime_needs_update(const struct path *path, struct inode *inode)
{
struct vfsmount *mnt = path->mnt;
- struct timespec64 now;
+ struct timespec64 now, atime;
if (inode->i_flags & S_NOATIME)
return false;
if (!relatime_need_update(mnt, inode, now))
return false;
- if (timespec64_equal(&inode->i_atime, &now))
+ atime = inode_get_atime(inode);
+ if (timespec64_equal(&atime, &now))
return false;
return true;
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt) != 0)
+ if (mnt_get_write_access(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
* of the fs read only, e.g. subvolumes in Btrfs.
*/
inode_update_time(inode, S_ATIME);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
{
int sync_it = 0;
struct timespec64 now = current_time(inode);
- struct timespec64 ctime;
+ struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- if (!timespec64_equal(&inode->i_mtime, &now))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it = S_MTIME;
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
struct inode *inode = file_inode(file);
/* try to update time settings */
- if (!__mnt_want_write_file(file)) {
+ if (!mnt_get_write_access_file(file)) {
ret = inode_update_time(inode, sync_mode);
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
}
return ret;
#include "kernfs-internal.h"
- struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
- struct kernfs_global_locks *kernfs_locks;
+ struct kmem_cache *kernfs_node_cache __ro_after_init;
+ struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+ struct kernfs_global_locks *kernfs_locks __ro_after_init;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
sb->s_time_gran = 1;
/* sysfs dentries and inodes don't require IO to create */
- sb->s_shrink.seeks = 0;
+ sb->s_shrink->seeks = 0;
/* get root inode, initialize and unlock it */
down_read(&kf_root->kernfs_rwsem);
*/
static DEFINE_SPINLOCK(blocked_lock_lock);
- static struct kmem_cache *flctx_cache __read_mostly;
- static struct kmem_cache *filelock_cache __read_mostly;
+ static struct kmem_cache *flctx_cache __ro_after_init;
+ static struct kmem_cache *filelock_cache __ro_after_init;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Callers expecting ->lock() to return asynchronously
- * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- * the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- * request completes.
+ * lm_grant is set. Additionally, EXPORT_OP_ASYNC_LOCK needs to be set in the
+ * export_operations flags.
+ *
+ * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ * blocking lock. When ->lock() does return asynchronously, it must return
+ * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
* If the request is for non-blocking lock the file system should return
* FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
* with the result. If the request timed out the callback routine will return a
/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;
- static unsigned int m_hash_mask __read_mostly;
- static unsigned int m_hash_shift __read_mostly;
- static unsigned int mp_hash_mask __read_mostly;
- static unsigned int mp_hash_shift __read_mostly;
+ static unsigned int m_hash_mask __ro_after_init;
+ static unsigned int m_hash_shift __ro_after_init;
+ static unsigned int mp_hash_mask __ro_after_init;
+ static unsigned int mp_hash_shift __ro_after_init;
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
- static struct hlist_head *mount_hashtable __read_mostly;
- static struct hlist_head *mountpoint_hashtable __read_mostly;
- static struct kmem_cache *mnt_cache __read_mostly;
+ static struct hlist_head *mount_hashtable __ro_after_init;
+ static struct hlist_head *mountpoint_hashtable __ro_after_init;
+ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
};
/* /sys/fs */
- struct kobject *fs_kobj;
+ struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);
/*
* can determine when writes are able to occur to a filesystem.
*/
/**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
* it, and makes sure that writes are allowed (mnt is read-write) before
* returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
* called. This is effectively a refcount.
*/
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
return ret;
}
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
/**
* mnt_want_write - get write access to a mount
int ret;
sb_start_write(m->mnt_sb);
- ret = __mnt_want_write(m);
+ ret = mnt_get_write_access(m);
if (ret)
sb_end_write(m->mnt_sb);
return ret;
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
* @file: the file on whose mount to take a write
*
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
*/
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
return -EROFS;
return 0;
}
- return __mnt_want_write(file->f_path.mnt);
+ return mnt_get_write_access(file->f_path.mnt);
}
/**
int ret;
sb_start_write(file_inode(file)->i_sb);
- ret = __mnt_want_write_file(file);
+ ret = mnt_get_write_access_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
*/
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
/**
* mnt_drop_write - give up write access to a mount
*/
void mnt_drop_write(struct vfsmount *mnt)
{
/* Give up the mount write access, then call sb_end_write() on its superblock. */
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
/*
 * Counterpart of mnt_get_write_access_file(): only files without
 * FMODE_WRITER took the mount write access there, so only those drop it
 * here.
 */
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER))
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
/*
 * File-based variant of mnt_drop_write(): calls mnt_put_write_access_file()
 * for @file and then sb_end_write() on the superblock of the file's inode.
 */
void mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
{
if (mnt) {
struct mount *m = real_mount(mnt);
- /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ /* avoid cacheline pingpong */
if (unlikely(m->mnt_expiry_mark))
- m->mnt_expiry_mark = 0;
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
mntput_no_expire(m);
}
}
#define dnotify_sysctl_init() do { } while (0)
#endif
- static struct kmem_cache *dnotify_struct_cache __read_mostly;
- static struct kmem_cache *dnotify_mark_cache __read_mostly;
- static struct fsnotify_group *dnotify_group __read_mostly;
+ static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+ static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+ static struct fsnotify_group *dnotify_group __ro_after_init;
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
- struct file *f;
+ struct file *f = NULL;
int destroy = 0, error = 0;
__u32 mask;
}
rcu_read_lock();
- f = lookup_fd_rcu(fd);
+ f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
+ if (f)
+ fput(f);
return error;
}
extern const struct fsnotify_ops fanotify_fsnotify_ops;
- struct kmem_cache *fanotify_mark_cache __read_mostly;
- struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
- struct kmem_cache *fanotify_path_event_cachep __read_mostly;
- struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+ struct kmem_cache *fanotify_mark_cache __ro_after_init;
+ struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+ struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
}
/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct dentry *dentry)
+static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ const struct export_operations *nop = dentry->d_sb->s_export_op;
+
+ /*
+ * We need to make sure that the filesystem supports encoding of
+ * file handles so user can use name_to_handle_at() to compare fids
+ * reported with events to the file handle of watched objects.
+ */
+ if (!nop)
+ return -EOPNOTSUPP;
+
/*
- * We need to make sure that the file system supports at least
- * encoding a file handle so user can use name_to_handle_at() to
- * compare fid returned with event to the file handle of watched
- * objects. However, even the relaxed AT_HANDLE_FID flag requires
- * at least empty export_operations for ecoding unique file ids.
+ * For sb/mount mark, we also need to make sure that the filesystem
+ * supports decoding file handles, so user has a way to map back the
+ * reported fids to filesystem objects.
*/
- if (!dentry->d_sb->s_export_op)
+ if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
return -EOPNOTSUPP;
return 0;
if (ret)
goto path_put_and_out;
- ret = fanotify_test_fid(path.dentry);
+ ret = fanotify_test_fid(path.dentry, flags);
if (ret)
goto path_put_and_out;
el = &eb->h_list;
}
- BUG_ON(el->l_tree_depth != 0);
+ if (el->l_tree_depth != 0) {
+ retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)last_eb_blk,
+ le16_to_cpu(el->l_tree_depth));
+ goto bail;
+ }
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
goto next_group;
}
out:
- range->len = trimmed * sb->s_blocksize;
+ range->len = trimmed * osb->s_clustersize;
return ret;
}
/*
 * Getter for the "capabilities" module parameter: writes the
 * DLMFS_CAPABILITIES string into @buffer and returns the value returned by
 * sysfs_emit(). @kp is unused.
 */
static int param_get_dlmfs_capabilities(char *buffer,
const struct kernel_param *kp)
{
- return strlcpy(buffer, DLMFS_CAPABILITIES,
- strlen(DLMFS_CAPABILITIES) + 1);
+ return sysfs_emit(buffer, DLMFS_CAPABILITIES);
}
module_param_call(capabilities, param_set_dlmfs_capabilities,
param_get_dlmfs_capabilities, NULL, 0444);
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
inc_nlink(inode);
inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
drop_nlink(new_inode);
inode_set_ctime_current(new_inode);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
- inode_get_ctime(old_dir));
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
/*
* This will also pick up the i_nlink change from the
INODE_CACHE(old_dir),
old_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
ocfs2_journal_dirty(handle, old_dir_bh);
return !pipe_empty(head, tail) || !writers;
}
/*
 * Release @buf and advance the pipe's tail by one slot.
 *
 * @pipe: the pipe being read from
 * @buf:  the buffer at the current tail
 * @tail: the caller's current tail index
 *
 * Returns the incremented tail index, which has also been stored in
 * pipe->tail.
 */
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ unsigned int tail)
+{
+ pipe_buf_release(pipe, buf);
+
+ /*
+ * If the pipe has a watch_queue, we need additional protection
+ * by the spinlock because notifications get posted with only
+ * this spinlock, no mutex
+ */
+ if (pipe_has_watch_queue(pipe)) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
+ pipe->tail = ++tail;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ return tail;
+ }
+
+ /*
+ * Without a watch_queue, we can simply increment the tail
+ * without the spinlock - the mutex is enough.
+ */
+ pipe->tail = ++tail;
+ return tail;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
buf->len = 0;
}
- if (!buf->len) {
- pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
- if (buf->flags & PIPE_BUF_FLAG_LOSS)
- pipe->note_loss = true;
-#endif
- tail++;
- pipe->tail = tail;
- spin_unlock_irq(&pipe->rd_wait.lock);
- }
+ if (!buf->len)
+ tail = pipe_update_tail(pipe, buf, tail);
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
goto out;
}
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe_has_watch_queue(pipe)) {
ret = -EXDEV;
goto out;
}
-#endif
/*
* If it wasn't empty we try to merge new data into
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->rd_wait.lock);
-
- head = pipe->head;
- if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- continue;
- }
-
pipe->head = head + 1;
- spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
kfree(pipe);
}
- static struct vfsmount *pipe_mnt __read_mostly;
+ static struct vfsmount *pipe_mnt __ro_after_init;
/*
* pipefs_dname() is called from d_path().
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
unsigned int nr_slots, size;
long ret = 0;
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue)
+ if (pipe_has_watch_queue(pipe))
return -EBUSY;
-#endif
size = round_pipe_size(arg);
nr_slots = size >> PAGE_SHIFT;
if (file->f_op != &pipefifo_fops || !pipe)
return NULL;
-#ifdef CONFIG_WATCH_QUEUE
- if (for_splice && pipe->watch_queue)
+ if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
-#endif
return pipe;
}
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
result = down_read_killable(&task->signal->exec_update_lock);
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
+ unsigned int seq = 1;
+ unsigned long flags;
+
+ rcu_read_lock();
+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
for_each_thread(task, pos) {
if (!nr--)
goto found;
- };
+ }
fail:
pos = NULL;
goto out;
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
- struct inode *inode;
- struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
- for (;;) {
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
struct super_block *sb;
- node = hlist_first_rcu(inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct inode *inode;
+
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
#include <asm/elf.h>
#include <asm/tlb.h>
if (anon_name)
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
else
- seq_file_path(m, file, "\n");
+ seq_path(m, file_user_path(file), "\n");
goto done;
}
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
-
- memset(&mss, 0, sizeof(mss));
+ struct mem_size_stats mss = {};
smap_gather_stats(vma, &mss, 0);
static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mem_size_stats mss;
+ struct mem_size_stats mss = {};
struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
unsigned long vma_start = 0, last_vma_end = 0;
goto out_put_task;
}
- memset(&mss, 0, sizeof(mss));
-
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return 0;
}
+/*
+ * Every page-category bit that pagemap_scan_get_args() accepts in the
+ * category_inverted, category_mask, category_anyof_mask and return_mask
+ * fields of struct pm_scan_arg; a request with any other bit set is
+ * rejected with -EINVAL.
+ */
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE)
+/* Every bit accepted in pm_scan_arg.flags; others are rejected with -EINVAL. */
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+/*
+ * State of one PAGEMAP_SCAN request; handed to the page-walk callbacks
+ * through mm_walk.private.
+ */
+struct pagemap_scan_private {
+ /* Kernel copy of the user's request; walk_end is updated as the scan stops. */
+ struct pm_scan_arg arg;
+ /*
+ * masks_of_interest: union of category_mask, category_anyof_mask and
+ * return_mask, used to skip category lookups nobody asked for.
+ * cur_vma_category: categories of the VMA being walked, stored by
+ * pagemap_scan_test_walk().
+ */
+ unsigned long masks_of_interest, cur_vma_category;
+ /* Kernel bounce buffer of output regions; stays NULL when vec_len is 0. */
+ struct page_region *vec_buf;
+ /*
+ * vec_buf_len: number of regions vec_buf can hold.
+ * vec_buf_index: index of the region currently being filled.
+ * found_pages: pages reported so far, checked against arg.max_pages.
+ */
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ /* Next user-space slot the bounce buffer is flushed to. */
+ struct page_region __user *vec_out;
+};
+
+/*
+ * Translate one PTE value into its PAGE_IS_* category bits.
+ *
+ * A present PTE yields PAGE_IS_PRESENT, a swap PTE yields PAGE_IS_SWAPPED,
+ * anything else yields no category at all.  PAGE_IS_WRITTEN is reported
+ * whenever the entry does not carry the userfaultfd write-protect bit.
+ * The PAGE_IS_FILE lookup is only performed when the request references
+ * that category (p->masks_of_interest).
+ */
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+					   struct vm_area_struct *vma,
+					   unsigned long addr, pte_t pte)
+{
+	const bool want_file = p->masks_of_interest & PAGE_IS_FILE;
+	unsigned long cats;
+
+	if (pte_present(pte)) {
+		cats = PAGE_IS_PRESENT;
+
+		if (!pte_uffd_wp(pte))
+			cats |= PAGE_IS_WRITTEN;
+
+		if (want_file) {
+			struct page *pg = vm_normal_page(vma, addr, pte);
+
+			if (pg && !PageAnon(pg))
+				cats |= PAGE_IS_FILE;
+		}
+
+		if (is_zero_pfn(pte_pfn(pte)))
+			cats |= PAGE_IS_PFNZERO;
+
+		return cats;
+	}
+
+	/* Neither present nor a swap entry: nothing to report. */
+	if (!is_swap_pte(pte))
+		return 0;
+
+	cats = PAGE_IS_SWAPPED;
+	if (!pte_swp_uffd_wp_any(pte))
+		cats |= PAGE_IS_WRITTEN;
+
+	if (want_file) {
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if (is_pfn_swap_entry(entry) &&
+		    !PageAnon(pfn_swap_entry_to_page(entry)))
+			cats |= PAGE_IS_FILE;
+	}
+
+	return cats;
+}
+
+/*
+ * Mark the PTE at @addr in @vma as userfaultfd write-protected.
+ *
+ * - present PTE: set the uffd-wp bit through the modify_prot start/commit
+ *   pair;
+ * - swap PTE: set the swap-format uffd-wp bit;
+ * - anything else: install a PTE_MARKER_UFFD_WP marker.
+ */
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+			     unsigned long addr, pte_t *pte)
+{
+	pte_t ptent = ptep_get(pte);
+
+	if (pte_present(ptent)) {
+		pte_t old_pte;
+
+		/*
+		 * Derive the new value from what ptep_modify_prot_start()
+		 * returns rather than from the earlier ptep_get() snapshot:
+		 * accessed/dirty bits set between the two reads would
+		 * otherwise be lost on commit.
+		 */
+		old_pte = ptep_modify_prot_start(vma, addr, pte);
+		ptent = pte_mkuffd_wp(old_pte);
+		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+	} else if (is_swap_pte(ptent)) {
+		ptent = pte_swp_mkuffd_wp(ptent);
+		set_pte_at(vma->vm_mm, addr, pte, ptent);
+	} else {
+		set_pte_at(vma->vm_mm, addr, pte,
+			   make_pte_marker(PTE_MARKER_UFFD_WP));
+	}
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Translate a huge PMD value into PAGE_IS_* category bits.  PAGE_IS_HUGE is
+ * always reported; the remaining bits mirror pagemap_page_category() using
+ * the PMD-level helpers.
+ */
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ /* No uffd-wp bit on the PMD is reported as "written". */
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ /* Only look the page up when the request references PAGE_IS_FILE. */
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ /*
+ * NOTE(review): is_zero_pfn() is applied to the PMD's pfn here;
+ * confirm that this also recognises the huge zero page.
+ */
+ if (is_zero_pfn(pmd_pfn(pmd)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pmd(pmd)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pmd_to_swp_entry(pmd);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+/*
+ * Mark a huge PMD as userfaultfd write-protected.  PMD values that are
+ * neither present nor a migration entry are left untouched.
+ */
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ /*
+ * The new value is derived from what pmdp_invalidate_ad() returns,
+ * not from the snapshot read above.
+ */
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ /* Migration entry: set the swap-format uffd-wp bit instead. */
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Translate a HugeTLB PTE value into PAGE_IS_* category bits.
+ * PAGE_IS_HUGE is always part of the result.
+ */
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+	unsigned long cats = PAGE_IS_HUGE;
+
+	if (pte_present(pte)) {
+		cats |= PAGE_IS_PRESENT;
+
+		if (!huge_pte_uffd_wp(pte))
+			cats |= PAGE_IS_WRITTEN;
+
+		if (!PageAnon(pte_page(pte)))
+			cats |= PAGE_IS_FILE;
+
+		if (is_zero_pfn(pte_pfn(pte)))
+			cats |= PAGE_IS_PFNZERO;
+
+		return cats;
+	}
+
+	/*
+	 * According to pagemap_hugetlb_range(), file-backed HugeTLB
+	 * page cannot be swapped. So PAGE_IS_FILE is not checked for
+	 * swapped pages.
+	 */
+	if (is_swap_pte(pte)) {
+		cats |= PAGE_IS_SWAPPED;
+
+		if (!pte_swp_uffd_wp_any(pte))
+			cats |= PAGE_IS_WRITTEN;
+	}
+
+	return cats;
+}
+
+/*
+ * Mark a HugeTLB entry as userfaultfd write-protected.  @ptent is the value
+ * the caller read from @ptep.  Hwpoisoned entries and existing PTE markers
+ * are left untouched.
+ */
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ unsigned long psize;
+
+ if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+ return;
+
+ psize = huge_page_size(hstate_vma(vma));
+
+ /* Migration entry: set the swap-format uffd-wp bit. */
+ if (is_hugetlb_entry_migration(ptent))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ /*
+ * Populated entry.
+ * NOTE(review): the caller's snapshot is passed as the "old" value and
+ * no huge_ptep_modify_prot_start() is issued first; confirm that
+ * accessed/dirty updates cannot be lost here.
+ */
+ else if (!huge_pte_none(ptent))
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+ /* Empty entry: install a uffd-wp marker. */
+ else
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+/*
+ * Undo the effect of the last pagemap_scan_output() call for [addr, end):
+ * trim (or clear) the region currently being filled and take the pages
+ * back out of found_pages.
+ *
+ * When the request has no output vector, pagemap_scan_output() records
+ * nothing and there is no buffer to touch, so there is nothing to undo.
+ */
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+				       unsigned long addr, unsigned long end)
+{
+	struct page_region *cur_buf;
+
+	/* No bounce buffer: avoid dereferencing NULL and unbalancing the count. */
+	if (!p->vec_buf)
+		return;
+
+	cur_buf = &p->vec_buf[p->vec_buf_index];
+
+	if (cur_buf->start != addr)
+		cur_buf->end = addr;
+	else
+		cur_buf->start = cur_buf->end = 0;
+
+	p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+/*
+ * Decide whether a page with the given categories matches the request:
+ * after applying category_inverted, every bit of category_mask must be set
+ * and, if category_anyof_mask is non-empty, at least one of its bits too.
+ */
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+					     const struct pagemap_scan_private *p)
+{
+	const struct pm_scan_arg *req = &p->arg;
+
+	categories ^= req->category_inverted;
+
+	if ((categories & req->category_mask) != req->category_mask)
+		return false;
+
+	return !req->category_anyof_mask ||
+	       (categories & req->category_anyof_mask);
+}
+
+/*
+ * VMA-level pre-filter: only the PAGE_IS_WPALLOWED bit of category_mask can
+ * be decided per VMA, so that is the only requirement checked here (after
+ * applying category_inverted).
+ */
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+					    const struct pagemap_scan_private *p)
+{
+	const unsigned long needed = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+	return ((categories ^ p->arg.category_inverted) & needed) == needed;
+}
+
+/*
+ * test_walk callback: decide per VMA whether the scan descends into it.
+ *
+ * Returns 0 to walk the VMA, 1 to skip it, or -EPERM when the caller set
+ * PM_SCAN_CHECK_WPASYNC and the VMA is not set up for async uffd
+ * write-protection.  On success the VMA's categories are stored in
+ * p->cur_vma_category for the entry callbacks.
+ */
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+				  struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long vma_category = 0;
+	bool wp_allowed = userfaultfd_wp_async(vma) &&
+			  userfaultfd_wp_use_markers(vma);
+
+	if (!wp_allowed) {
+		/* The caller asked for an explicit failure. */
+		if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+			return -EPERM;
+
+		/*
+		 * Write-protection was requested but this VMA cannot take
+		 * it: skip the VMA instead of write-protecting PTEs of a
+		 * mapping that is not registered for async uffd-wp.
+		 */
+		if (p->arg.flags & PM_SCAN_WP_MATCHING)
+			return 1;
+
+		/* Read-only scan: fall through to the remaining checks. */
+	}
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (wp_allowed)
+		vma_category |= PAGE_IS_WPALLOWED;
+
+	if (!pagemap_scan_is_interesting_vma(vma_category, p))
+		return 1;
+
+	p->cur_vma_category = vma_category;
+
+	return 0;
+}
+
+/*
+ * Record [addr, end) with the given categories in the bounce buffer.
+ *
+ * The range is merged into the region currently being filled when it is
+ * contiguous with it and has identical categories; otherwise a new region
+ * is started.  Returns false, recording nothing, when a new region would be
+ * needed but the buffer has no free slot.
+ */
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+ * When there is no output buffer provided at all, the sentinel values
+ * won't match here. There is no other way for `cur_buf->end` to be
+ * non-zero other than it being non-empty.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ /* The current slot is in use: advance to the next one if there is room. */
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+/*
+ * Report the matching range [addr, *end) to the output buffer.
+ *
+ * On return *end is the end of what was actually recorded: it is pulled
+ * back when arg.max_pages would be exceeded and set to @addr when the
+ * buffer had no room at all.  Returns 0 on success or -ENOSPC when the
+ * range was truncated or dropped; in that case arg.walk_end is set to the
+ * new *end.  Without an output buffer nothing is recorded and 0 is returned
+ * with *end unchanged.
+ */
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ /* Only the categories the caller asked to see are reported. */
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ /* Trim the range so that exactly max_pages pages are reported. */
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ /* No free slot: report nothing for this range. */
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ /* Remember where the scan stopped so that it can be resumed or reported. */
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+/*
+ * Handle a PMD that maps a transparent huge page.
+ *
+ * Returns -ENOENT when the PMD is not a huge mapping (or THP support is
+ * compiled out), or after the huge page was split, which tells
+ * pagemap_scan_pmd_entry() to process the range PTE by PTE.  Otherwise
+ * returns the result of pagemap_scan_output(), or 0 when the page does not
+ * match the request.
+ */
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ /* pagemap_scan_output() may pull 'end' back; start == end means nothing was recorded. */
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ /* Write-protect only on request, and only pages reported as written. */
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ /* Drop what was just recorded; the PTE path records it again. */
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+/*
+ * pmd_entry callback of the PAGEMAP_SCAN walk.
+ *
+ * A huge PMD is handed to pagemap_scan_thp_entry(); otherwise the PTEs of
+ * [start, end) are scanned under the page-table lock.  Matching pages are
+ * reported through pagemap_scan_output() and, when PM_SCAN_WP_MATCHING is
+ * set, write-protected for userfaultfd.  Two fast paths avoid the full
+ * category computation:
+ *  - write-protect only (PM_SCAN_WP_MATCHING set and no output vector);
+ *  - the request is exactly "report pages that are PAGE_IS_WRITTEN".
+ *
+ * Returns 0, -ENOSPC from pagemap_scan_output(), or whatever
+ * pagemap_scan_thp_entry() returned for a huge PMD.
+ */
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long addr, flush_end = 0;
+	pte_t *pte, *start_pte;
+	spinlock_t *ptl;
+	int ret;
+
+	arch_enter_lazy_mmu_mode();
+
+	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+	if (ret != -ENOENT) {
+		arch_leave_lazy_mmu_mode();
+		return ret;
+	}
+
+	ret = 0;
+	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+	if (!pte) {
+		/* The page table went away under us: have the walker retry. */
+		arch_leave_lazy_mmu_mode();
+		walk->action = ACTION_AGAIN;
+		return 0;
+	}
+
+	/*
+	 * Only take the write-protecting fast path when write-protection was
+	 * actually requested; a scan without an output vector and without
+	 * PM_SCAN_WP_MATCHING must not modify any PTE.
+	 */
+	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+		/* Fast path for performing exclusive WP */
+		for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+			pte_t ptent = ptep_get(pte);
+
+			/*
+			 * pte_uffd_wp() is only meaningful for present PTEs;
+			 * non-present entries keep the bit elsewhere.
+			 */
+			if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+			    pte_swp_uffd_wp_any(ptent))
+				continue;
+			make_uffd_wp_pte(vma, addr, pte);
+			if (!flush_end)
+				start = addr;
+			flush_end = addr + PAGE_SIZE;
+		}
+		goto flush_and_return;
+	}
+
+	if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+	    p->arg.category_mask == PAGE_IS_WRITTEN &&
+	    p->arg.return_mask == PAGE_IS_WRITTEN) {
+		for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+			unsigned long next = addr + PAGE_SIZE;
+			pte_t ptent = ptep_get(pte);
+
+			if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
+			    pte_swp_uffd_wp_any(ptent))
+				continue;
+			ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+						  p, addr, &next);
+			/* Nothing could be recorded: the output is full. */
+			if (next == addr)
+				break;
+			if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+				continue;
+			make_uffd_wp_pte(vma, addr, pte);
+			if (!flush_end)
+				start = addr;
+			flush_end = next;
+		}
+		goto flush_and_return;
+	}
+
+	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+		unsigned long categories = p->cur_vma_category |
+			pagemap_page_category(p, vma, addr, ptep_get(pte));
+		unsigned long next = addr + PAGE_SIZE;
+
+		if (!pagemap_scan_is_interesting_page(categories, p))
+			continue;
+
+		ret = pagemap_scan_output(categories, p, addr, &next);
+		if (next == addr)
+			break;
+
+		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+			continue;
+		if (~categories & PAGE_IS_WRITTEN)
+			continue;
+
+		make_uffd_wp_pte(vma, addr, pte);
+		if (!flush_end)
+			start = addr;
+		flush_end = next;
+	}
+
+flush_and_return:
+	/* 'start' was moved to the first PTE that got modified, if any. */
+	if (flush_end)
+		flush_tlb_range(vma, start, addr);
+
+	pte_unmap_unlock(start_pte, ptl);
+	arch_leave_lazy_mmu_mode();
+
+	cond_resched();
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * hugetlb_entry callback of the PAGEMAP_SCAN walk.
+ *
+ * Without PM_SCAN_WP_MATCHING the entry is only classified and reported,
+ * and no lock is taken here.  With it, the entry is classified under the
+ * mapping's i_mmap lock and the huge PTE lock, and write-protected if it
+ * matches, is reported as written and was recorded in full.
+ */
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ /* NOTE(review): assumes vma->vm_file is non-NULL for every hugetlb VMA - confirm. */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ /* start == end afterwards means nothing could be recorded. */
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * NOTE(review): the comparison uses HPAGE_SIZE rather than
+ * huge_page_size(hstate_vma(vma)); confirm this for non-default
+ * HugeTLB page sizes.
+ */
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+/*
+ * pte_hole callback: handle a range with no page table entries.
+ *
+ * Holes outside any VMA are ignored.  Inside a VMA the hole is classified
+ * with the VMA-level categories only; if that matches the request it is
+ * reported and, when PM_SCAN_WP_MATCHING is set, write-protected through
+ * uffd_wp_range().  A negative result from uffd_wp_range() replaces the
+ * return value of pagemap_scan_output().
+ */
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ /* pagemap_scan_output() may pull 'end' back; addr == end means nothing was recorded. */
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+/*
+ * Page-walk callbacks used by do_pagemap_scan().  hugetlb_entry is NULL when
+ * CONFIG_HUGETLB_PAGE is not set.
+ */
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+/*
+ * Copy the PAGEMAP_SCAN request from user space into @arg and validate it.
+ *
+ * Returns 0 on success, -EFAULT when the request or one of the ranges it
+ * names is not accessible, and -EINVAL for a wrong structure size, unknown
+ * flag/category bits, an unaligned start, a vector length without a vector,
+ * or a vector length that cannot be represented.
+ * On success start/end/vec are untagged, end is rounded up to a page
+ * boundary, walk_end is cleared and a zero max_pages means "no limit".
+ */
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+				 unsigned long uarg)
+{
+	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+		return -EFAULT;
+
+	if (arg->size != sizeof(struct pm_scan_arg))
+		return -EINVAL;
+
+	/* Validate requested features */
+	if (arg->flags & ~PM_SCAN_FLAGS)
+		return -EINVAL;
+	if ((arg->category_inverted | arg->category_mask |
+	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+		return -EINVAL;
+
+	arg->start = untagged_addr((unsigned long)arg->start);
+	arg->end = untagged_addr((unsigned long)arg->end);
+	arg->vec = untagged_addr((unsigned long)arg->vec);
+
+	/* Validate memory pointers */
+	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+		return -EINVAL;
+	if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+		return -EFAULT;
+	if (!arg->vec && arg->vec_len)
+		return -EINVAL;
+	/* vec_len is 64 bits wide; reject values a 32-bit size_t cannot hold. */
+	if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+		return -EINVAL;
+	/*
+	 * size_mul() saturates instead of wrapping, so an oversized vec_len
+	 * cannot shrink the range that access_ok() validates.
+	 */
+	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+				   size_mul(arg->vec_len,
+					    sizeof(struct page_region))))
+		return -EFAULT;
+
+	/* Fixup default values */
+	arg->end = ALIGN(arg->end, PAGE_SIZE);
+	arg->walk_end = 0;
+	if (!arg->max_pages)
+		arg->max_pages = ULONG_MAX;
+
+	return 0;
+}
+
+/*
+ * Copy the walk_end field of @arg back into the user's pm_scan_arg at
+ * @uargl.  Returns 0 on success or -EFAULT if the copy fails.
+ */
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+				       unsigned long uargl)
+{
+	struct pm_scan_arg __user *dst = (void __user *)uargl;
+	unsigned long not_copied;
+
+	not_copied = copy_to_user(&dst->walk_end, &arg->walk_end,
+				  sizeof(arg->walk_end));
+
+	return not_copied ? -EFAULT : 0;
+}
+
+/*
+ * Allocate the kernel-side bounce buffer for output regions.
+ *
+ * Nothing is allocated when the request has no output vector.  Otherwise
+ * the buffer holds at most PAGEMAP_WALK_SIZE >> PAGE_SHIFT regions (fewer
+ * if the user vector is smaller), its first slot is marked empty and
+ * vec_out is pointed at the user vector.  Returns 0 or -ENOMEM.
+ */
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+	struct page_region *buf;
+
+	if (p->arg.vec_len == 0)
+		return 0;
+
+	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+			       p->arg.vec_len);
+
+	buf = kmalloc_array(p->vec_buf_len, sizeof(*buf), GFP_KERNEL);
+	p->vec_buf = buf;
+	if (!buf)
+		return -ENOMEM;
+
+	/* An all-zero first slot is the "nothing recorded yet" sentinel. */
+	buf->end = 0;
+	buf->start = 0;
+
+	p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+	return 0;
+}
+
+/*
+ * Copy the regions collected in the bounce buffer to the user vector and
+ * reset the buffer for the next walk.
+ *
+ * Returns the number of regions copied (0 when there is no buffer or
+ * nothing was recorded) or -EFAULT if the copy to user space fails.
+ */
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ /* The slot at vec_buf_index counts only if something was recorded in it. */
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ /* Account for the user slots just consumed. */
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ /* Start over with an empty first slot, never exceeding what the user has left. */
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+/*
+ * Implementation of the PAGEMAP_SCAN ioctl for address space @mm; @uarg is
+ * the user address of a struct pm_scan_arg.
+ *
+ * The requested range is walked in rounds: each round runs under the mmap
+ * read lock until the walk finishes or the bounce buffer / page budget is
+ * exhausted (-ENOSPC), after which the collected regions are flushed to
+ * user space.  Returns the number of regions written, or a negative errno.
+ * The address where the scan stopped is written back to the walk_end field
+ * of the user's structure.
+ */
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct mmu_notifier_range range;
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ /*
+ * NOTE(review): the notifier range is opened before the mmap lock is
+ * taken and always spans the whole request, even if the walk stops
+ * early - confirm this is intended.
+ */
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, p.arg.start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ /* Each round resumes at the address where the previous one stopped. */
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+ mmap_read_unlock(mm);
+
+ /* Flush with the lock dropped; a copy failure overrides the walk result. */
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ /* Anything but -ENOSPC means the walk completed or failed. */
+ if (ret != -ENOSPC)
+ break;
+
+ /* The user vector or the page budget is used up: stop for good. */
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+/*
+ * ioctl entry point of the pagemap file.  PAGEMAP_SCAN is the only command
+ * handled; every other command yields -EINVAL.  The mm to scan is taken from
+ * file->private_data.
+ */
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	if (cmd != PAGEMAP_SCAN)
+		return -EINVAL;
+
+	return do_pagemap_scan(file->private_data, arg);
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ /* PAGEMAP_SCAN ioctl; the same handler serves native and compat callers. */
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
+ seq_path(m, file_user_path(file), "\n\t= ");
} else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
} else if (vma_is_initial_stack(vma)) {
};
#endif
- static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+ static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Whether UFFD_FEATURE_WP_ASYNC is enabled on @ctx.  A NULL context counts
+ * as "not enabled".
+ */
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+	if (!ctx)
+		return false;
+
+	return (ctx->features & UFFD_FEATURE_WP_ASYNC) != 0;
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
continue;
}
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- } else {
- prev = vma;
- }
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ vma->vm_end, new_flags,
+ NULL_VM_UFFD_CTX);
vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ prev = vma;
}
mmap_write_unlock(mm);
mmput(mm);
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags))
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
goto out_unlock;
/*
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }),
- anon_vma_name(vma));
- if (prev) {
- /* vma_merge() invalidated the mas */
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags))
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
goto out_unlock;
found = true;
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
/*
* Nothing to do: this vma is already registered into this
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags, NULL_VM_UFFD_CTX);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
return ret;
}
+/*
+ * Whether the userfaultfd context attached to @vma (if any) has
+ * UFFD_FEATURE_WP_ASYNC enabled.
+ */
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+	return userfaultfd_wp_async_ctx(ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
#include <linux/linkage.h>
#include <linux/elfcore.h>
#include <linux/elf.h>
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #include <asm/crash_core.h>
+ #endif
+
+ /* Location of a reserved region to hold the crash kernel.
+ */
+ extern struct resource crashk_res;
+ extern struct resource crashk_low_res;
#define CRASH_CORE_NOTE_NAME "CORE"
#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
void *data, size_t data_len);
void final_note(Elf_Word *buf);
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+ #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+ #endif
+ #endif
+
int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base);
- int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base);
- int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base);
+ unsigned long long *crash_size, unsigned long long *crash_base,
+ unsigned long long *low_size, bool *high);
+
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+ #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+ #endif
+ #ifndef CRASH_ALIGN
+ #define CRASH_ALIGN SZ_2M
+ #endif
+ #ifndef CRASH_ADDR_LOW_MAX
+ #define CRASH_ADDR_LOW_MAX SZ_4G
+ #endif
+ #ifndef CRASH_ADDR_HIGH_MAX
+ #define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM()
+ #endif
+
+ void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high);
+ #else
+ /*
+  * Empty stub used when CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION is
+  * not set, so callers need no #ifdef of their own.
+  */
+ static inline void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+ {}
+ #endif
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
struct crash_mem {
unsigned int max_nr_ranges;
unsigned int nr_ranges;
- struct range ranges[];
+ struct range ranges[] __counted_by(max_nr_ranges);
};
extern int crash_exclude_mem_range(struct crash_mem *mem,
#if __has_builtin(__builtin_dynamic_object_size)
#define POS __pass_dynamic_object_size(1)
#define POS0 __pass_dynamic_object_size(0)
-#define __struct_size(p) __builtin_dynamic_object_size(p, 0)
-#define __member_size(p) __builtin_dynamic_object_size(p, 1)
#else
#define POS __pass_object_size(1)
#define POS0 __pass_object_size(0)
-#define __struct_size(p) __builtin_object_size(p, 0)
-#define __member_size(p) __builtin_object_size(p, 1)
#endif
#define __compiletime_lessthan(bounds, length) ( \
__q_size_field, #op), \
#op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
__fortify_size, \
- "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
+ "field \"" #p "\" at " FILE_LINE, \
__p_size_field); \
__underlying_##op(p, q, __fortify_size); \
})
struct root_domain;
struct rq;
struct sched_attr;
-struct sched_param;
struct seq_file;
struct sighand_struct;
struct signal_struct;
extern struct mutex sched_domains_mutex;
#endif
+/* Scheduling parameter block; it carries a single priority value. */
+struct sched_param {
+ int sched_priority;
+};
+
struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
#endif
unsigned int __state;
-#ifdef CONFIG_PREEMPT_RT
/* saved state for "spinlock sleepers" */
unsigned int saved_state;
-#endif
/*
* This begins the randomizable portion of task_struct. Only
struct mm_struct *mm;
struct mm_struct *active_mm;
+ struct address_space *faults_disabled_mapping;
int exit_state;
int exit_code;
* ->sched_remote_wakeup gets used, so it can be in this word.
*/
unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+ unsigned sched_rt_mutex:1;
+#endif
/* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1;
/* PID/PID hash table linkage. */
struct pid *thread_pid;
struct hlist_node pid_links[PIDTYPE_MAX];
- struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done;
struct mem_cgroup *active_memcg;
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *objcg;
+#endif
+
#ifdef CONFIG_BLK_CGROUP
struct gendisk *throttle_disk;
#endif
schedule();
}
-#ifdef __ia64__
-# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
-#else
-# define ___ARCH_SI_IA64(_a1, _a2, _a3)
-#endif
-int force_sig_fault_to_task(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t);
-int force_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
-int send_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t);
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ struct task_struct *t);
+int force_sig_fault(int sig, int code, void __user *addr);
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
while ((t = next_thread(t)) != g)
#define __for_each_thread(signal, t) \
- list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+ lockdep_is_held(&tasklist_lock))
#define for_each_thread(p, t) \
__for_each_thread((p)->signal, t)
return p1->signal == p2->signal;
}
- static inline struct task_struct *next_thread(const struct task_struct *p)
+ /*
+ * __next_thread - step to the thread that follows @p in its thread group.
+ *
+ * Looks up the entry after p->thread_node on p->signal->thread_head with
+ * list_next_or_null_rcu().
+ *
+ * Returns NULL if p is the last thread in the thread group.
+ *
+ * NOTE(review): presumably the caller must be inside an RCU read-side
+ * section or hold tasklist_lock, as for other thread_head walkers - confirm.
+ */
+ static inline struct task_struct *__next_thread(struct task_struct *p)
+ {
+ return list_next_or_null_rcu(&p->signal->thread_head,
+ &p->thread_node,
+ struct task_struct,
+ thread_node);
+ }
+
+ static inline struct task_struct *next_thread(struct task_struct *p)
{
- return list_entry_rcu(p->thread_group.next,
- struct task_struct, thread_group);
+ return __next_thread(p) ?: p->group_leader;
}
static inline int thread_group_empty(struct task_struct *p)
{
- return list_empty(&p->thread_group);
+ return thread_group_leader(p) &&
+ list_is_last(&p->thread_node, &p->signal->thread_head);
}
#define delay_group_leader(p) \
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
+ .faults_disabled_mapping = NULL,
.restart_block = {
.fn = do_no_restart_syscall,
},
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
- .thread_group = LIST_HEAD_INIT(init_task.thread_group),
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_AUDIT
.loginuid = INVALID_UID,
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
- } owners[];
+ } owners[] __counted_by(count);
};
struct audit_tree_mark {
* that makes a difference. Some.
*/
- static struct fsnotify_group *audit_tree_group;
- static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+ static struct fsnotify_group *audit_tree_group __ro_after_init;
+ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+#include "exit.h"
+
/*
* The default value should be high enough to not crash a system that randomly
* crashes its kernel from time to time, but low enough to at least not permit
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
- list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
return 0;
}
-struct waitid_info {
- pid_t pid;
- uid_t uid;
- int status;
- int cause;
-};
-
-struct wait_opts {
- enum pid_type wo_type;
- int wo_flags;
- struct pid *wo_pid;
-
- struct waitid_info *wo_info;
- int wo_stat;
- struct rusage *wo_rusage;
-
- wait_queue_entry_t child_wait;
- int notask_error;
-};
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
return 0;
}
+/*
+ * pid_child_should_wake - decide whether a wakeup for child @p concerns the
+ * waiter described by @wo.
+ *
+ * Returns false when @p fails eligible_pid(), or when __WNOTHREAD is set in
+ * wo->wo_flags and the task stored in wo->child_wait.private is not
+ * p->parent. Returns true otherwise.
+ */
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return false;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+ return false;
+
+ return true;
+}
+
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
child_wait);
struct task_struct *p = key;
- if (!eligible_pid(wo, p))
- return 0;
+ if (pid_child_should_wake(wo, p))
+ return default_wake_function(wait, mode, sync, key);
- if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- return 0;
-
- return default_wake_function(wait, mode, sync, key);
+ return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
return 0;
}
-static long do_wait(struct wait_opts *wo)
+long __do_wait(struct wait_opts *wo)
{
- int retval;
-
- trace_sched_process_wait(wo->wo_pid);
+ long retval;
- init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- wo->child_wait.private = current;
- add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
- set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
- goto end;
+ return retval;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
- goto end;
+ return retval;
retval = ptrace_do_wait(wo, tsk);
if (retval)
- goto end;
+ return retval;
if (wo->wo_flags & __WNOTHREAD)
break;
notask:
retval = wo->notask_error;
- if (!retval && !(wo->wo_flags & WNOHANG)) {
- retval = -ERESTARTSYS;
- if (!signal_pending(current)) {
- schedule();
- goto repeat;
- }
- }
-end:
+ if (!retval && !(wo->wo_flags & WNOHANG))
+ return -ERESTARTSYS;
+
+ return retval;
+}
+
+/*
+ * do_wait - queue current on its signal->wait_chldexit wait queue and call
+ * __do_wait() until it yields a result.
+ *
+ * The loop ends when __do_wait() returns anything other than -ERESTARTSYS,
+ * or when a signal is pending for current; otherwise it calls schedule()
+ * and tries again. The task state is set to TASK_INTERRUPTIBLE before each
+ * __do_wait() call and put back to TASK_RUNNING before the wait queue entry
+ * is removed and the last result is returned.
+ */
+static long do_wait(struct wait_opts *wo)
+{
+ int retval;
+
+ trace_sched_process_wait(wo->wo_pid);
+
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ retval = __do_wait(wo);
+ if (retval != -ERESTARTSYS)
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ } while (1);
+
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
- int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru)
{
- struct wait_opts wo;
+ unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
- long ret;
- unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
}
- wo.wo_type = type;
- wo.wo_pid = pid;
- wo.wo_flags = options;
- wo.wo_info = infop;
- wo.wo_rusage = ru;
+ wo->wo_type = type;
+ wo->wo_pid = pid;
+ wo->wo_flags = options;
+ wo->wo_info = infop;
+ wo->wo_rusage = ru;
if (f_flags & O_NONBLOCK)
- wo.wo_flags |= WNOHANG;
+ wo->wo_flags |= WNOHANG;
+
+ return 0;
+}
+
+/*
+ * kernel_waitid - fill in a wait_opts via kernel_waitid_prepare() and run
+ * do_wait() on it.
+ *
+ * A zero result is turned into -EAGAIN when the caller did not pass WNOHANG
+ * but wo.wo_flags carries it anyway (kernel_waitid_prepare() adds WNOHANG
+ * when O_NONBLOCK is set in its f_flags). The pid stored in wo.wo_pid is
+ * released with put_pid() before returning.
+ */
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
+{
+ struct wait_opts wo;
+ long ret;
+
+ ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ if (ret)
+ return ret;
ret = do_wait(&wo);
- if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+ if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
ret = -EAGAIN;
- put_pid(pid);
+ put_pid(wo.wo_pid);
return ret;
}
get_file(file);
i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(tmp))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
hugetlb_count_init(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->flags = mmf_init_flags(current->mm->flags);
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
mm->flags = default_dump_filter;
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
/**
* get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the file for the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
-
+ *
* The helper doesn't perform checks on @pid which makes it useful for pidfds
* created via CLONE_PIDFD where @pid has no task attached when the pidfd and
* pidfd file are prepared.
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
p->io_uring = NULL;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
clear_posix_cputimers_work(p);
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
return false;
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
kargs->stack += kargs->stack_size;
#endif
}
}
/**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
}
EXPORT_SYMBOL(kthread_stop);
+ /**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and puts its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ *
+ * The reference is dropped only after kthread_stop() has returned.
+ *
+ * Return: the value returned by kthread_stop().
+ */
+ int kthread_stop_put(struct task_struct *k)
+ {
+ int ret;
+
+ ret = kthread_stop(k);
+ put_task_struct(k);
+ return ret;
+ }
+ EXPORT_SYMBOL(kthread_stop_put);
+
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
- sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
#include "sched.h"
#include "stats.h"
-#include "autogroup.h"
#include "autogroup.h"
#include "pelt.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
- break;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
- struct rq_flags rf;
- struct rq *rq;
-
if (!rt_task(p))
return;
/* Protect updates to p->uclamp_* */
- rq = task_rq_lock(p, &rf);
+ guard(task_rq_lock)(p);
__uclamp_update_util_min_rt_default(p);
- task_rq_unlock(rq, p, &rf);
}
static inline struct uclamp_se
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
- rcu_read_lock();
+ guard(rcu)();
cpu_util_update_eff(&root_task_group.css);
- rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
- rcu_read_lock();
+ guard(rcu)();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
}
static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
p->sched_class->prio_changed(rq, p, oldprio);
}
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ rq->curr->sched_class->wakeup_preempt(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
if (READ_ONCE(p->__state) & state)
return 1;
-#ifdef CONFIG_PREEMPT_RT
if (READ_ONCE(p->saved_state) & state)
return -1;
-#endif
+
return 0;
}
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
-#ifdef CONFIG_PREEMPT_RT
- int match;
-
/*
- * Serialize against current_save_and_set_rtlock_wait_state() and
- * current_restore_rtlock_saved_state().
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
*/
- raw_spin_lock_irq(&p->pi_lock);
- match = __task_state_match(p, state);
- raw_spin_unlock_irq(&p->pi_lock);
-
- return match;
-#else
+ guard(raw_spinlock_irq)(&p->pi_lock);
return __task_state_match(p, state);
-#endif
}
/*
return;
}
- preempt_disable();
+ guard(preempt)();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
- preempt_disable();
+ guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
/*
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
rq_lock(rq, rf);
WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
return rq;
}
* it.
*/
WARN_ON_ONCE(!pending->stop_pending);
+ preempt_disable();
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
+ preempt_enable();
return 0;
}
out:
complete = true;
}
+ preempt_disable();
task_rq_unlock(rq, p, rf);
-
if (push_task) {
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
p, &rq->push_work);
}
+ preempt_enable();
if (complete)
complete_all(&pending->done);
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
+ preempt_disable();
task_rq_unlock(rq, p, rf);
-
if (!stop_pending) {
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
}
+ preempt_enable();
if (flags & SCA_MIGRATE_ENABLE)
return 0;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
+ wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
}
activate_task(rq, p, en_flags);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
ttwu_do_wakeup(p);
if (rq->avg_idle > max)
rq->avg_idle = max;
- rq->wake_stamp = jiffies;
- rq->wake_avg_idle = rq->avg_idle / 2;
-
rq->idle_stamp = 0;
}
#endif
* it should preempt the task that is current now.
*/
update_rq_clock(rq);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
ret = 1;
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+/*
+ * Whether CPUs share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ *
+ * A CPU always shares resources with itself; otherwise the two CPUs match
+ * when their per-CPU sd_share_id values are equal.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
* The caller holds p::pi_lock if p != current or has preemption
* disabled when p == current.
*
- * The rules of PREEMPT_RT saved_state:
+ * The rules of saved_state:
*
* The related locking code always holds p::pi_lock when updating
* p::saved_state, which means the code is fully serialized in both cases.
*
- * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- * bits set. This allows to distinguish all wakeup scenarios.
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits are set. This allows all wakeup scenarios to be distinguished.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits are set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (e.g. ARMv9).
*/
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*success = !!(match = __task_state_match(p, state));
-#ifdef CONFIG_PREEMPT_RT
/*
* Saved state preserves the task state across blocking on
- * an RT lock. If the state matches, set p::saved_state to
- * TASK_RUNNING, but do not wake the task because it waits
- * for a lock wakeup. Also indicate success because from
- * the regular waker's point of view this has succeeded.
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
*
* After acquiring the lock the task will restore p::__state
* from p::saved_state which ensures that the regular
*/
if (match < 0)
p->saved_state = TASK_RUNNING;
-#endif
+
return match > 0;
}
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
+ * A similar smp_rmb() lives in __task_needs_rq_lock().
*/
smp_rmb();
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
/* switch_mm_cid() requires the memory barriers above. */
switch_mm_cid(rq, prev, next);
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
struct sched_domain *sd;
int cpu = cpu_of(rq);
- preempt_disable();
- rcu_read_lock();
+ guard(preempt)();
+ guard(rcu)();
+
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
break;
}
raw_spin_rq_lock_irq(rq);
- rcu_read_unlock();
- preempt_enable();
}
static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
switch_count = &prev->nivcsw;
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
static inline void sched_submit_work(struct task_struct *tsk)
{
+ static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
unsigned int task_flags;
- if (task_is_running(tsk))
- return;
+ /*
+ * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ * will use a blocking primitive -- which would lead to recursion.
+ */
+ lock_map_acquire_try(&sched_map);
task_flags = tsk->flags;
/*
* If a worker goes to sleep, notify and ask workqueue whether it
* wants to wake up a task to maintain concurrency.
*/
- if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- if (task_flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- }
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
/*
* spinlock and rwlock must not flush block requests. This will
* make sure to submit it to avoid deadlocks.
*/
blk_flush_plug(tsk->plug, true);
+
+ lock_map_release(&sched_map);
}
static void sched_update_worker(struct task_struct *tsk)
}
}
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
{
- struct task_struct *tsk = current;
-
- sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(SM_NONE);
+ __schedule(sched_mode);
sched_preempt_enable_no_resched();
} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+ lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+ if (!task_is_running(tsk))
+ sched_submit_work(tsk);
+ __schedule_loop(SM_NONE);
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
- do {
- preempt_disable();
- __schedule(SM_RTLOCK_WAIT);
- sched_preempt_enable_no_resched();
- } while (need_resched());
+ __schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
#ifdef CONFIG_RT_MUTEXES
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+/*
+ * Set current->sched_rt_mutex (asserting that it was clear beforehand) and
+ * call sched_submit_work() for current.
+ *
+ * NOTE(review): the flag is written inside the lockdep_assert() argument,
+ * so it is presumably only maintained when that macro evaluates its
+ * argument - confirm against the lockdep_assert() definition.
+ */
+void rt_mutex_pre_schedule(void)
+{
+ lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ sched_submit_work(current);
+}
+
+/*
+ * Run __schedule_loop() in SM_NONE mode. Asserts that
+ * current->sched_rt_mutex is set, i.e. presumably that
+ * rt_mutex_pre_schedule() ran first - confirm with the callers.
+ */
+void rt_mutex_schedule(void)
+{
+ lockdep_assert(current->sched_rt_mutex);
+ __schedule_loop(SM_NONE);
+}
+
+/*
+ * Call sched_update_worker() for current, then clear
+ * current->sched_rt_mutex while asserting that it was set.
+ *
+ * NOTE(review): the flag is cleared inside the lockdep_assert() argument,
+ * so the clear presumably only happens when that macro evaluates its
+ * argument - confirm against the lockdep_assert() definition.
+ */
+void rt_mutex_post_schedule(void)
+{
+ sched_update_worker(current);
+ lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
if (pi_task)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio;
- struct rq_flags rf;
struct rq *rq;
+ int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &rf);
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
update_rq_clock(rq);
/*
*/
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
+ return;
}
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
- task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
return pid ? find_task_by_vpid(pid) : current;
}
+/*
+ * Look up the task for @pid via find_process_by_pid() under an RCU guard and
+ * take a reference with get_task_struct() if one was found.
+ *
+ * Returns the task with its reference raised, or NULL when the lookup finds
+ * nothing. A non-NULL result needs a matching put_task_struct().
+ */
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+/*
+ * CLASS(find_get_task, p)(pid): obtain the task through find_get_task(pid);
+ * the cleanup expression calls put_task_struct() on a non-NULL pointer.
+ * NOTE(review): presumably the cleanup runs when the variable goes out of
+ * scope, per the DEFINE_CLASS() helpers - confirm in <linux/cleanup.h>.
+ */
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
static bool check_same_owner(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred;
- bool match;
+ guard(rcu)();
- rcu_read_lock();
pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
}
/*
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- struct task_struct *p;
- int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
- }
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- return retval;
+ return sched_setscheduler(p, policy, &lparam);
}
/*
unsigned int, flags)
{
struct sched_attr attr;
- struct task_struct *p;
int retval;
if (!uattr || pid < 0 || flags)
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- if (likely(p)) {
- if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
- retval = sched_setattr(p, &attr);
- put_task_struct(p);
- }
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
- return retval;
+ return sched_setattr(p, &attr);
}
/**
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
+ guard(rcu)();
p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
}
- rcu_read_unlock();
return retval;
}
if (!param || pid < 0)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
/*
* This one might sleep, we cannot do it with a spinlock held ...
*/
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
/*
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
- kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
- /*
- * This could race with another potential updater, but this is fine
- * because it'll correctly read the old or the new value. We don't need
- * to guarantee who wins the race as long as it doesn't return garbage.
- */
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
-
- rcu_read_unlock();
+ }
return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
- int ret = 0;
-
/*
* If the task isn't a deadline task or admission control is
* disabled then we don't care about affinity changes.
* tasks allowed to run on all the CPUs in the task's
* root_domain.
*/
- rcu_read_lock();
+ guard(rcu)();
if (!cpumask_subset(task_rq(p)->rd->span, mask))
- ret = -EBUSY;
- rcu_read_unlock();
- return ret;
+ return -EBUSY;
+
+ return 0;
}
#endif
{
struct affinity_context ac;
struct cpumask *user_mask;
- struct task_struct *p;
int retval;
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
return -ESRCH;
- }
-
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- retval = -EPERM;
- goto out_put_task;
- }
- rcu_read_unlock();
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_put_task;
+ return retval;
/*
* With non-SMP configs, user_cpus_ptr/user_mask isn't used and
if (user_mask) {
cpumask_copy(user_mask, in_mask);
} else if (IS_ENABLED(CONFIG_SMP)) {
- retval = -ENOMEM;
- goto out_put_task;
+ return -ENOMEM;
}
ac = (struct affinity_context){
retval = __sched_setaffinity(p, &ac);
kfree(ac.user_mask);
-out_put_task:
- put_task_struct(p);
return retval;
}
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
- unsigned long flags;
int retval;
- rcu_read_lock();
-
- retval = -ESRCH;
+ guard(rcu)();
p = find_process_by_pid(pid);
if (!p)
- goto out_unlock;
+ return -ESRCH;
retval = security_task_getscheduler(p);
if (retval)
- goto out_unlock;
+ return retval;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-out_unlock:
- rcu_read_unlock();
-
- return retval;
+ return 0;
}
/**
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
- unsigned long flags;
int yielded = 0;
- local_irq_save(flags);
- rq = this_rq();
+ scoped_guard (irqsave) {
+ rq = this_rq();
again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
- }
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
+ if (!curr->sched_class->yield_to_task)
+ return 0;
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
+ if (curr->sched_class != p->sched_class)
+ return 0;
- if (task_on_cpu(p_rq, p) || !task_is_running(p))
- goto out_unlock;
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
- yielded = curr->sched_class->yield_to_task(rq, p);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
}
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
-
- if (yielded > 0)
+ if (yielded)
schedule();
return yielded;
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
- struct task_struct *p;
- unsigned int time_slice;
- struct rq_flags rf;
- struct rq *rq;
+ unsigned int time_slice = 0;
int retval;
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &rf);
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
- rcu_read_unlock();
jiffies_to_timespec64(time_slice, t);
return 0;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
/**
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
- free, task_pid_nr(p), ppid,
- read_task_thread_flags(p));
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ free, task_pid_nr(p), task_tgid_nr(p),
+ ppid, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
* Temporarily drop rq->lock such that we can wake-up the stop task.
* Both preemption and IRQs are still disabled.
*/
+ preempt_disable();
raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
+ preempt_enable();
/*
* At this point need_resched() is true and we'll take the loop in
* schedule(). The next pick is obviously going to be the stop task
LIST_HEAD(task_groups);
/* Cacheline aligned slab cache for task_group */
- static struct kmem_cache *task_group_cache __read_mostly;
+ static struct kmem_cache *task_group_cache __ro_after_init;
#endif
void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->wake_stamp = jiffies;
- rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for kdb.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct task_group *group;
- struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &rf);
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
/*
* Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
* group changes.
*/
group = sched_get_task_group(tsk);
if (group == tsk->sched_task_group)
- goto unlock;
+ return;
update_rq_clock(rq);
*/
resched_curr(rq);
}
-
-unlock:
- task_rq_unlock(rq, tsk, &rf);
}
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
cpu_util_update_eff(css);
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
#endif
return 0;
static_branch_enable(&sched_uclamp_used);
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
/* Update effective clamps to track the most restrictive value */
cpu_util_update_eff(of_css(of));
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
-
return nbytes;
}
u64 percent;
u32 rem;
- rcu_read_lock();
- tg = css_tg(seq_css(sf));
- util_clamp = tg->uclamp_req[clamp_id].value;
- rcu_read_unlock();
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ }
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- cpus_read_lock();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- cfs_b->burst = burst;
- __refill_cfs_bandwidth_runtime(cfs_b);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
- /* Restart the period timer (if active) to handle new period expiry: */
- if (runtime_enabled)
- start_cfs_bandwidth(cfs_b);
+ __refill_cfs_bandwidth_runtime(cfs_b);
- raw_spin_unlock_irq(&cfs_b->lock);
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+ }
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- rq_unlock_irq(rq, &rf);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- cpus_read_unlock();
- return ret;
+ return 0;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
* are not the last task to be migrated from this cpu for this mm, so
* there is no need to move src_cid to the destination cpu.
*/
- rcu_read_lock();
+ guard(rcu)();
src_task = rcu_dereference(src_rq->curr);
if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
t->last_mm_cid = -1;
return -1;
}
- rcu_read_unlock();
return src_cid;
}
* the lazy-put flag, this task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
+ scoped_guard (rcu) {
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
}
- rcu_read_unlock();
/*
* The src_cid is unused, so it can be unset.
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
- unsigned long flags;
int cid, lazy_cid;
cid = READ_ONCE(pcpu_cid->cid);
* the lazy-put flag, that task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ return;
}
- rcu_read_unlock();
/*
* The cid is unused, so it can be unset.
* Disable interrupts to keep the window of cid ownership without rq
* lock small.
*/
- local_irq_save(flags);
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
- local_irq_restore(flags);
+ scoped_guard (irqsave) {
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ }
}
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
* snapshot associated with this cid if an active task using the mm is
* observed on this rq.
*/
- rcu_read_lock();
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ return;
+ }
}
- rcu_read_unlock();
if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
return;
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
- rq_unlock_irqrestore(rq, &rf);
+ scoped_guard (rq_lock_irqsave, rq) {
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ }
rseq_set_notify_resume(t);
}
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
- struct ucounts *ucounts = NULL;
+ struct ucounts *ucounts;
long sigpending;
/*
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
- t = p;
- do {
+ __for_each_thread(signal, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- } while_each_thread(p, t);
+ }
return;
}
}
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
- int retval, success;
+ int ret = -ESRCH;
- success = 0;
- retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
- success |= !err;
- retval = err;
+ /*
+ * If group_send_sig_info() succeeds at least once ret
+ * becomes 0 and after that the code below has no effect.
+ * Otherwise we return the last err or -ESRCH if this
+ * process group is empty.
+ */
+ if (ret)
+ ret = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return success ? 0 : retval;
+
+ return ret;
}
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
force_sig(SIGSEGV);
}
-int force_sig_fault_to_task(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t)
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ struct task_struct *t)
{
struct kernel_siginfo info;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ia64__
- info.si_imm = imm;
- info.si_flags = flags;
- info.si_isr = isr;
-#endif
return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}
-int force_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+int force_sig_fault(int sig, int code, void __user *addr)
{
- return force_sig_fault_to_task(sig, code, addr
- ___ARCH_SI_IA64(imm, flags, isr), current);
+ return force_sig_fault_to_task(sig, code, addr, current);
}
-int send_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t)
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
{
struct kernel_siginfo info;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ia64__
- info.si_imm = imm;
- info.si_flags = flags;
- info.si_isr = isr;
-#endif
return send_sig_info(info.si_signo, &info, t);
}
do_notify_parent_cldstop(current, false, why);
/*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
+ * The previous do_notify_parent_cldstop() invocation woke ptracer.
+ * On a PREEMPTION kernel this can result in a preemption requirement
+ * which will be fulfilled after read_unlock() and the ptracer will be
+ * put on the CPU.
+ * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+ * this task to wait in schedule(). If this task gets preempted then it
+ * remains enqueued on the runqueue. The ptracer will observe this and
+ * then sleep for a delay of one HZ tick. In the meantime this task
+ * gets scheduled, enters schedule() and will wait for the ptracer.
+ *
+ * This preemption point is not bad from a correctness point of
+ * view but extends the runtime by one HZ tick time due to the
+ * ptracer's sleep. The preempt-disable section ensures that there
+ * will be no preemption between unlock and schedule() and so
+ * improving the performance since the ptracer will observe that
+ * the tracee is scheduled out once it gets on the CPU.
*
- * XXX: implement read_unlock_no_resched().
+ * On PREEMPT_RT locking tasklist_lock does not disable preemption.
+ * Therefore the task can be preempted after do_notify_parent_cldstop()
+ * before unlocking tasklist_lock so there is no benefit in doing this.
+ *
+ * In fact disabling preemption is harmful on PREEMPT_RT because
+ * the spinlock_t in cgroup_enter_frozen() must not be acquired
+ * with preemption disabled due to the 'sleeping' spinlock
+ * substitution of RT.
*/
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable_no_resched();
schedule();
cgroup_leave_frozen(true);
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
+ struct signal_struct *sig = p->signal;
memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
- maxrss = p->signal->maxrss;
+ maxrss = sig->maxrss;
goto out;
}
switch (who) {
case RUSAGE_BOTH:
case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
+ utime = sig->cutime;
+ stime = sig->cstime;
+ r->ru_nvcsw = sig->cnvcsw;
+ r->ru_nivcsw = sig->cnivcsw;
+ r->ru_minflt = sig->cmin_flt;
+ r->ru_majflt = sig->cmaj_flt;
+ r->ru_inblock = sig->cinblock;
+ r->ru_oublock = sig->coublock;
+ maxrss = sig->cmaxrss;
if (who == RUSAGE_CHILDREN)
break;
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
utime += tgutime;
stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
+ r->ru_nvcsw += sig->nvcsw;
+ r->ru_nivcsw += sig->nivcsw;
+ r->ru_minflt += sig->min_flt;
+ r->ru_majflt += sig->maj_flt;
+ r->ru_inblock += sig->inblock;
+ r->ru_oublock += sig->oublock;
+ if (maxrss < sig->maxrss)
+ maxrss = sig->maxrss;
+ __for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
break;
default:
}
#endif /* CONFIG_ANON_VMA_NAME */
+/*
+ * Report which PR_MDWE_* bits are in effect for the calling task's mm,
+ * derived from the MMF_HAS_MDWE and MMF_HAS_MDWE_NO_INHERIT mm flags.
+ * Returns 0 when neither flag is set.
+ */
+static inline unsigned long get_current_mdwe(void)
+{
+	unsigned long ret = 0;
+
+	if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+		ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+	if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+		ret |= PR_MDWE_NO_INHERIT;
+
+	return ret;
+}
+
static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
+ unsigned long current_bits;
+
if (arg3 || arg4 || arg5)
return -EINVAL;
- if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+ return -EINVAL;
+
+ /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
+ current_bits = get_current_mdwe();
+ if (current_bits && current_bits != bits)
+ return -EPERM; /* Cannot unset the flags */
+
+ if (bits & PR_MDWE_NO_INHERIT)
+ set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
set_bit(MMF_HAS_MDWE, &current->mm->flags);
- else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
- return -EPERM; /* Cannot unset the flag */
return 0;
}
{
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
-
- return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
- PR_MDWE_REFUSE_EXEC_GAIN : 0;
+ return get_current_mdwe();
}
static int prctl_get_auxv(void __user *addr, unsigned long len)
#include <linux/bsearch.h>
#include <linux/sort.h>
- static struct kmem_cache *user_ns_cachep __read_mostly;
+ static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
kfree(ns->projid_map.forward);
kfree(ns->projid_map.reverse);
}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ kfree(ns->binfmt_misc);
+#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
ns_free_inum(&ns->ns);
* process context while holding a pool lock. Bounce to a dedicated kthread
* worker to avoid A-A deadlocks.
*/
- static struct kthread_worker *pwq_release_worker;
+ static struct kthread_worker *pwq_release_worker __ro_after_init;
- struct workqueue_struct *system_wq __read_mostly;
+ struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
- struct workqueue_struct *system_highpri_wq __read_mostly;
+ struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
- struct workqueue_struct *system_long_wq __read_mostly;
+ struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
- struct workqueue_struct *system_unbound_wq __read_mostly;
+ struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
- struct workqueue_struct *system_freezable_wq __read_mostly;
+ struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
- struct workqueue_struct *system_power_efficient_wq __read_mostly;
+ struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
- struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+ struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
{
struct worker *worker;
int id;
- char id_buf[16];
+ char id_buf[23];
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
}
cpus_read_unlock();
+ /*
+ * For unbound pwqs, flushing pwq_release_worker ensures that
+ * pwq_release_workfn() completes before kfree(wq) is called.
+ */
+ if (ret)
+ kthread_flush_worker(pwq_release_worker);
+
return ret;
enomem:
if (wq->cpu_pwq) {
- for_each_possible_cpu(cpu)
- kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ for_each_possible_cpu(cpu) {
+ struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+
+ if (pwq)
+ kmem_cache_free(pwq_cache, pwq);
+ }
free_percpu(wq->cpu_pwq);
wq->cpu_pwq = NULL;
}
}
/**
- * work_on_cpu - run a function in thread context on a particular cpu
+ * work_on_cpu_key - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function arg
+ * @key: The lock class key for lock debugging purposes
*
* It is up to the caller to ensure that the cpu doesn't go offline.
* The caller must not hold any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+ void *arg, struct lock_class_key *key)
{
struct work_for_cpu wfc = { .fn = fn, .arg = arg };
- INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
schedule_work_on(cpu, &wfc.work);
flush_work(&wfc.work);
destroy_work_on_stack(&wfc.work);
return wfc.ret;
}
-EXPORT_SYMBOL_GPL(work_on_cpu);
+EXPORT_SYMBOL_GPL(work_on_cpu_key);
/**
- * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * work_on_cpu_safe_key - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function argument
+ * @key: The lock class key for lock debugging purposes
*
* Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
* any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+ void *arg, struct lock_class_key *key)
{
long ret = -ENODEV;
cpus_read_lock();
if (cpu_online(cpu))
- ret = work_on_cpu(cpu, fn, arg);
+ ret = work_on_cpu_key(cpu, fn, arg, key);
cpus_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
list_for_each_entry(wq, &workqueues, list) {
if (!(wq->flags & WQ_UNBOUND))
continue;
+
/* creating multiple pwqs breaks ordering guarantee */
- if (wq->flags & __WQ_ORDERED)
- continue;
+ if (!list_empty(&wq->pwqs)) {
+ if (wq->flags & __WQ_ORDERED_EXPLICIT)
+ continue;
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (IS_ERR(ctx)) {
region->ar.start = start;
region->ar.end = end;
region->nr_accesses = 0;
+ region->nr_accesses_bp = 0;
INIT_LIST_HEAD(&region->list);
region->age = 0;
}
struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
- enum damos_action action, struct damos_quota *quota,
+ enum damos_action action,
+ unsigned long apply_interval_us,
+ struct damos_quota *quota,
struct damos_watermarks *wmarks)
{
struct damos *scheme;
return NULL;
scheme->pattern = *pattern;
scheme->action = action;
+ scheme->apply_interval_us = apply_interval_us;
+ /*
+ * next_apply_sis will be set when kdamond starts. While kdamond is
+ * running, it will also be updated when the scheme is added to the
+ * DAMON context, or when damon_attrs are updated.
+ */
+ scheme->next_apply_sis = 0;
INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
return scheme;
}
+/*
+ * Compute when scheme @s should next be applied, expressed as a count of
+ * passed sample intervals of @ctx, and store it in s->next_apply_sis.
+ */
+static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx)
+{
+	unsigned long sample = ctx->attrs.sample_interval;
+	unsigned long apply = s->apply_interval_us;
+
+	/* Keep the division below safe when the sample interval is zero. */
+	if (!sample)
+		sample = 1;
+	/* A zero per-scheme interval falls back to the aggregation interval. */
+	if (!apply)
+		apply = ctx->attrs.aggr_interval;
+
+	s->next_apply_sis = ctx->passed_sample_intervals + apply / sample;
+}
+
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
{
list_add_tail(&s->list, &ctx->schemes);
+ damos_set_next_apply_sis(s, ctx);
}
static void damon_del_scheme(struct damos *s)
ctx->attrs.aggr_interval = 100 * 1000;
ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
- ktime_get_coarse_ts64(&ctx->last_aggregation);
- ctx->last_ops_update = ctx->last_aggregation;
+ ctx->passed_sample_intervals = 0;
+ /* These will be set from kdamond_init_intervals_sis() */
+ ctx->next_aggregation_sis = 0;
+ ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
static unsigned int damon_accesses_bp_to_nr_accesses(
unsigned int accesses_bp, struct damon_attrs *attrs)
{
- unsigned int max_nr_accesses =
- attrs->aggr_interval / attrs->sample_interval;
-
- return accesses_bp * max_nr_accesses / 10000;
+ return accesses_bp * damon_max_nr_accesses(attrs) / 10000;
}
/* convert nr_accesses to access ratio in bp (per 10,000) */
static unsigned int damon_nr_accesses_to_accesses_bp(
unsigned int nr_accesses, struct damon_attrs *attrs)
{
- unsigned int max_nr_accesses =
- attrs->aggr_interval / attrs->sample_interval;
-
- return nr_accesses * 10000 / max_nr_accesses;
+ return nr_accesses * 10000 / damon_max_nr_accesses(attrs);
}
static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
{
r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
old_attrs, new_attrs);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
}
* @ctx: monitoring context
* @attrs: monitoring attributes
*
- * This function should not be called while the kdamond is running.
+ * This function should be called while the kdamond is not running, or an
+ * access check results aggregation is not ongoing (e.g., from
+ * &struct damon_callback->after_aggregation or
+ * &struct damon_callback->after_wmarks_check callbacks).
+ *
* Every time interval is in micro-seconds.
*
* Return: 0 on success, negative error code otherwise.
*/
int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
{
+ unsigned long sample_interval = attrs->sample_interval ?
+ attrs->sample_interval : 1;
+ struct damos *s;
+
if (attrs->min_nr_regions < 3)
return -EINVAL;
if (attrs->min_nr_regions > attrs->max_nr_regions)
if (attrs->sample_interval > attrs->aggr_interval)
return -EINVAL;
+ ctx->next_aggregation_sis = ctx->passed_sample_intervals +
+ attrs->aggr_interval / sample_interval;
+ ctx->next_ops_update_sis = ctx->passed_sample_intervals +
+ attrs->ops_update_interval / sample_interval;
+
damon_update_monitoring_results(ctx, attrs);
ctx->attrs = *attrs;
+
+ damon_for_each_scheme(s, ctx)
+ damos_set_next_apply_sis(s, ctx);
+
return 0;
}
if (tsk) {
get_task_struct(tsk);
mutex_unlock(&ctx->kdamond_lock);
- kthread_stop(tsk);
- put_task_struct(tsk);
+ kthread_stop_put(tsk);
return 0;
}
mutex_unlock(&ctx->kdamond_lock);
return err;
}
-/*
- * damon_check_reset_time_interval() - Check if a time interval is elapsed.
- * @baseline: the time to check whether the interval has elapsed since
- * @interval: the time interval (microseconds)
- *
- * See whether the given time interval has passed since the given baseline
- * time. If so, it also updates the baseline to current time for next check.
- *
- * Return: true if the time interval has passed, or false otherwise.
- */
-static bool damon_check_reset_time_interval(struct timespec64 *baseline,
- unsigned long interval)
-{
- struct timespec64 now;
-
- ktime_get_coarse_ts64(&now);
- if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
- interval * 1000)
- return false;
- *baseline = now;
- return true;
-}
-
-/*
- * Check whether it is time to flush the aggregated information
- */
-static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
-{
- return damon_check_reset_time_interval(&ctx->last_aggregation,
- ctx->attrs.aggr_interval);
-}
-
/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
struct damon_region *r;
damon_for_each_region(r, t) {
- trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
+ trace_damon_aggregated(ti, r, damon_nr_regions(t));
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
+ unsigned int nr_accesses = r->nr_accesses_bp / 10000;
sz = damon_sz_region(r);
return s->pattern.min_sz_region <= sz &&
sz <= s->pattern.max_sz_region &&
- s->pattern.min_nr_accesses <= r->nr_accesses &&
- r->nr_accesses <= s->pattern.max_nr_accesses &&
+ s->pattern.min_nr_accesses <= nr_accesses &&
+ nr_accesses <= s->pattern.max_nr_accesses &&
s->pattern.min_age_region <= r->age &&
r->age <= s->pattern.max_age_region;
}
struct timespec64 begin, end;
unsigned long sz_applied = 0;
int err = 0;
+ /*
+ * We plan to support multiple contexts per kdamond, as DAMON sysfs
+ * implies with 'nr_contexts' file. Nevertheless, only a single context
+ * per kdamond is supported for now. So, we can simply use '0' context
+ * index here.
+ */
+ unsigned int cidx = 0;
+ struct damos *siter; /* schemes iterator */
+ unsigned int sidx = 0;
+ struct damon_target *titer; /* targets iterator */
+ unsigned int tidx = 0;
+ bool do_trace = false;
+
+ /* get indices for trace_damos_before_apply() */
+ if (trace_damos_before_apply_enabled()) {
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ damon_for_each_target(titer, c) {
+ if (titer == t)
+ break;
+ tidx++;
+ }
+ do_trace = true;
+ }
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
ktime_get_coarse_ts64(&begin);
if (c->callback.before_damos_apply)
err = c->callback.before_damos_apply(c, t, r, s);
- if (!err)
+ if (!err) {
+ trace_damos_before_apply(cidx, sidx, tidx, r,
+ damon_nr_regions(t), do_trace);
sz_applied = c->ops.apply_scheme(c, t, r, s);
+ }
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
struct damon_target *t;
struct damon_region *r, *next_r;
struct damos *s;
+ unsigned long sample_interval = c->attrs.sample_interval ?
+ c->attrs.sample_interval : 1;
+ bool has_schemes_to_apply = false;
damon_for_each_scheme(s, c) {
+ if (c->passed_sample_intervals != s->next_apply_sis)
+ continue;
+
+ s->next_apply_sis +=
+ (s->apply_interval_us ? s->apply_interval_us :
+ c->attrs.aggr_interval) / sample_interval;
+
if (!s->wmarks.activated)
continue;
+ has_schemes_to_apply = true;
+
damos_adjust_quota(c, s);
}
+ if (!has_schemes_to_apply)
+ return;
+
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
+ l->nr_accesses_bp = l->nr_accesses * 10000;
l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
l->ar.end = r->ar.end;
damon_destroy_region(r, t);
new->age = r->age;
new->last_nr_accesses = r->last_nr_accesses;
+ new->nr_accesses_bp = r->nr_accesses_bp;
damon_insert_region(new, r, damon_next_region(r), t);
}
last_nr_regions = nr_regions;
}
-/*
- * Check whether it is time to check and apply the operations-related data
- * structures.
- *
- * Returns true if it is.
- */
-static bool kdamond_need_update_operations(struct damon_ctx *ctx)
-{
- return damon_check_reset_time_interval(&ctx->last_ops_update,
- ctx->attrs.ops_update_interval);
-}
-
/*
* Check whether current monitoring should be stopped
*
static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
{
- struct sysinfo i;
-
switch (metric) {
case DAMOS_WMARK_FREE_MEM_RATE:
- si_meminfo(&i);
- return i.freeram * 1000 / i.totalram;
+ return global_zone_page_state(NR_FREE_PAGES) * 1000 /
+ totalram_pages();
default:
break;
}
return -EBUSY;
}
+/*
+ * Initialize the sample-interval based schedule of a context.
+ *
+ * Resets the count of passed sample intervals to zero and computes, in units
+ * of sample intervals, the counts at which the next aggregation, the next
+ * operations update, and the next application of each scheme are scheduled.
+ * A zero sample interval is treated as one so that it can be used as a
+ * divisor, and a scheme whose apply interval is zero uses the aggregation
+ * interval instead.
+ */
+static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+{
+ unsigned long sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+ unsigned long apply_interval;
+ struct damos *scheme;
+
+ ctx->passed_sample_intervals = 0;
+ ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
+ ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
+ sample_interval;
+
+ damon_for_each_scheme(scheme, ctx) {
+ apply_interval = scheme->apply_interval_us ?
+ scheme->apply_interval_us : ctx->attrs.aggr_interval;
+ scheme->next_apply_sis = apply_interval / sample_interval;
+ }
+}
+
/*
* The monitoring daemon that runs as a kernel thread
*/
pr_debug("kdamond (%d) starts\n", current->pid);
+ kdamond_init_intervals_sis(ctx);
+
if (ctx->ops.init)
ctx->ops.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
sz_limit = damon_region_sz_limit(ctx);
while (!kdamond_need_stop(ctx)) {
+ /*
+ * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
+ * be changed from after_wmarks_check() or after_aggregation()
+ * callbacks. Read the values here, and use those for this
+ * iteration. That is, new values set by damon_set_attrs() are
+ * respected from the next iteration.
+ */
+ unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
+ unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
+ unsigned long sample_interval = ctx->attrs.sample_interval;
+
if (kdamond_wait_activation(ctx))
break;
ctx->callback.after_sampling(ctx))
break;
- kdamond_usleep(ctx->attrs.sample_interval);
+ kdamond_usleep(sample_interval);
+ ctx->passed_sample_intervals++;
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
- if (kdamond_aggregate_interval_passed(ctx)) {
+ if (ctx->passed_sample_intervals == next_aggregation_sis) {
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
if (ctx->callback.after_aggregation &&
ctx->callback.after_aggregation(ctx))
break;
- if (!list_empty(&ctx->schemes))
- kdamond_apply_schemes(ctx);
+ }
+
+ /*
+ * do kdamond_apply_schemes() after kdamond_merge_regions() if
+ * possible, to reduce overhead
+ */
+ if (!list_empty(&ctx->schemes))
+ kdamond_apply_schemes(ctx);
+
+ sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+ if (ctx->passed_sample_intervals == next_aggregation_sis) {
+ ctx->next_aggregation_sis = next_aggregation_sis +
+ ctx->attrs.aggr_interval / sample_interval;
+
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
if (ctx->ops.reset_aggregated)
ctx->ops.reset_aggregated(ctx);
}
- if (kdamond_need_update_operations(ctx)) {
+ if (ctx->passed_sample_intervals == next_ops_update_sis) {
+ ctx->next_ops_update_sis = next_ops_update_sis +
+ ctx->attrs.ops_update_interval /
+ sample_interval;
if (ctx->ops.update)
ctx->ops.update(ctx);
sz_limit = damon_region_sz_limit(ctx);
return damon_set_regions(t, &addr_range, 1);
}
+/*
+ * damon_moving_sum() - Calculate an inferred moving sum value.
+ * @mvsum: Inferred sum of the last @len_window values.
+ * @nomvsum: Non-moving sum of the last discrete @len_window window values.
+ * @len_window: The number of last values to take care of. Must not be zero,
+ * because it is used as a divisor.
+ * @new_value: New value that will be added to the pseudo moving sum.
+ *
+ * Moving sum (moving average * window size) is good for handling noise, but
+ * the cost of keeping past values can be high for arbitrary window size. This
+ * function implements a lightweight pseudo moving sum function that doesn't
+ * keep the past window values.
+ *
+ * It simply assumes there was no noise in the past, and derives the no-noise
+ * assumed past value to drop from @nomvsum and @len_window. @nomvsum is a
+ * non-moving sum of the last window. For example, if @len_window is 10 and we
+ * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25
+ * values. Hence, this function simply drops @nomvsum / @len_window from the
+ * given @mvsum and adds @new_value.
+ *
+ * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values for
+ * the last window could vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20. For
+ * calculating the next moving sum with a new value, we should drop 0 from 50
+ * and add the new value. However, this function assumes it got value 5 for
+ * each of the last ten times. Based on the assumption, when the next value is
+ * measured, it drops the assumed past value, 5, from the current sum, and adds
+ * the new value to get the updated pseudo-moving sum.
+ *
+ * This means the value could have errors, but the errors disappear on every
+ * @len_window aligned call. For example, if @len_window is 10, the pseudo
+ * moving sum with the 11th value to the 19th value would have an error. But
+ * the sum with the 20th value will not have the error.
+ *
+ * Return: Pseudo-moving sum after getting the @new_value.
+ */
+static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+ unsigned int len_window, unsigned int new_value)
+{
+ return mvsum - nomvsum / len_window + new_value;
+}
+
+/**
+ * damon_update_region_access_rate() - Update the access rate of a region.
+ * @r: The DAMON region to update for its access check result.
+ * @accessed: Whether the region was accessed during the last sampling interval.
+ * @attrs: The damon_attrs of the DAMON context.
+ *
+ * Update the access rate of a region with the region's last sampling interval
+ * access check result. The pseudo moving sum in @r->nr_accesses_bp is kept
+ * scaled by 10000 (one access counts as 10000), so @r->last_nr_accesses is
+ * scaled by the same factor before being used as the previous window's sum.
+ * The plain counter @r->nr_accesses is also incremented when @accessed is
+ * true.
+ *
+ * Usually this will be called by &damon_operations->check_accesses callback.
+ */
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+ struct damon_attrs *attrs)
+{
+ unsigned int len_window = 1;
+
+ /*
+ * sample_interval can be zero, but cannot be larger than
+ * aggr_interval, owing to validation of damon_set_attrs().
+ * When it is zero, the window length stays at its default of one.
+ */
+ if (attrs->sample_interval)
+ len_window = damon_max_nr_accesses(attrs);
+ r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp,
+ r->last_nr_accesses * 10000, len_window,
+ accessed ? 10000 : 0);
+
+ if (accessed)
+ r->nr_accesses++;
+}
+
static int __init damon_init(void)
{
damon_region_cache = KMEM_CACHE(damon_region, 0);
#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
- static struct kmem_cache *mm_slot_cache __read_mostly;
+ static struct kmem_cache *mm_slot_cache __ro_after_init;
struct collapse_control {
bool is_khugepaged;
}
}
-static bool is_refcount_suitable(struct page *page)
+static bool is_refcount_suitable(struct folio *folio)
{
int expected_refcount;
- expected_refcount = total_mapcount(page);
- if (PageSwapCache(page))
- expected_refcount += compound_nr(page);
+ expected_refcount = folio_mapcount(folio);
+ if (folio_test_swapcache(folio))
+ expected_refcount += folio_nr_pages(folio);
- return page_count(page) == expected_refcount;
+ return folio_ref_count(folio) == expected_refcount;
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
+ struct folio *folio = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
goto out;
}
- VM_BUG_ON_PAGE(!PageAnon(page), page);
+ folio = page_folio(page);
+ VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
if (page_mapcount(page) > 1) {
++shared;
}
}
- if (PageCompound(page)) {
- struct page *p;
- page = compound_head(page);
+ if (folio_test_large(folio)) {
+ struct folio *f;
/*
* Check if we have dealt with the compound page
* already
*/
- list_for_each_entry(p, compound_pagelist, lru) {
- if (page == p)
+ list_for_each_entry(f, compound_pagelist, lru) {
+ if (folio == f)
goto next;
}
}
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page)) {
+ if (!folio_trylock(folio)) {
result = SCAN_PAGE_LOCK;
goto out;
}
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(page)) {
- unlock_page(page);
+ if (!is_refcount_suitable(folio)) {
+ folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
}
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
- if (!isolate_lru_page(page)) {
- unlock_page(page);
+ if (!folio_isolate_lru(folio)) {
+ folio_unlock(folio);
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- compound_nr(page));
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
- if (PageCompound(page))
- list_add_tail(&page->lru, compound_pagelist);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+
+ if (folio_test_large(folio))
+ list_add_tail(&folio->lru, compound_pagelist);
next:
/*
* If collapse was initiated by khugepaged, check that there is
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
- (pte_young(pteval) || page_is_young(page) ||
- PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ (pte_young(pteval) || folio_test_young(folio) ||
+ folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
referenced, writable, result);
return result;
}
}
#endif
-static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
nodemask_t *nmask)
{
- *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
- if (unlikely(!*hpage)) {
+ *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
+
+ if (unlikely(!*folio)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
return false;
}
- folio_prep_large_rmappable((struct folio *)*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return true;
}
int node = hpage_collapse_find_target_node(cc);
struct folio *folio;
- if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+ if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
+ *hpage = NULL;
return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ }
- folio = page_folio(*hpage);
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*hpage = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
}
- count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+ count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+
+ *hpage = folio_page(folio, 0);
return SCAN_SUCCEED;
}
int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
}
}
- page = compound_head(page);
-
+ folio = page_folio(page);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
* Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
- node = page_to_nid(page);
+ node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
cc->node_load[node]++;
- if (!PageLRU(page)) {
+ if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
}
- if (PageLocked(page)) {
+ if (folio_test_locked(folio)) {
result = SCAN_PAGE_LOCK;
goto out_unmap;
}
- if (!PageAnon(page)) {
+ if (!folio_test_anon(folio)) {
result = SCAN_PAGE_ANON;
goto out_unmap;
}
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(page)) {
+ if (!is_refcount_suitable(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
- (pte_young(pteval) || page_is_young(page) ||
- PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ (pte_young(pteval) || folio_test_young(folio) ||
+ folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;
}
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
- struct page *hpage;
+ struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
- hpage = find_lock_page(vma->vm_file->f_mapping,
+ folio = filemap_lock_folio(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
- if (!hpage)
+ if (IS_ERR(folio))
return SCAN_PAGE_NULL;
- if (!PageHead(hpage)) {
- result = SCAN_FAIL;
- goto drop_hpage;
- }
-
- if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ if (folio_order(folio) != HPAGE_PMD_ORDER) {
result = SCAN_PAGE_COMPOUND;
- goto drop_hpage;
+ goto drop_folio;
}
result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
*/
goto maybe_install_pmd;
default:
- goto drop_hpage;
+ goto drop_folio;
}
result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
- goto drop_hpage;
+ goto drop_folio;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
*/
- if (hpage + i != page)
+ if (folio_page(folio, i) != page)
goto abort;
}
* page_table_lock) ptl nests inside pml. The less time we hold pml,
* the better; but userfaultfd's mfill_atomic_pte() on a private VMA
* inserts a valid as-if-COWed PTE without even looking up page cache.
- * So page lock of hpage does not protect from it, so we must not drop
+ * So page lock of folio does not protect from it, so we must not drop
* ptl before pgt_pmd is removed, so uffd private needs pml taken now.
*/
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
continue;
/*
* We dropped ptl after the first scan, to do the mmu_notifier:
- * page lock stops more PTEs of the hpage being faulted in, but
+ * page lock stops more PTEs of the folio being faulted in, but
* does not stop write faults COWing anon copies from existing
* PTEs; and does not stop those being swapped out or migrated.
*/
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
- if (hpage + i != page)
+ if (folio_page(folio, i) != page)
goto abort;
/*
/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
- page_ref_sub(hpage, nr_ptes);
- add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ folio_ref_sub(folio, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
}
/* step 4: remove empty page table */
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, hpage)
+ ? set_huge_pmd(vma, haddr, pmd, &folio->page)
: SCAN_SUCCEED;
- goto drop_hpage;
+ goto drop_folio;
abort:
if (nr_ptes) {
flush_tlb_mm(mm);
- page_ref_sub(hpage, nr_ptes);
- add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ folio_ref_sub(folio, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
spin_unlock(pml);
if (notified)
mmu_notifier_invalidate_range_end(&range);
-drop_hpage:
- unlock_page(hpage);
- put_page(hpage);
+drop_folio:
+ folio_unlock(folio);
+ folio_put(folio);
return result;
}
#include <linux/iversion.h>
#include "swap.h"
- static struct vfsmount *shm_mnt;
+ static struct vfsmount *shm_mnt __ro_after_init;
#ifdef CONFIG_SHMEM
/*
#endif
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
- struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
- vm_fault_t *fault_type);
+ struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ struct mm_struct *fault_mm, vm_fault_t *fault_type);
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow large sparse files.
- * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
-static inline int shmem_acct_block(unsigned long flags, long pages)
+static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
if (!(flags & VM_NORESERVE))
return 0;
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static int shmem_inode_acct_block(struct inode *inode, long pages)
+static int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int err = -ENOSPC;
- if (shmem_acct_block(info->flags, pages))
+ if (shmem_acct_blocks(info->flags, pages))
return err;
might_sleep(); /* when quotas */
if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - pages) > 0)
+ if (!percpu_counter_limited_add(&sbinfo->used_blocks,
+ sbinfo->max_blocks, pages))
goto unacct;
err = dquot_alloc_block_nodirty(inode, pages);
- if (err)
+ if (err) {
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
goto unacct;
-
- percpu_counter_add(&sbinfo->used_blocks, pages);
+ }
} else {
err = dquot_alloc_block_nodirty(inode, pages);
if (err)
{
struct address_space *mapping = inode->i_mapping;
- if (shmem_inode_acct_block(inode, pages))
+ if (shmem_inode_acct_blocks(inode, pages))
return false;
/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Like filemap_add_folio, but error if expected item has gone.
+ * Somewhat like filemap_add_folio, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct folio *folio,
struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp,
- struct mm_struct *charge_mm)
+ pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
long nr = folio_nr_pages(folio);
- int error;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
folio->mapping = mapping;
folio->index = index;
- if (!folio_test_swapcache(folio)) {
- error = mem_cgroup_charge(folio, charge_mm, gfp);
- if (error) {
- if (folio_test_pmd_mappable(folio)) {
- count_vm_event(THP_FILE_FALLBACK);
- count_vm_event(THP_FILE_FALLBACK_CHARGE);
- }
- goto error;
- }
- }
+ gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
do {
xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
- if (folio_test_pmd_mappable(folio)) {
- count_vm_event(THP_FILE_ALLOC);
+ if (folio_test_pmd_mappable(folio))
__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
- }
- mapping->nrpages += nr;
__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+ mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
- error = xas_error(&xas);
- goto error;
+ folio->mapping = NULL;
+ folio_ref_sub(folio, nr);
+ return xas_error(&xas);
}
return 0;
-error:
- folio->mapping = NULL;
- folio_ref_sub(folio, nr);
- return error;
}
/*
- * Like delete_from_page_cache, but substitutes swap for @folio.
+ * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
*/
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
cond_resched_rcu();
}
}
-
rcu_read_unlock();
return swapped << PAGE_SHIFT;
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
if (i_uid_needs_update(idmap, attr, inode) ||
i_gid_needs_update(idmap, attr, inode)) {
error = dquot_transfer(idmap, inode, attr);
-
if (error)
return error;
}
if (!error && update_ctime) {
inode_set_ctime_current(inode);
if (update_mtime)
- inode->i_mtime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
inode_inc_iversion(inode);
}
return error;
if (!xa_is_value(folio))
continue;
- error = shmem_swapin_folio(inode, indices[i],
- &folio, SGP_CACHE,
- mapping_gfp_mask(mapping),
- NULL, NULL);
+ error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
+ mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
folio_unlock(folio);
folio_put(folio);
return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
-#ifndef CONFIG_NUMA
-#define vm_policy vm_private_data
-#endif
-static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
- struct shmem_inode_info *info, pgoff_t index)
-{
- /* Create a pseudo vma that just contains the policy */
- vma_init(vma, NULL);
- /* Bias interleave by inode number to distribute better across nodes */
- vma->vm_pgoff = index + info->vfs_inode.i_ino;
- vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-}
-
-static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
-{
- /* Drop reference taken by mpol_shared_policy_lookup() */
- mpol_cond_put(vma->vm_policy);
-}
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx);
-static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
struct page *page;
- struct vm_fault vmf = {
- .vma = &pvma,
- };
- shmem_pseudo_vma_init(&pvma, info, index);
- page = swap_cluster_readahead(swap, gfp, &vmf);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+ mpol_cond_put(mpol);
if (!page)
return NULL;
static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
- struct address_space *mapping = info->vfs_inode.i_mapping;
- pgoff_t hindex;
- struct folio *folio;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- hindex = round_down(index, HPAGE_PMD_NR);
- if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
- XA_PRESENT))
- return NULL;
+ mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
+ page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
+ mpol_cond_put(mpol);
- shmem_pseudo_vma_init(&pvma, info, hindex);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
- shmem_pseudo_vma_destroy(&pvma);
- if (!folio)
- count_vm_event(THP_FILE_FALLBACK);
- return folio;
+ return page_rmappable_folio(page);
}
static struct folio *shmem_alloc_folio(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
+ struct shmem_inode_info *info, pgoff_t index)
{
- struct vm_area_struct pvma;
- struct folio *folio;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- shmem_pseudo_vma_init(&pvma, info, index);
- folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
- shmem_pseudo_vma_destroy(&pvma);
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+ mpol_cond_put(mpol);
- return folio;
+ return (struct folio *)page;
}
-static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
- pgoff_t index, bool huge)
+static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
+ struct inode *inode, pgoff_t index,
+ struct mm_struct *fault_mm, bool huge)
{
+ struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct folio *folio;
- int nr;
- int err;
+ long pages;
+ int error;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
- nr = huge ? HPAGE_PMD_NR : 1;
- err = shmem_inode_acct_block(inode, nr);
- if (err)
- goto failed;
+ if (huge) {
+ pages = HPAGE_PMD_NR;
+ index = round_down(index, HPAGE_PMD_NR);
+
+ /*
+ * Check for conflict before waiting on a huge allocation.
+ * Conflict might be that a huge page has just been allocated
+ * and added to page cache by a racing thread, or that there
+ * is already at least one small page in the huge extent.
+ * Be careful to retry when appropriate, but not forever!
+ * Elsewhere -EEXIST would be the right code, but not here.
+ */
+ if (xa_find(&mapping->i_pages, &index,
+ index + HPAGE_PMD_NR - 1, XA_PRESENT))
+ return ERR_PTR(-E2BIG);
- if (huge)
folio = shmem_alloc_hugefolio(gfp, info, index);
- else
+ if (!folio)
+ count_vm_event(THP_FILE_FALLBACK);
+ } else {
+ pages = 1;
folio = shmem_alloc_folio(gfp, info, index);
- if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
- return folio;
}
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
- err = -ENOMEM;
- shmem_inode_unacct_blocks(inode, nr);
-failed:
- return ERR_PTR(err);
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ gfp &= GFP_RECLAIM_MASK;
+ error = mem_cgroup_charge(folio, fault_mm, gfp);
+ if (error) {
+ if (xa_find(&mapping->i_pages, &index,
+ index + pages - 1, XA_PRESENT)) {
+ error = -EEXIST;
+ } else if (huge) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto unlock;
+ }
+
+ error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
+ if (error)
+ goto unlock;
+
+ error = shmem_inode_acct_blocks(inode, pages);
+ if (error) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ long freed;
+ /*
+ * Try to reclaim some space by splitting a few
+ * large folios beyond i_size on the filesystem.
+ */
+ shmem_unused_huge_shrink(sbinfo, NULL, 2);
+ /*
+ * And do a shmem_recalc_inode() to account for freed pages:
+ * except our folio is there in cache, so not quite balanced.
+ */
+ spin_lock(&info->lock);
+ freed = pages + info->alloced - info->swapped -
+ READ_ONCE(mapping->nrpages);
+ if (freed > 0)
+ info->alloced -= freed;
+ spin_unlock(&info->lock);
+ if (freed > 0)
+ shmem_inode_unacct_blocks(inode, freed);
+ error = shmem_inode_acct_blocks(inode, pages);
+ if (error) {
+ filemap_remove_folio(folio);
+ goto unlock;
+ }
+ }
+
+ shmem_recalc_inode(inode, pages, 0);
+ folio_add_lru(folio);
+ return folio;
+
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(error);
}
/*
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
+ gfp_t gfp, struct mm_struct *fault_mm,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- folio = shmem_swapin(swap, gfp, info, index);
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
if (!folio) {
error = -ENOMEM;
goto failed;
}
error = shmem_add_to_page_cache(folio, mapping, index,
- swp_to_radix_entry(swap), gfp,
- charge_mm);
+ swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vma, vmf, and fault_type are only supplied by shmem_fault:
- * otherwise they are NULL.
+ * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
*/
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf,
- vm_fault_t *fault_type)
+ struct vm_fault *vmf, vm_fault_t *fault_type)
{
- struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo;
- struct mm_struct *charge_mm;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+ struct mm_struct *fault_mm;
struct folio *folio;
- pgoff_t hindex;
- gfp_t huge_gfp;
int error;
- int once = 0;
- int alloced = 0;
+ bool alloced;
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
repeat:
if (sgp <= SGP_CACHE &&
- ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
return -EINVAL;
- }
- sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : NULL;
+ alloced = false;
+ fault_mm = vma ? vma->vm_mm : NULL;
- folio = filemap_get_entry(mapping, index);
+ folio = filemap_get_entry(inode->i_mapping, index);
if (folio && vma && userfaultfd_minor(vma)) {
if (!xa_is_value(folio))
folio_put(folio);
if (xa_is_value(folio)) {
error = shmem_swapin_folio(inode, index, &folio,
- sgp, gfp, vma, fault_type);
+ sgp, gfp, fault_mm, fault_type);
if (error == -EEXIST)
goto repeat;
folio_lock(folio);
/* Has the folio been truncated or swapped out? */
- if (unlikely(folio->mapping != mapping)) {
+ if (unlikely(folio->mapping != inode->i_mapping)) {
folio_unlock(folio);
folio_put(folio);
goto repeat;
return 0;
}
- if (!shmem_is_huge(inode, index, false,
- vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
- goto alloc_nohuge;
+ if (shmem_is_huge(inode, index, false, fault_mm,
+ vma ? vma->vm_flags : 0)) {
+ gfp_t huge_gfp;
- huge_gfp = vma_thp_gfp_mask(vma);
- huge_gfp = limit_gfp_mask(huge_gfp, gfp);
- folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
- if (IS_ERR(folio)) {
-alloc_nohuge:
- folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
+ huge_gfp = vma_thp_gfp_mask(vma);
+ huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ folio = shmem_alloc_and_add_folio(huge_gfp,
+ inode, index, fault_mm, true);
+ if (!IS_ERR(folio)) {
+ count_vm_event(THP_FILE_ALLOC);
+ goto alloced;
+ }
+ if (PTR_ERR(folio) == -EEXIST)
+ goto repeat;
}
- if (IS_ERR(folio)) {
- int retry = 5;
+ folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
+ if (IS_ERR(folio)) {
error = PTR_ERR(folio);
+ if (error == -EEXIST)
+ goto repeat;
folio = NULL;
- if (error != -ENOSPC)
- goto unlock;
- /*
- * Try to reclaim some space by splitting a large folio
- * beyond i_size on the filesystem.
- */
- while (retry--) {
- int ret;
-
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- if (ret == SHRINK_STOP)
- break;
- if (ret)
- goto alloc_nohuge;
- }
goto unlock;
}
- hindex = round_down(index, folio_nr_pages(folio));
-
- if (sgp == SGP_WRITE)
- __folio_set_referenced(folio);
-
- error = shmem_add_to_page_cache(folio, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK,
- charge_mm);
- if (error)
- goto unacct;
-
- folio_add_lru(folio);
- shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
+alloced:
alloced = true;
-
if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
folio_next_index(folio) - 1) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_inode_info *info = SHMEM_I(inode);
/*
* Part of the large folio is beyond i_size: subject
* to shrink under memory pressure.
spin_unlock(&sbinfo->shrinklist_lock);
}
+ if (sgp == SGP_WRITE)
+ folio_set_referenced(folio);
/*
* Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/
/* Perhaps the file has been truncated since we checked */
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- if (alloced) {
- folio_clear_dirty(folio);
- filemap_remove_folio(folio);
- shmem_recalc_inode(inode, 0, 0);
- }
error = -EINVAL;
goto unlock;
}
/*
* Error recovery.
*/
-unacct:
- shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
-
- if (folio_test_large(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- goto alloc_nohuge;
- }
unlock:
+ if (alloced)
+ filemap_remove_folio(folio);
+ shmem_recalc_inode(inode, 0, 0);
if (folio) {
folio_unlock(folio);
folio_put(folio);
}
- if (error == -ENOSPC && !once++) {
- shmem_recalc_inode(inode, 0, 0);
- goto repeat;
- }
- if (error == -EEXIST)
- goto repeat;
return error;
}
enum sgp_type sgp)
{
return shmem_get_folio_gfp(inode, index, foliop, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
/*
* entry unconditionally - even if something else had already woken the
* target.
*/
-static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
list_del_init(&wait->entry);
return ret;
}
+/*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_rwsem. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_rwsem in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ *
+ * Returns 0 when vmf->pgoff lies outside the range recorded in
+ * inode->i_private (or no waitqueue is published there), so that the
+ * caller carries on with the fault. Otherwise sleeps on that waitqueue
+ * and returns VM_FAULT_RETRY if maybe_unlock_mmap_for_io() handed back
+ * a pinned file, else VM_FAULT_NOPAGE.
+ */
+static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
+{
+ struct shmem_falloc *shmem_falloc;
+ struct file *fpin = NULL; /* non-NULL only if maybe_unlock_mmap_for_io() returns a file */
+ vm_fault_t ret = 0;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private; /* sampled under i_lock */
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
+
+ ret = VM_FAULT_NOPAGE;
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ shmem_falloc_waitq = shmem_falloc->waitq; /* saved before i_lock is dropped */
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ }
+ spin_unlock(&inode->i_lock); /* both the waiting and the non-waiting path arrive here holding i_lock */
+ if (fpin) {
+ fput(fpin);
+ ret = VM_FAULT_RETRY; /* replaces the VM_FAULT_NOPAGE set in the waiting branch */
+ }
+ return ret;
+}
+
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
struct folio *folio = NULL;
+ vm_fault_t ret = 0;
int err;
- vm_fault_t ret = VM_FAULT_LOCKED;
/*
* Trinity finds that probing a hole which tmpfs is punching can
- * prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_rwsem. So refrain from
- * faulting pages into the hole while it's being punched. Although
- * shmem_undo_range() does remove the additions, it may be unable to
- * keep up, as each new page needs its own unmap_mapping_range() call,
- * and the i_mmap tree grows ever slower to scan if new vmas are added.
- *
- * It does not matter if we sometimes reach this check just before the
- * hole-punch begins, so that one fault then races with the punch:
- * we just need to make racing faults a rare case.
- *
- * The implementation below would be much simpler if we just used a
- * standard mutex or completion: but we cannot take i_rwsem in fault,
- * and bloating every shmem inode for this unlikely case would be sad.
+ * prevent the hole-punch from ever completing: noted in i_private.
*/
if (unlikely(inode->i_private)) {
- struct shmem_falloc *shmem_falloc;
-
- spin_lock(&inode->i_lock);
- shmem_falloc = inode->i_private;
- if (shmem_falloc &&
- shmem_falloc->waitq &&
- vmf->pgoff >= shmem_falloc->start &&
- vmf->pgoff < shmem_falloc->next) {
- struct file *fpin;
- wait_queue_head_t *shmem_falloc_waitq;
- DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
-
- ret = VM_FAULT_NOPAGE;
- fpin = maybe_unlock_mmap_for_io(vmf, NULL);
- if (fpin)
- ret = VM_FAULT_RETRY;
-
- shmem_falloc_waitq = shmem_falloc->waitq;
- prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->i_lock);
- schedule();
-
- /*
- * shmem_falloc_waitq points into the shmem_fallocate()
- * stack of the hole-punching task: shmem_falloc_waitq
- * is usually invalid by the time we reach here, but
- * finish_wait() does not dereference it in that case;
- * though i_lock needed lest racing with wake_up_all().
- */
- spin_lock(&inode->i_lock);
- finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
- spin_unlock(&inode->i_lock);
-
- if (fpin)
- fput(fpin);
+ ret = shmem_falloc_wait(vmf, inode);
+ if (ret)
return ret;
- }
- spin_unlock(&inode->i_lock);
}
+ WARN_ON_ONCE(vmf->page != NULL);
err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
- gfp, vma, vmf, &ret);
+ gfp, vmf, &ret);
if (err)
return vmf_error(err);
- if (folio)
+ if (folio) {
vmf->page = folio_file_page(folio, vmf->pgoff);
+ ret |= VM_FAULT_LOCKED;
+ }
return ret;
}
}
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long addr, pgoff_t *ilx)
{
struct inode *inode = file_inode(vma->vm_file);
pgoff_t index;
+ /*
+ * Bias interleave by inode number to distribute better across nodes;
+ * but this interface is independent of which page order is used, so
+ * supplies only that bias, letting caller apply the offset (adjusted
+ * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+ */
+ *ilx = inode->i_ino;
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
-#endif
+
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+ struct mempolicy *mpol;
+
+ /*
+ * Bias interleave by inode number to distribute better across nodes:
+ * *ilx receives the inode number plus the page index scaled down by
+ * the requested order.
+ */
+ *ilx = info->vfs_inode.i_ino + (index >> order);
+
+ mpol = mpol_shared_policy_lookup(&info->policy, index);
+ return mpol ? mpol : get_task_policy(current); /* use current's task policy when the lookup returns NULL */
+}
+#else
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+ *ilx = 0; /* this #else variant supplies no interleave bias */
+ return NULL; /* and never returns a mempolicy */
+}
+#endif /* CONFIG_NUMA */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
struct shmem_inode_info *info = SHMEM_I(inode);
int ret;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
if (err)
return ERR_PTR(err);
-
inode = new_inode(sb);
if (!inode) {
shmem_free_inode(sb, 0);
inode->i_ino = ino;
inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
- info->i_crtime = inode->i_mtime;
+ info->i_crtime = inode_get_mtime(inode);
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
if (info->fsflags)
shmem_set_inode_flags(inode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
- INIT_LIST_HEAD(&info->swaplist);
- if (sbinfo->noswap)
- mapping_set_unevictable(inode->i_mapping);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
int ret;
pgoff_t max_off;
- if (shmem_inode_acct_block(inode, 1)) {
+ if (shmem_inode_acct_blocks(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
* and now we find ourselves with -ENOMEM. Release the page, to
if (unlikely(pgoff >= max_off))
goto out_release;
- ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
+ ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
+ if (ret)
+ goto out_release;
+ ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
if (ret)
goto out_release;
}
ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
-
if (ret)
return ret;
error = simple_acl_create(dir, inode);
if (error)
goto out_iput;
- error = security_inode_init_security(inode, dir,
- &dentry->d_name,
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
if (error && error != -EOPNOTSUPP)
goto out_iput;
goto out_iput;
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
-
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto err_out;
}
-
- error = security_inode_init_security(inode, dir,
- NULL,
+ error = security_inode_init_security(inode, dir, NULL,
shmem_initxattrs, NULL);
if (error && error != -EOPNOTSUPP)
goto out_iput;
/*
* Link a file..
*/
-static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+static int shmem_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
int ret = 0;
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
- dget(dentry); /* Extra pinning count for the created dentry */
+ dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
out:
return ret;
simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
drop_nlink(inode);
- dput(dentry); /* Undo the count from "create" - this does all the work */
+ dput(dentry); /* Undo the count from "create" - does all the work */
return 0;
}
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
-
if (IS_ERR(inode))
return PTR_ERR(inode);
folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
folio_put(arg);
}
-static const char *shmem_get_link(struct dentry *dentry,
- struct inode *inode,
+static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *done)
{
struct folio *folio = NULL;
* Callback for security_inode_init_security() for acquiring xattrs.
*/
static int shmem_initxattrs(struct inode *inode,
- const struct xattr *xattr_array,
- void *fs_info)
+ const struct xattr *xattr_array, void *fs_info)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
.set = shmem_xattr_handler_set,
};
-static const struct xattr_handler *shmem_xattr_handlers[] = {
+static const struct xattr_handler * const shmem_xattr_handlers[] = {
&shmem_security_xattr_handler,
&shmem_trusted_xattr_handler,
&shmem_user_xattr_handler,
return alias ?: d_find_any_alias(inode);
}
-
static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
}
#endif /* CONFIG_TMPFS_QUOTA */
- inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
- VM_NORESERVE);
+ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
+ S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto failed;
#endif
};
- static struct kmem_cache *shmem_inode_cachep;
+ static struct kmem_cache *shmem_inode_cachep __ro_after_init;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
inode_init_once(&info->vfs_inode);
}
- static void shmem_init_inodecache(void)
+ static void __init shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}
- static void shmem_destroy_inodecache(void)
+ static void __init shmem_destroy_inodecache(void)
{
kmem_cache_destroy(shmem_inode_cachep);
}
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
-#ifdef CONFIG_SHMEM
.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
-#else
- .fs_flags = FS_USERNS_MOUNT,
-#endif
};
void __init shmem_init(void)
for (i = 0; i < ARRAY_SIZE(values); i++) {
len += sysfs_emit_at(buf, len,
- shmem_huge == values[i] ? "%s[%s]" : "%s%s",
- i ? " " : "",
- shmem_format_huge(values[i]));
+ shmem_huge == values[i] ? "%s[%s]" : "%s%s",
+ i ? " " : "", shmem_format_huge(values[i]));
}
-
len += sysfs_emit_at(buf, len, "\n");
return len;
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
-static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
{
struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
return inode ? inode : ERR_PTR(-ENOSPC);
/* common code */
-static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
- unsigned long flags, unsigned int i_flags)
+static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
+ loff_t size, unsigned long flags, unsigned int i_flags)
{
struct inode *inode;
struct file *res;
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
-
if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
return ERR_CAST(inode);
BUG_ON(!shmem_mapping(mapping));
error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
- gfp, NULL, NULL, NULL);
+ gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
pf(VID_RND) /* Random VLAN ID */ \
pf(SVID_RND) /* Random SVLAN ID */ \
pf(NODE) /* Node memory alloc*/ \
+ pf(SHARED) /* Shared SKB */ \
#define pf(flag) flag##_SHIFT,
enum pkt_flags {
seq_puts(seq, " Flags: ");
for (i = 0; i < NR_PKT_FLAGS; i++) {
- if (i == F_FLOW_SEQ)
+ if (i == FLOW_SEQ_SHIFT)
if (!pkt_dev->cflows)
continue;
- if (pkt_dev->flags & (1 << i))
+ if (pkt_dev->flags & (1 << i)) {
seq_printf(seq, "%s ", pkt_flag_names[i]);
- else if (i == F_FLOW_SEQ)
- seq_puts(seq, "FLOW_RND ");
-
#ifdef CONFIG_XFRM
- if (i == F_IPSEC && pkt_dev->spi)
- seq_printf(seq, "spi:%u", pkt_dev->spi);
+ if (i == IPSEC_SHIFT && pkt_dev->spi)
+ seq_printf(seq, "spi:%u ", pkt_dev->spi);
#endif
+ } else if (i == FLOW_SEQ_SHIFT) {
+ seq_puts(seq, "FLOW_RND ");
+ }
}
seq_puts(seq, "\n");
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
- if (value > 0 && pkt_dev->n_imix_entries > 0)
+ if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ !(pkt_dev->flags & F_SHARED)))
return -EINVAL;
i += len;
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
+
+ if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ return -EINVAL;
+
pkt_dev->burst = value < 1 ? 1 : value;
sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
return count;
}
if (!strcmp(name, "flag")) {
+ bool disable = false;
__u32 flag;
char f[32];
- bool disable = false;
+ char *end;
memset(f, 0, 32);
len = strn_len(&user_buffer[i], sizeof(f) - 1);
i += len;
flag = pktgen_read_flag(f, &disable);
-
if (flag) {
- if (disable)
+ if (disable) {
+ /* If "clone_skb", or "burst" parameters are
+ * configured, it means that the skb still
+ * needs to be referenced by the pktgen, so
+ * the skb must be shared.
+ */
+ if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ pkt_dev->burst > 1))
+ return -EINVAL;
pkt_dev->flags &= ~flag;
- else
+ } else {
pkt_dev->flags |= flag;
- } else {
- sprintf(pg_result,
- "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
- "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
- "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
- "NO_TIMESTAMP, "
-#ifdef CONFIG_XFRM
- "IPSEC, "
-#endif
- "NODE_ALLOC\n");
+ }
+
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
- sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+
+ /* Unknown flag */
+ end = pkt_dev->result + sizeof(pkt_dev->result);
+ pg_result += sprintf(pg_result,
+ "Flag -:%s:- unknown\n"
+ "Available flags, (prepend ! to un-set flag):\n", f);
+
+ for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ continue;
+ pg_result += snprintf(pg_result, end - pg_result,
+ "%s, ", pkt_flag_names[n]);
+ }
+ if (!WARN_ON_ONCE(pg_result >= end)) {
+ /* Remove the comma and whitespace at the end */
+ *(pg_result - 2) = '\0';
+ }
+
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
- unsigned int burst = READ_ONCE(pkt_dev->burst);
+ bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ unsigned int burst = 1;
struct sk_buff *skb;
+ int clone_skb = 0;
int ret;
+ /* If 'skb_shared' is false, the read of possible
+ * new values (if any) for 'burst' and 'clone_skb' will be skipped to
+ * prevent some concurrent changes from slipping in. And the stabilized
+ * config will be read in during the next run of pktgen_xmit.
+ */
+ if (skb_shared) {
+ burst = READ_ONCE(pkt_dev->burst);
+ clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ }
+
/* If device is offline, then don't send */
if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
pktgen_stop_device(pkt_dev);
/* If no skb or clone count exhausted then get new one */
if (!pkt_dev->skb || (pkt_dev->last_ok &&
- ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ ++pkt_dev->clone_count >= clone_skb)) {
/* build a new pkt */
kfree_skb(pkt_dev->skb);
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
skb = pkt_dev->skb;
skb->protocol = eth_type_trans(skb, skb->dev);
- refcount_add(burst, &skb->users);
+ if (skb_shared)
+ refcount_add(burst, &skb->users);
local_bh_disable();
do {
ret = netif_receive_skb(skb);
pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
+ if (unlikely(!skb_shared)) {
+ pkt_dev->skb = NULL;
+ break;
+ }
if (refcount_read(&skb->users) != burst) {
/* skb was queued by rps/rfs or taps,
* so cannot reuse this skb
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
local_bh_disable();
- refcount_inc(&pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_inc(&pkt_dev->skb->users);
ret = dev_queue_xmit(pkt_dev->skb);
+
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NET_XMIT_SUCCESS:
pkt_dev->sofar++;
pkt_dev->last_ok = 0;
goto unlock;
}
- refcount_add(burst, &pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_add(burst, &pkt_dev->skb->users);
xmit_more:
ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NETDEV_TX_OK:
pkt_dev->last_ok = 1;
fallthrough;
case NETDEV_TX_BUSY:
/* Retry it next time */
- refcount_dec(&(pkt_dev->skb->users));
+ if (skb_shared)
+ refcount_dec(&pkt_dev->skb->users);
pkt_dev->last_ok = 0;
}
if (unlikely(burst))
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- pktgen_wait_for_skb(pkt_dev);
+ if (pkt_dev->skb)
+ pktgen_wait_for_skb(pkt_dev);
/* Done with this */
pktgen_stop_device(pkt_dev);
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
pkt_dev->node = NUMA_NO_NODE;
+ pkt_dev->flags = F_SHARED; /* SKB shared by default */
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
list_for_each_safe(q, n, &list) {
t = list_entry(q, struct pktgen_thread, th_list);
list_del(&t->th_list);
- kthread_stop(t->tsk);
- put_task_struct(t->tsk);
+ kthread_stop_put(t->tsk);
kfree(t);
}
static struct rb_root integrity_iint_tree = RB_ROOT;
static DEFINE_RWLOCK(integrity_iint_lock);
- static struct kmem_cache *iint_cache __read_mostly;
+ static struct kmem_cache *iint_cache __ro_after_init;
struct dentry *integrity_dir;
return iint;
}
-static void iint_free(struct integrity_iint_cache *iint)
+#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH+1)
+
+/*
+ * It is not clear that IMA should be nested at all, but as long as it measures
+ * files both on overlayfs and on underlying fs, we need to annotate the iint
+ * mutex to avoid lockdep false positives related to IMA + overlayfs.
+ * See ovl_lockdep_annotate_inode_mutex_key() for more details.
+ *
+ * The lock class is selected by the s_stack_depth of the inode's superblock,
+ * out of IMA_MAX_NESTING classes in total. A depth outside that range warns
+ * once and falls back to class 0. Without CONFIG_LOCKDEP the body is empty.
+ */
+static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint,
+ struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING];
+
+ int depth = inode->i_sb->s_stack_depth;
+
+ if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING))
+ depth = 0; /* keep the array index in bounds */
+
+ lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]);
+#endif
+}
+
+static void iint_init_always(struct integrity_iint_cache *iint,
+ struct inode *inode)
{
- kfree(iint->ima_hash);
iint->ima_hash = NULL;
iint->version = 0;
iint->flags = 0UL;
iint->ima_creds_status = INTEGRITY_UNKNOWN;
iint->evm_status = INTEGRITY_UNKNOWN;
iint->measured_pcrs = 0;
+ mutex_init(&iint->mutex);
+ iint_lockdep_annotate(iint, inode);
+}
+
+static void iint_free(struct integrity_iint_cache *iint)
+{
+ kfree(iint->ima_hash); /* release the hash buffer before the iint itself */
+ mutex_destroy(&iint->mutex); /* counterpart of the mutex_init() done in iint_init_always() */
kmem_cache_free(iint_cache, iint);
}
if (!iint)
return NULL;
+ iint_init_always(iint, inode);
+
write_lock(&integrity_iint_lock);
p = &integrity_iint_tree.rb_node;
iint_free(iint);
}
-static void init_once(void *foo)
+static void iint_init_once(void *foo)
{
struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo;
memset(iint, 0, sizeof(*iint));
- iint->ima_file_status = INTEGRITY_UNKNOWN;
- iint->ima_mmap_status = INTEGRITY_UNKNOWN;
- iint->ima_bprm_status = INTEGRITY_UNKNOWN;
- iint->ima_read_status = INTEGRITY_UNKNOWN;
- iint->ima_creds_status = INTEGRITY_UNKNOWN;
- iint->evm_status = INTEGRITY_UNKNOWN;
- mutex_init(&iint->mutex);
}
static int __init integrity_iintcache_init(void)
{
iint_cache =
kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache),
- 0, SLAB_PANIC, init_once);
+ 0, SLAB_PANIC, iint_init_once);
return 0;
}
DEFINE_LSM(integrity) = {
memory protection key tests
- soft_dirty
test soft dirty page bit semantics
+- pagemap
+ test pagemap_scan IOCTL
- cow
test copy-on-write semantics
- thp
CATEGORY="hugetlb" run_test ./hugepage-vmemmap
CATEGORY="hugetlb" run_test ./hugetlb-madvise
+# For this test, we need one and just one huge page
+echo 1 > /proc/sys/vm/nr_hugepages
+CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
+
if test_selected "hugetlb"; then
echo "NOTE: These hugetlb tests provide minimal coverage. Use"
echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+ echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
CATEGORY="memfd_secret" run_test ./memfd_secret
# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
CATEGORY="soft_dirty" run_test ./soft-dirty
fi
+CATEGORY="pagemap" run_test ./pagemap_ioctl
+
# COW tests
CATEGORY="cow" run_test ./cow