config ARCH_HAS_CPU_FINALIZE_INIT
bool
-# Select if arch init_task must go in the __init_task_data section
-config ARCH_TASK_STRUCT_ON_STACK
- bool
-
-# Select if arch has its private alloc_task_struct() function
-config ARCH_TASK_STRUCT_ALLOCATOR
- bool
-
+ # The architecture has a per-task state that includes the mm's PASID
+ config ARCH_HAS_CPU_PASID
+ bool
+ select IOMMU_MM_DATA
+
config HAVE_ARCH_THREAD_STRUCT_WHITELIST
bool
- depends on !ARCH_TASK_STRUCT_ALLOCATOR
help
An architecture should select this to provide hardened usercopy
knowledge about what region of the thread_struct should be
should be implemented. Without this, the entire thread_struct
field in task_struct will be left whitelisted.
-# Select if arch has its private alloc_thread_stack() function
-config ARCH_THREAD_STACK_ALLOCATOR
- bool
-
# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
config ARCH_WANTS_DYNAMIC_TASK_STRUCT
bool
config HAVE_ARCH_NODE_DEV_GROUP
bool
+config ARCH_HAS_HW_PTE_YOUNG
+ bool
+ help
+ Architectures that select this option are capable of setting the
+ accessed bit in PTE entries when using them as part of linear address
+ translations. Architectures that require runtime check should select
+ this option and override arch_has_hw_pte_young().
+
config ARCH_HAS_NONLEAF_PMD_YOUNG
bool
help
int i = 0;
int order_idx = 0;
- if (array_size <= PAGE_SIZE)
- pages = kzalloc(array_size, GFP_KERNEL);
- else
- pages = vzalloc(array_size);
+ pages = kvzalloc(array_size, GFP_KERNEL);
if (!pages)
return NULL;
EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
- const struct iommu_ops *iommu, bool coherent)
+ bool coherent)
{
struct dma_iommu_mapping *mapping;
#else
static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
- const struct iommu_ops *iommu, bool coherent)
+ bool coherent)
{
}
#endif /* CONFIG_ARM_DMA_USE_IOMMU */
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
- const struct iommu_ops *iommu, bool coherent)
+ bool coherent)
{
/*
* Due to legacy code that sets the ->dma_coherent flag from a bus
if (dev->dma_ops)
return;
- if (iommu)
- arm_setup_iommu_dma_ops(dev, dma_base, size, iommu, coherent);
+ if (device_iommu_mapped(dev))
+ arm_setup_iommu_dma_ops(dev, dma_base, size, coherent);
xen_setup_dma_ops(dev);
dev->archdata.dma_ops_setup = true;
#
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
+ select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
+ select ARCH_HAS_CPU_PASID if IOMMU_SVA
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
+ select GENERIC_CPU_DEVICES
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_ENTRY
select HAS_IOPORT
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
- select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+ select HAVE_ALIGNED_STRUCT_PAGE
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_HUGE_VMALLOC if X86_64
def_bool y
depends on INTEL_IOMMU && ACPI
-config X86_32_SMP
- def_bool y
- depends on X86_32 && SMP
-
config X86_64_SMP
def_bool y
depends on X86_64 && SMP
config HIGHMEM64G
bool "64GB"
- depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
+ depends on X86_HAVE_PAE
select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
config X86_PAE
bool "PAE (Physical Address Extension) Support"
- depends on X86_32 && !HIGHMEM4G
+ depends on X86_32 && X86_HAVE_PAE
select PHYS_ADDR_T_64BIT
select SWIOTLB
help
depends on CPU_SUP_INTEL
depends on X86_64
depends on KVM_INTEL
+ depends on X86_X2APIC
+ select ARCH_KEEP_MEMBLOCK
+ depends on CONTIG_ALLOC
+ depends on !KEXEC_CORE
+ depends on X86_MCE
help
Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
host and certain physical attacks. This option enables necessary TDX
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
*/
static bool try_fixup_enqcmd_gp(void)
{
- #ifdef CONFIG_IOMMU_SVA
+ #ifdef CONFIG_ARCH_HAS_CPU_PASID
u32 pasid;
/*
if (!mm_valid_pasid(current->mm))
return false;
- pasid = current->mm->pasid;
+ pasid = mm_get_enqcmd_pasid(current->mm);
/*
* Did this thread already have its PASID activated?
r->cpu_start = rentry->res->start;
r->dma_start = rentry->res->start - rentry->offset;
r->size = resource_size(rentry->res);
- r->offset = rentry->offset;
r++;
}
}
return fwspec ? fwspec->ops : NULL;
}
- static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
- const u32 *id_in)
+ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in)
{
int err;
const struct iommu_ops *ops;
ops = acpi_iommu_fwspec_ops(dev);
if (ops) {
mutex_unlock(&iommu_probe_device_lock);
- return ops;
+ return 0;
}
err = iort_iommu_configure_id(dev, id_in);
/* Ignore all other errors apart from EPROBE_DEFER */
if (err == -EPROBE_DEFER) {
- return ERR_PTR(err);
+ return err;
} else if (err) {
dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
- return NULL;
+ return -ENODEV;
}
- return acpi_iommu_fwspec_ops(dev);
+ if (!acpi_iommu_fwspec_ops(dev))
+ return -ENODEV;
+ return 0;
}
#else /* !CONFIG_IOMMU_API */
return -ENODEV;
}
- static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
- const u32 *id_in)
+ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in)
{
- return NULL;
+ return -ENODEV;
}
#endif /* !CONFIG_IOMMU_API */
int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
const u32 *input_id)
{
- const struct iommu_ops *iommu;
+ int ret;
if (attr == DEV_DMA_NOT_SUPPORTED) {
set_dma_ops(dev, &dma_dummy_ops);
acpi_arch_dma_setup(dev);
- iommu = acpi_iommu_configure_id(dev, input_id);
- if (PTR_ERR(iommu) == -EPROBE_DEFER)
+ ret = acpi_iommu_configure_id(dev, input_id);
+ if (ret == -EPROBE_DEFER)
return -EPROBE_DEFER;
- arch_setup_dma_ops(dev, 0, U64_MAX,
- iommu, attr == DEV_DMA_COHERENT);
+ /*
+ * Historically this routine doesn't fail driver probing due to errors
+ * in acpi_iommu_configure_id()
+ */
+
+ arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT);
return 0;
}
* Some ACPI devs contain SerialBus resources even though they are not
* attached to a serial bus at all.
*/
+ {ACPI_VIDEO_HID, },
{"MSHW0028", },
/*
* HIDs of device with an UartSerialBusV2 resource for which userspace
}
}
-static u32 acpi_scan_check_dep(acpi_handle handle, bool check_dep)
+static u32 acpi_scan_check_dep(acpi_handle handle)
{
struct acpi_handle_list dep_devices;
- acpi_status status;
u32 count;
int i;
* 2. ACPI nodes describing USB ports.
* Still, checking for _HID catches more then just these cases ...
*/
- if (!check_dep || !acpi_has_method(handle, "_DEP") ||
- !acpi_has_method(handle, "_HID"))
+ if (!acpi_has_method(handle, "_DEP") || !acpi_has_method(handle, "_HID"))
return 0;
- status = acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices);
- if (ACPI_FAILURE(status)) {
+ if (!acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices)) {
acpi_handle_debug(handle, "Failed to evaluate _DEP.\n");
return 0;
}
struct acpi_device_info *info;
struct acpi_dep_data *dep;
bool skip, honor_dep;
+ acpi_status status;
status = acpi_get_object_info(dep_devices.handles[i], &info);
if (ACPI_FAILURE(status)) {
return count;
}
-static acpi_status acpi_bus_check_add(acpi_handle handle, bool check_dep,
+static acpi_status acpi_scan_check_crs_csi2_cb(acpi_handle handle, u32 a, void *b, void **c)
+{
+ acpi_mipi_check_crs_csi2(handle);
+ return AE_OK;
+}
+
+static acpi_status acpi_bus_check_add(acpi_handle handle, bool first_pass,
struct acpi_device **adev_p)
{
struct acpi_device *device = acpi_fetch_acpi_dev(handle);
if (acpi_device_should_be_hidden(handle))
return AE_OK;
- /* Bail out if there are dependencies. */
- if (acpi_scan_check_dep(handle, check_dep) > 0)
- return AE_CTRL_DEPTH;
+ if (first_pass) {
+ acpi_mipi_check_crs_csi2(handle);
+
+ /* Bail out if there are dependencies. */
+ if (acpi_scan_check_dep(handle) > 0) {
+ /*
+ * The entire CSI-2 connection graph needs to be
+ * extracted before any drivers or scan handlers
+ * are bound to struct device objects, so scan
+ * _CRS CSI-2 resource descriptors for all
+ * devices below the current handle.
+ */
+ acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+ ACPI_UINT32_MAX,
+ acpi_scan_check_crs_csi2_cb,
+ NULL, NULL, NULL);
+ return AE_CTRL_DEPTH;
+ }
+ }
fallthrough;
case ACPI_TYPE_ANY: /* for ACPI_ROOT_OBJECT */
}
/*
- * If check_dep is true at this point, the device has no dependencies,
+ * If first_pass is true at this point, the device has no dependencies,
* or the creation of the device object would have been postponed above.
*/
- acpi_add_single_object(&device, handle, type, !check_dep);
+ acpi_add_single_object(&device, handle, type, !first_pass);
if (!device)
return AE_CTRL_DEPTH;
acpi_walk_namespace(ACPI_TYPE_ANY, handle, ACPI_UINT32_MAX,
acpi_bus_check_add_2, NULL, NULL, (void **)&adev);
+
+ /*
+ * Populate the ACPI _CRS CSI-2 software nodes for the ACPI devices that
+ * have been added above.
+ */
+ acpi_mipi_init_crs_csi2_swnodes();
+
acpi_bus_attach(adev, NULL);
}
if (!device)
return -ENODEV;
+ /*
+ * Set up ACPI _CRS CSI-2 software nodes using information extracted
+ * from the _CRS CSI-2 resource descriptors during the ACPI namespace
+ * walk above and MIPI DisCo for Imaging device properties.
+ */
+ acpi_mipi_scan_crs_csi2();
+ acpi_mipi_init_crs_csi2_swnodes();
+
acpi_bus_attach(device, (void *)true);
/* Pass 2: Enumerate all of the remaining devices. */
acpi_scan_postponed();
+ acpi_mipi_crs_csi2_cleanup();
+
return 0;
}
EXPORT_SYMBOL(acpi_bus_scan);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
- const struct iommu_dirty_ops amd_dirty_ops;
+ static const struct iommu_dirty_ops amd_dirty_ops;
int amd_iommu_max_glx_val = -1;
*
****************************************************************************/
+ static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
+ {
+ return (pdom && (pdom->flags & PD_IOMMUV2_MASK));
+ }
+
static inline int get_acpihid_device_id(struct device *dev,
struct acpihid_map_entry **entry)
{
if (dev_data->domain)
detach_device(dev);
- dev_iommu_priv_set(dev, NULL);
-
/*
* We keep dev_data around for unplugged devices and reuse it when the
* device is re-plugged - not doing so would introduce a ton of races.
}
static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
- size_t size, u16 domid, int pde)
+ size_t size, u16 domid,
+ ioasid_t pasid, bool gn)
{
u64 inv_address = build_inv_address(address, size);
memset(cmd, 0, sizeof(*cmd));
+
cmd->data[1] |= domid;
cmd->data[2] = lower_32_bits(inv_address);
cmd->data[3] = upper_32_bits(inv_address);
+ /* PDE bit - we want to flush everything, not only the PTEs */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ if (gn) {
+ cmd->data[0] |= pasid;
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+ }
CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
}
static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
- u64 address, size_t size)
+ u64 address, size_t size,
+ ioasid_t pasid, bool gn)
{
u64 inv_address = build_inv_address(address, size);
memset(cmd, 0, sizeof(*cmd));
+
cmd->data[0] = devid;
cmd->data[0] |= (qdep & 0xff) << 24;
cmd->data[1] = devid;
cmd->data[2] = lower_32_bits(inv_address);
cmd->data[3] = upper_32_bits(inv_address);
- CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
- }
-
- static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid,
- u64 address, bool size)
- {
- memset(cmd, 0, sizeof(*cmd));
-
- address &= ~(0xfffULL);
-
- cmd->data[0] = pasid;
- cmd->data[1] = domid;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[3] = upper_32_bits(address);
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
- if (size)
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- }
-
- static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid,
- int qdep, u64 address, bool size)
- {
- memset(cmd, 0, sizeof(*cmd));
-
- address &= ~(0xfffULL);
+ if (gn) {
+ cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
+ cmd->data[1] |= (pasid & 0xff) << 16;
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
+ }
- cmd->data[0] = devid;
- cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
- cmd->data[0] |= (qdep & 0xff) << 24;
- cmd->data[1] = devid;
- cmd->data[1] |= (pasid & 0xff) << 16;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
- cmd->data[3] = upper_32_bits(address);
- if (size)
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
}
for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
struct iommu_cmd cmd;
build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- dom_id, 1);
+ dom_id, IOMMU_NO_PASID, false);
iommu_queue_command(iommu, &cmd);
}
struct iommu_cmd cmd;
build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- dom_id, 1);
+ dom_id, IOMMU_NO_PASID, false);
iommu_queue_command(iommu, &cmd);
iommu_completion_wait(iommu);
iommu_completion_wait(iommu);
}
- void iommu_flush_all_caches(struct amd_iommu *iommu)
+ void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
{
if (check_feature(FEATURE_IA)) {
amd_iommu_flush_all(iommu);
/*
* Command send function for flushing on-device TLB
*/
- static int device_flush_iotlb(struct iommu_dev_data *dev_data,
- u64 address, size_t size)
+ static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
+ size_t size, ioasid_t pasid, bool gn)
{
struct amd_iommu *iommu;
struct iommu_cmd cmd;
if (!iommu)
return -EINVAL;
- build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
+ build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
+ size, pasid, gn);
return iommu_queue_command(iommu, &cmd);
}
return ret;
}
- if (dev_data->ats_enabled)
- ret = device_flush_iotlb(dev_data, 0, ~0UL);
+ if (dev_data->ats_enabled) {
+ /* Invalidate the entire contents of an IOTLB */
+ ret = device_flush_iotlb(dev_data, 0, ~0UL,
+ IOMMU_NO_PASID, false);
+ }
return ret;
}
* page. Otherwise it flushes the whole TLB of the IOMMU.
*/
static void __domain_flush_pages(struct protection_domain *domain,
- u64 address, size_t size, int pde)
+ u64 address, size_t size)
{
struct iommu_dev_data *dev_data;
struct iommu_cmd cmd;
int ret = 0, i;
+ ioasid_t pasid = IOMMU_NO_PASID;
+ bool gn = false;
+
+ if (pdom_is_v2_pgtbl_mode(domain))
+ gn = true;
- build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
+ build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, gn);
for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
if (!domain->dev_iommu[i])
if (!dev_data->ats_enabled)
continue;
- ret |= device_flush_iotlb(dev_data, address, size);
+ ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
}
WARN_ON(ret);
}
- static void domain_flush_pages(struct protection_domain *domain,
- u64 address, size_t size, int pde)
+ void amd_iommu_domain_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size)
{
if (likely(!amd_iommu_np_cache)) {
- __domain_flush_pages(domain, address, size, pde);
+ __domain_flush_pages(domain, address, size);
+
+ /* Wait until IOMMU TLB and all device IOTLB flushes are complete */
+ amd_iommu_domain_flush_complete(domain);
+
return;
}
flush_size = 1ul << min_alignment;
- __domain_flush_pages(domain, address, flush_size, pde);
+ __domain_flush_pages(domain, address, flush_size);
address += flush_size;
size -= flush_size;
}
+
+ /* Wait until IOMMU TLB and all device IOTLB flushes are complete */
+ amd_iommu_domain_flush_complete(domain);
}
/* Flush the whole IO/TLB for a given protection domain - including PDE */
- void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain)
+ static void amd_iommu_domain_flush_all(struct protection_domain *domain)
{
- domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
+ amd_iommu_domain_flush_pages(domain, 0,
+ CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
void amd_iommu_domain_flush_complete(struct protection_domain *domain)
unsigned long flags;
spin_lock_irqsave(&domain->lock, flags);
- domain_flush_pages(domain, iova, size, 1);
- amd_iommu_domain_flush_complete(domain);
+ amd_iommu_domain_flush_pages(domain, iova, size);
spin_unlock_irqrestore(&domain->lock, flags);
}
}
/* Flush the DTE entry */
device_flush_dte(dev_data);
- /* Flush IOTLB */
- amd_iommu_domain_flush_tlb_pde(domain);
-
- /* Wait for the flushes to finish */
- amd_iommu_domain_flush_complete(domain);
+ /* Flush IOTLB and wait for the flushes to finish */
+ amd_iommu_domain_flush_all(domain);
/* decrease reference counters - needs to happen after the flushes */
domain->dev_iommu[iommu->index] -= 1;
do_attach(dev_data, domain);
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
- amd_iommu_domain_flush_tlb_pde(domain);
-
- amd_iommu_domain_flush_complete(domain);
-
out:
spin_unlock(&dev_data->lock);
amd_iommu_update_and_flush_device_table(domain);
/* Flush domain TLB(s) and wait for completion */
- amd_iommu_domain_flush_tlb_pde(domain);
- amd_iommu_domain_flush_complete(domain);
+ amd_iommu_domain_flush_all(domain);
}
/*****************************************************************************
}
/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
- if (domain_flush) {
- amd_iommu_domain_flush_tlb_pde(pdomain);
- amd_iommu_domain_flush_complete(pdomain);
- }
+ if (domain_flush)
+ amd_iommu_domain_flush_all(pdomain);
+
pdomain->dirty_tracking = enable;
spin_unlock_irqrestore(&pdomain->lock, flags);
unsigned long flags;
spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_tlb_pde(dom);
- amd_iommu_domain_flush_complete(dom);
+ amd_iommu_domain_flush_all(dom);
spin_unlock_irqrestore(&dom->lock, flags);
}
unsigned long flags;
spin_lock_irqsave(&dom->lock, flags);
- domain_flush_pages(dom, gather->start, gather->end - gather->start + 1, 1);
- amd_iommu_domain_flush_complete(dom);
+ amd_iommu_domain_flush_pages(dom, gather->start,
+ gather->end - gather->start + 1);
spin_unlock_irqrestore(&dom->lock, flags);
}
return true;
}
- const struct iommu_dirty_ops amd_dirty_ops = {
+ static const struct iommu_dirty_ops amd_dirty_ops = {
.set_dirty_tracking = amd_iommu_set_dirty_tracking,
.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
};
};
static int __flush_pasid(struct protection_domain *domain, u32 pasid,
- u64 address, bool size)
+ u64 address, size_t size)
{
struct iommu_dev_data *dev_data;
struct iommu_cmd cmd;
if (!(domain->flags & PD_IOMMUV2_MASK))
return -EINVAL;
- build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
+ build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, true);
/*
* IOMMU TLB needs to be flushed before Device TLB to
iommu = rlookup_amd_iommu(dev_data->dev);
if (!iommu)
continue;
- build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
- qdep, address, size);
+ build_inv_iotlb_pages(&cmd, dev_data->devid, qdep,
+ address, size, pasid, true);
ret = iommu_queue_command(iommu, &cmd);
if (ret != 0)
static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid,
u64 address)
{
- return __flush_pasid(domain, pasid, address, false);
+ return __flush_pasid(domain, pasid, address, PAGE_SIZE);
}
int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid)
{
- return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- true);
+ return __flush_pasid(domain, pasid, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid)
return index;
}
- static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
- struct irte_ga *irte)
+ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
+ struct irte_ga *irte)
{
struct irq_remap_table *table;
struct irte_ga *entry;
raw_spin_unlock_irqrestore(&table->lock, flags);
+ return 0;
+ }
+
+ static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
+ struct irte_ga *irte)
+ {
+ bool ret;
+
+ ret = __modify_irte_ga(iommu, devid, index, irte);
+ if (ret)
+ return ret;
+
iommu_flush_irt_and_complete(iommu, devid);
return 0;
data->irq_2_irte.devid = devid;
data->irq_2_irte.index = index + sub_handle;
- iommu->irte_ops->prepare(data->entry, apic->delivery_mode,
+ iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
apic->dest_mode_logical, irq_cfg->vector,
irq_cfg->dest_apicid, devid);
entry->lo.fields_remap.valid = valid;
entry->lo.fields_remap.dm = apic->dest_mode_logical;
- entry->lo.fields_remap.int_type = apic->delivery_mode;
+ entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED;
entry->hi.fields.vector = cfg->vector;
entry->lo.fields_remap.destination =
APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
}
entry->lo.fields_vapic.is_run = is_run;
- return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
- ir_data->irq_2_irte.index, entry);
+ return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry);
}
EXPORT_SYMBOL(amd_iommu_update_ga);
#endif
#ifdef CONFIG_CMA_ALIGNMENT
#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT)
#else
-#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER)
+#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_PAGE_ORDER)
#endif
/*
#define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6)
#define STRTAB_STE_DWORDS 8
+
+ struct arm_smmu_ste {
+ __le64 data[STRTAB_STE_DWORDS];
+ };
+
#define STRTAB_STE_0_V (1UL << 0)
#define STRTAB_STE_0_CFG GENMASK_ULL(3, 1)
#define STRTAB_STE_0_CFG_ABORT 0
struct arm_smmu_strtab_l1_desc {
u8 span;
- __le64 *l2ptr;
+ struct arm_smmu_ste *l2ptr;
dma_addr_t l2ptr_dma;
};
enum arm_smmu_domain_stage {
ARM_SMMU_DOMAIN_S1 = 0,
ARM_SMMU_DOMAIN_S2,
- ARM_SMMU_DOMAIN_NESTED,
ARM_SMMU_DOMAIN_BYPASS,
};
#include <linux/spinlock.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
+ #include <trace/events/swiotlb.h>
#include "dma-iommu.h"
struct page **pages;
unsigned int i = 0, nid = dev_to_node(dev);
- order_mask &= GENMASK(MAX_ORDER, 0);
+ order_mask &= GENMASK(MAX_PAGE_ORDER, 0);
if (!order_mask)
return NULL;
return DMA_MAPPING_ERROR;
}
+ trace_swiotlb_bounced(dev, phys, size);
+
aligned_size = iova_align(iovad, size);
phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size,
iova_mask(iovad), dir, attrs);
{
u32 value, old;
+ if (client->regs.sid.security == 0 && client->regs.sid.override == 0)
+ return;
+
value = readl(mc->regs + client->regs.sid.security);
if ((value & MC_SID_STREAMID_SECURITY_OVERRIDE) == 0) {
/*
static int tegra186_mc_probe_device(struct tegra_mc *mc, struct device *dev)
{
#if IS_ENABLED(CONFIG_IOMMU_API)
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct of_phandle_args args;
unsigned int i, index = 0;
+ u32 sid;
+
+ if (!tegra_dev_iommu_get_stream_id(dev, &sid))
+ return 0;
while (!of_parse_phandle_with_args(dev->of_node, "interconnects", "#interconnect-cells",
index, &args)) {
for (i = 0; i < mc->soc->num_clients; i++) {
const struct tegra_mc_client *client = &mc->soc->clients[i];
- if (client->id == args.args[0]) {
- u32 sid = fwspec->ids[0] & MC_SID_STREAMID_OVERRIDE_MASK;
-
- tegra186_mc_client_sid_override(mc, client, sid);
- }
+ if (client->id == args.args[0])
+ tegra186_mc_client_sid_override(
+ mc, client,
+ sid & MC_SID_STREAMID_OVERRIDE_MASK);
}
}
return 0;
}
+static int tegra186_mc_resume(struct tegra_mc *mc)
+{
+#if IS_ENABLED(CONFIG_IOMMU_API)
+ unsigned int i;
+
+ for (i = 0; i < mc->soc->num_clients; i++) {
+ const struct tegra_mc_client *client = &mc->soc->clients[i];
+
+ tegra186_mc_client_sid_override(mc, client, client->sid);
+ }
+#endif
+
+ return 0;
+}
+
const struct tegra_mc_ops tegra186_mc_ops = {
.probe = tegra186_mc_probe,
.remove = tegra186_mc_remove,
+ .resume = tegra186_mc_resume,
.probe_device = tegra186_mc_probe_device,
.handle_irq = tegra30_mc_handle_irq,
};
int of_dma_configure_id(struct device *dev, struct device_node *np,
bool force_dma, const u32 *id)
{
- const struct iommu_ops *iommu;
const struct bus_dma_region *map = NULL;
struct device_node *bus_np;
u64 dma_start = 0;
u64 mask, end, size = 0;
bool coherent;
+ int iommu_ret;
int ret;
if (np == dev->of_node)
dev_dbg(dev, "device is%sdma coherent\n",
coherent ? " " : " not ");
- iommu = of_iommu_configure(dev, np, id);
- if (PTR_ERR(iommu) == -EPROBE_DEFER) {
+ iommu_ret = of_iommu_configure(dev, np, id);
+ if (iommu_ret == -EPROBE_DEFER) {
/* Don't touch range map if it wasn't set from a valid dma-ranges */
if (!ret)
dev->dma_range_map = NULL;
kfree(map);
return -EPROBE_DEFER;
- }
+ } else if (iommu_ret == -ENODEV) {
+ dev_dbg(dev, "device is not behind an iommu\n");
+ } else if (iommu_ret) {
+ dev_err(dev, "iommu configuration for device failed with %pe\n",
+ ERR_PTR(iommu_ret));
- dev_dbg(dev, "device is%sbehind an iommu\n",
- iommu ? " " : " not ");
+ /*
+ * Historically this routine doesn't fail driver probing
+ * due to errors in of_iommu_configure()
+ */
+ } else
+ dev_dbg(dev, "device is behind an iommu\n");
- arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
+ arch_setup_dma_ops(dev, dma_start, size, coherent);
- if (!iommu)
+ if (iommu_ret)
of_dma_set_restricted_buffer(dev, np);
return 0;
return 0;
}
EXPORT_SYMBOL_GPL(of_device_uevent_modalias);
+
+/**
+ * of_device_make_bus_id - Use the device node data to assign a unique name
+ * @dev: pointer to device structure that is linked to a device tree node
+ *
+ * This routine will first try using the translated bus address to
+ * derive a unique name. If it cannot, then it will prepend names from
+ * parent nodes until a unique name can be derived.
+ */
+void of_device_make_bus_id(struct device *dev)
+{
+ struct device_node *node = dev->of_node;
+ const __be32 *reg;
+ u64 addr;
+ u32 mask;
+
+ /* Construct the name, using parent nodes if necessary to ensure uniqueness */
+ while (node->parent) {
+ /*
+ * If the address can be translated, then that is as much
+ * uniqueness as we need. Make it the first component and return
+ */
+ reg = of_get_property(node, "reg", NULL);
+ if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) {
+ if (!of_property_read_u32(node, "mask", &mask))
+ dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn",
+ addr, ffs(mask) - 1, node, dev_name(dev));
+
+ else
+ dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn",
+ addr, node, dev_name(dev));
+ return;
+ }
+
+ /* format arguments only used if dev_name() resolves to NULL */
+ dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s",
+ kbasename(node->full_name), dev_name(dev));
+ node = node->parent;
+ }
+}
+EXPORT_SYMBOL_GPL(of_device_make_bus_id);
struct acpi_handle_list {
u32 count;
- acpi_handle* handles;
+ acpi_handle *handles;
};
/* acpi_utils.h */
acpi_evaluate_integer(acpi_handle handle,
acpi_string pathname,
struct acpi_object_list *arguments, unsigned long long *data);
-acpi_status
-acpi_evaluate_reference(acpi_handle handle,
- acpi_string pathname,
- struct acpi_object_list *arguments,
- struct acpi_handle_list *list);
+bool acpi_evaluate_reference(acpi_handle handle, acpi_string pathname,
+ struct acpi_object_list *arguments,
+ struct acpi_handle_list *list);
bool acpi_handle_list_equal(struct acpi_handle_list *list1,
struct acpi_handle_list *list2);
void acpi_handle_list_replace(struct acpi_handle_list *dst,
struct acpi_handle_list *src);
void acpi_handle_list_free(struct acpi_handle_list *list);
+bool acpi_device_dep(acpi_handle target, acpi_handle match);
acpi_status
acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code,
struct acpi_buffer *status_buf);
struct acpi_gpio_mapping;
+#define ACPI_DEVICE_SWNODE_ROOT 0
+
+/*
+ * The maximum expected number of CSI-2 data lanes.
+ *
+ * This number is not expected to ever have to be equal to or greater than the
+ * number of bits in an unsigned long variable, but if it needs to be increased
+ * above that limit, code will need to be adjusted accordingly.
+ */
+#define ACPI_DEVICE_CSI2_DATA_LANES 8
+
+#define ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH 8
+
+enum acpi_device_swnode_dev_props {
+ ACPI_DEVICE_SWNODE_DEV_ROTATION,
+ ACPI_DEVICE_SWNODE_DEV_CLOCK_FREQUENCY,
+ ACPI_DEVICE_SWNODE_DEV_LED_MAX_MICROAMP,
+ ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_MICROAMP,
+ ACPI_DEVICE_SWNODE_DEV_FLASH_MAX_TIMEOUT_US,
+ ACPI_DEVICE_SWNODE_DEV_NUM_OF,
+ ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES
+};
+
+enum acpi_device_swnode_port_props {
+ ACPI_DEVICE_SWNODE_PORT_REG,
+ ACPI_DEVICE_SWNODE_PORT_NUM_OF,
+ ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES
+};
+
+enum acpi_device_swnode_ep_props {
+ ACPI_DEVICE_SWNODE_EP_REMOTE_EP,
+ ACPI_DEVICE_SWNODE_EP_BUS_TYPE,
+ ACPI_DEVICE_SWNODE_EP_REG,
+ ACPI_DEVICE_SWNODE_EP_CLOCK_LANES,
+ ACPI_DEVICE_SWNODE_EP_DATA_LANES,
+ ACPI_DEVICE_SWNODE_EP_LANE_POLARITIES,
+ /* TX only */
+ ACPI_DEVICE_SWNODE_EP_LINK_FREQUENCIES,
+ ACPI_DEVICE_SWNODE_EP_NUM_OF,
+ ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES
+};
+
+/*
+ * Each device has a root software node plus two times as many nodes as the
+ * number of CSI-2 ports.
+ */
+#define ACPI_DEVICE_SWNODE_PORT(port) (2 * (port) + 1)
+#define ACPI_DEVICE_SWNODE_EP(endpoint) \
+ (ACPI_DEVICE_SWNODE_PORT(endpoint) + 1)
+
+/**
+ * struct acpi_device_software_node_port - MIPI DisCo for Imaging CSI-2 port
+ * @port_name: Port name.
+ * @data_lanes: "data-lanes" property values.
+ * @lane_polarities: "lane-polarities" property values.
+ * @link_frequencies: "link_frequencies" property values.
+ * @port_nr: Port number.
+ * @crs_crs2_local: _CRS CSI2 record present (i.e. this is a transmitter one).
+ * @port_props: Port properties.
+ * @ep_props: Endpoint properties.
+ * @remote_ep: Reference to the remote endpoint.
+ */
+struct acpi_device_software_node_port {
+ char port_name[ACPI_DEVICE_SWNODE_PORT_NAME_LENGTH + 1];
+ u32 data_lanes[ACPI_DEVICE_CSI2_DATA_LANES];
+ u32 lane_polarities[ACPI_DEVICE_CSI2_DATA_LANES + 1 /* clock lane */];
+ u64 link_frequencies[ACPI_DEVICE_CSI2_DATA_LANES];
+ unsigned int port_nr;
+ bool crs_csi2_local;
+
+ struct property_entry port_props[ACPI_DEVICE_SWNODE_PORT_NUM_ENTRIES];
+ struct property_entry ep_props[ACPI_DEVICE_SWNODE_EP_NUM_ENTRIES];
+
+ struct software_node_ref_args remote_ep[1];
+};
+
+/**
+ * struct acpi_device_software_nodes - Software nodes for an ACPI device
+ * @dev_props: Device properties.
+ * @nodes: Software nodes for root as well as ports and endpoints.
+ * @nodeprts: Array of software node pointers, for (un)registering them.
+ * @ports: Information related to each port and endpoint within a port.
+ * @num_ports: The number of ports.
+ */
+struct acpi_device_software_nodes {
+ struct property_entry dev_props[ACPI_DEVICE_SWNODE_DEV_NUM_ENTRIES];
+ struct software_node *nodes;
+ const struct software_node **nodeptrs;
+ struct acpi_device_software_node_port *ports;
+ unsigned int num_ports;
+};
+
/* Device */
struct acpi_device {
u32 pld_crc;
struct acpi_device_data data;
struct acpi_scan_handler *handler;
struct acpi_hotplug_context *hp;
+ struct acpi_device_software_nodes *swnodes;
const struct acpi_gpio_mapping *driver_gpios;
void *driver_data;
struct device dev;
/* helper */
+ struct iommu_ops;
+
bool acpi_dma_supported(const struct acpi_device *adev);
enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
int acpi_iommu_fwspec_init(struct device *dev, u32 id,
adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set);
}
-bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2);
-bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);
int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer);
+static inline bool acpi_dev_hid_match(struct acpi_device *adev, const char *hid2)
+{
+ const char *hid1 = acpi_device_hid(adev);
+
+ return hid1 && hid2 && !strcmp(hid1, hid2);
+}
+
+static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2)
+{
+ const char *uid1 = acpi_device_uid(adev);
+
+ return uid1 && uid2 && !strcmp(uid1, uid2);
+}
+
+static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2)
+{
+ u64 uid1;
+
+ return !acpi_dev_uid_to_integer(adev, &uid1) && uid1 == uid2;
+}
+
+#define TYPE_ENTRY(type, x) \
+ const type: x, \
+ type: x
+
+#define ACPI_STR_TYPES(match) \
+ TYPE_ENTRY(unsigned char *, match), \
+ TYPE_ENTRY(signed char *, match), \
+ TYPE_ENTRY(char *, match), \
+ TYPE_ENTRY(void *, match)
+
+/**
+ * acpi_dev_uid_match - Match device by supplied UID
+ * @adev: ACPI device to match.
+ * @uid2: Unique ID of the device.
+ *
+ * Matches UID in @adev with given @uid2.
+ *
+ * Returns: %true if matches, %false otherwise.
+ */
+#define acpi_dev_uid_match(adev, uid2) \
+ _Generic(uid2, \
+ /* Treat @uid2 as a string for acpi string types */ \
+ ACPI_STR_TYPES(acpi_str_uid_match), \
+ /* Treat as an integer otherwise */ \
+ default: acpi_int_uid_match)(adev, uid2)
+
+/**
+ * acpi_dev_hid_uid_match - Match device by supplied HID and UID
+ * @adev: ACPI device to match.
+ * @hid2: Hardware ID of the device.
+ * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID.
+ *
+ * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2
+ * will be treated as a match. If user wants to validate @uid2, it should be
+ * done before calling this function.
+ *
+ * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise.
+ */
+#define acpi_dev_hid_uid_match(adev, hid2, uid2) \
+ (acpi_dev_hid_match(adev, hid2) && \
+ (!(uid2) || acpi_dev_uid_match(adev, uid2)))
+
void acpi_dev_clear_dependencies(struct acpi_device *supplier);
bool acpi_dev_ready_for_enumeration(const struct acpi_device *device);
struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier,
struct subsys_private;
struct device_node;
struct fwnode_handle;
- struct iommu_ops;
struct iommu_group;
struct dev_pin_info;
struct dev_iommu;
*/
struct subsys_interface {
const char *name;
- struct bus_type *subsys;
+ const struct bus_type *subsys;
struct list_head node;
int (*add_dev)(struct device *dev, struct subsys_interface *sif);
void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
int subsys_interface_register(struct subsys_interface *sif);
void subsys_interface_unregister(struct subsys_interface *sif);
-int subsys_system_register(struct bus_type *subsys,
+int subsys_system_register(const struct bus_type *subsys,
const struct attribute_group **groups);
-int subsys_virtual_register(struct bus_type *subsys,
+int subsys_virtual_register(const struct bus_type *subsys,
const struct attribute_group **groups);
/*
* @id: device instance
* @devres_lock: Spinlock to protect the resource of the device.
* @devres_head: The resources list of the device.
- * @knode_class: The node used to add the device to the class list.
* @class: The class of the device.
* @groups: Optional attribute groups.
* @release: Callback to free the device after all references have
int device_move(struct device *dev, struct device *new_parent,
enum dpm_order dpm_order);
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);
-int device_is_dependent(struct device *dev, void *target);
static inline bool device_supports_offline(struct device *dev)
{
* this bus.
* @pm: Power management operations of this bus, callback the specific
* device driver's pm-ops.
- * @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU
- * driver implementations to a bus and allow the driver to do
- * bus-specific setup
* @need_parent_lock: When probing or removing a device on this bus, the
* device core should lock the device's parent.
*
const struct dev_pm_ops *pm;
- const struct iommu_ops *iommu_ops;
-
bool need_parent_lock;
};
int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start,
void *data, int (*fn)(struct device_driver *, void *));
-void bus_sort_breadthfirst(struct bus_type *bus,
+void bus_sort_breadthfirst(const struct bus_type *bus,
int (*compare)(const struct device *a,
const struct device *b));
/*
#include <linux/slab.h>
struct cma;
+ struct iommu_ops;
/*
* Values for struct dma_map_ops.flags:
#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
- const struct iommu_ops *iommu, bool coherent);
+ bool coherent);
#else
static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
- u64 size, const struct iommu_ops *iommu, bool coherent)
+ u64 size, bool coherent)
{
}
#endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */
#endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */
#ifdef CONFIG_DMA_API_DEBUG
-void dma_debug_add_bus(struct bus_type *bus);
+void dma_debug_add_bus(const struct bus_type *bus);
void debug_dma_dump_mappings(struct device *dev);
#else
-static inline void dma_debug_add_bus(struct bus_type *bus)
+static inline void dma_debug_add_bus(const struct bus_type *bus)
{
}
static inline void debug_dma_dump_mappings(struct device *dev)
struct page_pool *pp;
unsigned long _pp_mapping_pad;
unsigned long dma_addr;
- atomic_long_t pp_frag_count;
+ atomic_long_t pp_ref_count;
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
* @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
* @__page_mapping: Aliases with page->mapping. Unused for page tables.
* @pt_mm: Used for x86 pgds.
- * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
* @_pt_pad_2: Padding to ensure proper alignment.
* @ptl: Lock for the page table.
* @__page_type: Same as page->page_type. Unused for page tables.
- * @_refcount: Same as page refcount. Used for s390 page tables.
+ * @__page_refcount: Same as page refcount.
* @pt_memcg_data: Memcg data. Tracked for page tables here.
*
* This struct overlays struct page for now. Do not modify without a good
#endif
};
unsigned int __page_type;
- atomic_t _refcount;
+ atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
unsigned long pt_memcg_data;
#endif
TABLE_MATCH(mapping, __page_mapping);
TABLE_MATCH(rcu_head, pt_rcu_head);
TABLE_MATCH(page_type, __page_type);
-TABLE_MATCH(_refcount, _refcount);
+TABLE_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
TABLE_MATCH(memcg_data, pt_memcg_data);
#endif
*/
unsigned long pids_active[2];
+ /* MM scan sequence ID when scan first started after VMA creation */
+ int start_scan_seq;
+
/*
* MM scan sequence ID when the VMA was last completely scanned.
* A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
#endif
struct kioctx_table;
+ struct iommu_mm_data;
struct mm_struct {
struct {
/*
#endif
struct work_struct async_put_work;
- #ifdef CONFIG_IOMMU_SVA
- u32 pasid;
+ #ifdef CONFIG_IOMMU_MM_DATA
+ struct iommu_mm_data *iommu_mm;
#endif
#ifdef CONFIG_KSM
/*
*/
unsigned long ksm_zero_pages;
#endif /* CONFIG_KSM */
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
struct mem_cgroup *memcg;
#endif
} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
} __randomize_layout;
/*
spinlock_t lock;
};
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
-#ifdef CONFIG_MEMCG
void lru_gen_migrate_mm(struct mm_struct *mm);
-#endif
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
WRITE_ONCE(mm->lru_gen.bitmap, -1);
}
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
static inline void lru_gen_add_mm(struct mm_struct *mm)
{
{
}
-#ifdef CONFIG_MEMCG
static inline void lru_gen_migrate_mm(struct mm_struct *mm)
{
}
-#endif
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
{
}
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
struct vma_iterator {
struct ma_state mas;
.mas = { \
.tree = &(__mm)->mm_mt, \
.index = __addr, \
- .node = MAS_START, \
+ .node = NULL, \
+ .status = ma_start, \
}, \
}
#include <uapi/linux/sched.h>
#include <asm/current.h>
-
-#include <linux/pid.h>
-#include <linux/sem.h>
+#include <asm/processor.h>
+#include <linux/thread_info.h>
+#include <linux/preempt.h>
+#include <linux/cpumask.h>
+
+#include <linux/cache.h>
+#include <linux/irqflags_types.h>
+#include <linux/smp_types.h>
+#include <linux/pid_types.h>
+#include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
-#include <linux/mutex.h>
-#include <linux/plist.h>
-#include <linux/hrtimer.h>
-#include <linux/irqflags.h>
-#include <linux/seccomp.h>
-#include <linux/nodemask.h>
-#include <linux/rcupdate.h>
-#include <linux/refcount.h>
+#include <linux/mutex_types.h>
+#include <linux/plist_types.h>
+#include <linux/hrtimer_types.h>
+#include <linux/timer_types.h>
+#include <linux/seccomp_types.h>
+#include <linux/nodemask_types.h>
+#include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
-#include <linux/syscall_user_dispatch.h>
+#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
-#include <linux/posix-timers.h>
-#include <linux/rseq.h>
-#include <linux/seqlock.h>
+#include <linux/posix-timers_types.h>
+#include <linux/restart_block.h>
+#include <uapi/linux/rseq.h>
+#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
+#include <linux/uidgid_types.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct root_domain;
struct rq;
struct sched_attr;
+struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct task_struct;
struct user_event_mm;
/*
u32 inv_weight;
};
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma: the Exponential Weighted Moving Average (EWMA)
- * utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task: the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
- unsigned int enqueued;
- unsigned int ewma;
-#define UTIL_EST_WEIGHT_SHIFT 2
-#define UTIL_AVG_UNCHANGED 0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
- struct util_est util_est;
+ unsigned int util_est;
} ____cacheline_aligned;
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
+
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 block_max;
s64 sum_block_runtime;
- u64 exec_max;
+ s64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
struct load_weight load;
struct rb_node run_node;
u64 deadline;
- u64 min_deadline;
+ u64 min_vruntime;
struct list_head group_node;
unsigned int on_rq;
#endif
} __randomize_layout;
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
struct sched_dl_entity {
struct rb_node rb_node;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
+ unsigned int dl_server : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* timer is needed to decrease the active utilization at the correct
* time.
*/
- struct hrtimer inactive_timer;
+ struct hrtimer inactive_timer;
+
+ /*
+ * Bits for DL-server functionality. Also see the comment near
+ * dl_server_update().
+ *
+ * @rq the runqueue this server is for
+ *
+ * @server_has_tasks() returns true if @server_pick return a
+ * runnable task.
+ */
+ struct rq *rq;
+ dl_server_has_tasks_f server_has_tasks;
+ dl_server_pick_f server_pick;
#ifdef CONFIG_RT_MUTEXES
/*
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
+ struct sched_dl_entity *dl_server;
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
/* Recursion prevention for eventfd_signal() */
unsigned in_eventfd:1;
#endif
- #ifdef CONFIG_IOMMU_SVA
+ #ifdef CONFIG_ARCH_HAS_CPU_PASID
unsigned pasid_activated:1;
#endif
#ifdef CONFIG_CPU_SUP_INTEL
*/
};
-static inline struct pid *task_pid(struct task_struct *task)
-{
- return task->thread_pid;
-}
-
-/*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr() : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
- * current.
- * task_xid_nr_ns() : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
-pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
-static inline pid_t task_pid_nr(struct task_struct *tsk)
-{
- return tsk->pid;
-}
-
-static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
-}
-
-static inline pid_t task_pid_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
-}
-
-
-static inline pid_t task_tgid_nr(struct task_struct *tsk)
-{
- return tsk->tgid;
-}
-
-/**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
-static inline int pid_alive(const struct task_struct *p)
-{
- return p->thread_pid != NULL;
-}
-
-static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
-}
-
-static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
-}
-
-
-static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
-}
-
-static inline pid_t task_session_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
-}
-
-static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
-}
-
-static inline pid_t task_tgid_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
-}
-
-static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
-{
- pid_t pid = 0;
-
- rcu_read_lock();
- if (pid_alive(tsk))
- pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
- rcu_read_unlock();
-
- return pid;
-}
-
-static inline pid_t task_ppid_nr(const struct task_struct *tsk)
-{
- return task_ppid_nr_ns(tsk, &init_pid_ns);
-}
-
-/* Obsolete, do not use: */
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
-{
- return task_pgrp_nr_ns(tsk, &init_pid_ns);
-}
-
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
return task_index_to_char(task_state_index(tsk));
}
-/**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
-static inline int is_global_init(struct task_struct *tsk)
-{
- return task_tgid_nr(tsk) == 1;
-}
-
extern struct pid *cad_pid;
/*
void yield(void);
union thread_union {
-#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
-#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
__cond_resched_rwlock_write(lock); \
})
-static inline void cond_resched_rcu(void)
-{
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
-#endif
-}
-
#ifdef CONFIG_PREEMPT_DYNAMIC
extern bool preempt_model_none(void);
return preempt_model_full() || preempt_model_rt();
}
-/*
- * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
- * but a general need for low latency)
- */
-static inline int spin_needbreak(spinlock_t *lock)
-{
-#ifdef CONFIG_PREEMPTION
- return spin_is_contended(lock);
-#else
- return 0;
-#endif
-}
-
-/*
- * Check if a rwlock is contended.
- * Returns non-zero if there is another task waiting on the rwlock.
- * Returns zero if the lock is not contended or the system / underlying
- * rwlock implementation does not support contention detection.
- * Technically does not depend on CONFIG_PREEMPTION, but a general need
- * for low latency.
- */
-static inline int rwlock_needbreak(rwlock_t *lock)
-{
-#ifdef CONFIG_PREEMPTION
- return rwlock_is_contended(lock);
-#else
- return 0;
-#endif
-}
-
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
+#include <linux/spinlock.h>
+
/*
* In order to reduce various lock holder preemption latencies provide an
* interface to see if a vCPU is currently running or not.
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
-#ifdef CONFIG_RSEQ
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
-{
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
-}
-
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
-{
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
- preempt_enable();
- rseq_handle_notify_resume(ksig, regs);
-}
-
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
-{
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
-}
-
-/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
- } else {
- t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
- }
-}
-
-static inline void rseq_execve(struct task_struct *t)
-{
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
-}
-
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
-}
-
-#endif
-
-#ifdef CONFIG_DEBUG_RSEQ
-
-void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
-
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
+#include <linux/rseq.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
- unsigned long *stack;
-
- stack = arch_alloc_thread_stack_node(tsk, node);
- tsk->stack = stack;
- return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
- arch_free_thread_stack(tsk);
- tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- VMA_ITERATOR(old_vmi, oldmm, 0);
VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
goto out;
khugepaged_fork(mm, oldmm);
- retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
- if (retval)
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
goto out;
mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(old_vmi, mpnt) {
+ for_each_vma(vmi, mpnt) {
struct file *file;
vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
- /* Link the vma into the MT */
- if (vma_iter_bulk_store(&vmi, tmp))
- goto fail_nomem_vmi_store;
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
- if (retval)
+ if (retval) {
+ mpnt = vma_next(&vmi);
goto loop_out;
+ }
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
- if (!retval)
+ if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ } else if (mpnt) {
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ }
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
uprobe_end_dup_mmap();
return retval;
-fail_nomem_vmi_store:
- unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
tsk->use_memdelay = 0;
#endif
- #ifdef CONFIG_IOMMU_SVA
+ #ifdef CONFIG_ARCH_HAS_CPU_PASID
tsk->pasid_activated = 0;
#endif
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
- unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
The cost is that if the page was never dirtied and needs to be
swapped out again, it will be re-compressed.
+config ZSWAP_SHRINKER_DEFAULT_ON
+ bool "Shrink the zswap pool on memory pressure"
+ depends on ZSWAP
+ default n
+ help
+ If selected, the zswap shrinker will be enabled, and the pages
+ stored in the zswap pool will become available for reclaim (i.e
+ written back to the backing swap device) on memory pressure.
+
+ This means that zswap writeback could happen even if the pool is
+ not yet full, or the cgroup zswap limit has not been reached,
+ reducing the chance that cold pages will reside in the zswap pool
+ and consume memory indefinitely.
+
choice
prompt "Default compressor"
depends on ZSWAP
For more information, see zsmalloc documentation.
-menu "SLAB allocator options"
-
-choice
- prompt "Choose SLAB allocator"
- default SLUB
- help
- This option allows to select a slab allocator.
-
-config SLAB_DEPRECATED
- bool "SLAB (DEPRECATED)"
- depends on !PREEMPT_RT
- help
- Deprecated and scheduled for removal in a few cycles. Replaced by
- SLUB.
-
- and the people listed in the SLAB ALLOCATOR section of MAINTAINERS
- file, explaining why.
-
- The regular slab allocator that is established and known to work
- well in all environments. It organizes cache hot objects in
- per cpu and per node queues.
+menu "Slab allocator options"
config SLUB
- bool "SLUB (Unqueued Allocator)"
- help
- SLUB is a slab allocator that minimizes cache line usage
- instead of managing queues of cached objects (SLAB approach).
- Per cpu caching is realized using slabs of objects instead
- of queues of objects. SLUB can use memory efficiently
- and has enhanced diagnostics. SLUB is the default choice for
- a slab allocator.
-
-endchoice
-
-config SLAB
- bool
- default y
- depends on SLAB_DEPRECATED
+ def_bool y
config SLUB_TINY
- bool "Configure SLUB for minimal memory footprint"
- depends on SLUB && EXPERT
+ bool "Configure for minimal memory footprint"
+ depends on EXPERT
select SLAB_MERGE_DEFAULT
help
- Configures the SLUB allocator in a way to achieve minimal memory
+ Configures the slab allocator in a way to achieve minimal memory
footprint, sacrificing scalability, debugging and other features.
This is intended only for the smallest system that had used the
SLOB allocator and is not recommended for systems with more than
config SLAB_MERGE_DEFAULT
bool "Allow slab caches to be merged"
default y
- depends on SLAB || SLUB
help
For reduced kernel memory fragmentation, slab caches can be
merged when they share the same size and other characteristics.
config SLAB_FREELIST_RANDOM
bool "Randomize slab freelist"
- depends on SLAB || (SLUB && !SLUB_TINY)
+ depends on !SLUB_TINY
help
Randomizes the freelist order used on creating new pages. This
security feature reduces the predictability of the kernel slab
config SLAB_FREELIST_HARDENED
bool "Harden slab freelist metadata"
- depends on SLAB || (SLUB && !SLUB_TINY)
+ depends on !SLUB_TINY
help
Many kernel heap attacks try to target slab cache metadata and
other infrastructure. This options makes minor performance
sacrifices to harden the kernel slab allocator against common
- freelist exploit methods. Some slab implementations have more
- sanity-checking than others. This option is most effective with
- CONFIG_SLUB.
+ freelist exploit methods.
config SLUB_STATS
default n
- bool "Enable SLUB performance statistics"
- depends on SLUB && SYSFS && !SLUB_TINY
+ bool "Enable performance statistics"
+ depends on SYSFS && !SLUB_TINY
help
- SLUB statistics are useful to debug SLUBs allocation behavior in
+ The statistics are useful to debug slab allocation behavior in
order find ways to optimize the allocator. This should never be
enabled for production use since keeping statistics slows down
the allocator by a few percentage points. The slabinfo command
config SLUB_CPU_PARTIAL
default y
- depends on SLUB && SMP && !SLUB_TINY
- bool "SLUB per cpu partial cache"
+ depends on SMP && !SLUB_TINY
+ bool "Enable per cpu partial caches"
help
Per cpu partial caches accelerate objects allocation and freeing
that is local to a processor at the price of more indeterminism
config RANDOM_KMALLOC_CACHES
default n
- depends on SLUB && !SLUB_TINY
+ depends on !SLUB_TINY
bool "Randomize slab caches for normal kmalloc"
help
A hardening feature that creates multiple copies of slab caches for
limited degree of memory and CPU overhead that relates to hardware and
system workload.
-endmenu # SLAB allocator options
+endmenu # Slab allocator options
config SHUFFLE_PAGE_ALLOCATOR
bool "Page allocator randomization"
the presence of a memory-side-cache. There are also incidental
security benefits as it reduces the predictability of page
allocations to compliment SLAB_FREELIST_RANDOM, but the
- default granularity of shuffling on the MAX_ORDER i.e, 10th
+ default granularity of shuffling on the MAX_PAGE_ORDER i.e, 10th
order of pages is selected based on cache utilization benefits
on x86.
HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
on a platform.
- Note that the pageblock_order cannot exceed MAX_ORDER and will be
- clamped down to MAX_ORDER.
+ Note that the pageblock_order cannot exceed MAX_PAGE_ORDER and will be
+ clamped down to MAX_PAGE_ORDER.
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
from userspace allocation. Keeping a user from writing to low pages
can help reduce the impact of kernel NULL pointer bugs.
- For most ia64, ppc64 and x86 users with lots of address space
+ For most ppc64 and x86 users with lots of address space
a value of 65536 is reasonable and should cause no problems.
On arm and other archs it should not be higher than 32768.
Programs which use vm86 functionality or have some need to map
madvise(MADV_HUGEPAGE) but it won't risk to increase the
memory footprint of applications without a guaranteed
benefit.
+
+ config TRANSPARENT_HUGEPAGE_NEVER
+ bool "never"
+ help
+ Disable Transparent Hugepage by default. It can still be
+ enabled at runtime via sysfs.
endchoice
config THP_SWAP
from evicted generations for debugging purpose.
This option has a per-memcg and per-node memory overhead.
+
+config LRU_GEN_WALKS_MMU
+ def_bool y
+ depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG
# }
config ARCH_SUPPORTS_PER_VMA_LOCK
bool
depends on !STACK_GROWSUP
+ config IOMMU_MM_DATA
+ bool
+
source "mm/damon/Kconfig"
endmenu