bool osc_pc_lpi_support_confirmed;
EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed);
+/*
+ * ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities':
+ * Starting with ACPI Specification 6.2, all _CPC registers can be in
+ * PCC, System Memory, System IO, or Functional Fixed Hardware address
+ * spaces. OSPM support for this more flexible register space scheme is
+ * indicated by the "Flexible Address Space for CPPC Registers" _OSC bit.
+ *
+ * Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in:
+ * - PCC or Functional Fixed Hardware address space if defined
+ * - SystemMemory address space (NULL register) if not defined
+ */
+bool osc_cpc_flexible_adr_space_confirmed;
+EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed);
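A consumer of this flag would typically accept _CPC registers in the newer address spaces only when the platform acknowledged the capability. A minimal sketch of such a check, assuming the usual struct cpc_reg layout (the helper name is invented and the NULL-register special case is ignored):

static bool cpc_addr_space_allowed(const struct cpc_reg *reg)
{
	/* PCC and Functional Fixed Hardware have always been permitted. */
	if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM ||
	    reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE)
		return true;

	/*
	 * System Memory and System IO registers are only valid when the
	 * platform confirmed the "Flexible Address Space" _OSC bit.
	 */
	return osc_cpc_flexible_adr_space_confirmed;
}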
+
/*
* ACPI 6.4 Operating System Capabilities for USB.
*/
#endif
#ifdef CONFIG_X86
capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT;
- if (boot_cpu_has(X86_FEATURE_HWP)) {
- capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT;
- capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT;
- }
#endif
+#ifdef CONFIG_ACPI_CPPC_LIB
+ capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT;
+ capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT;
+#endif
+
+ capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_FLEXIBLE_ADR_SPACE;
+
if (IS_ENABLED(CONFIG_SCHED_MC_PRIO))
capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT;
return;
}
-#ifdef CONFIG_X86
- if (boot_cpu_has(X86_FEATURE_HWP))
- osc_sb_cppc_not_supported = !(capbuf_ret[OSC_SUPPORT_DWORD] &
- (OSC_SB_CPC_SUPPORT | OSC_SB_CPCV2_SUPPORT));
+#ifdef CONFIG_ACPI_CPPC_LIB
+ osc_sb_cppc_not_supported = !(capbuf_ret[OSC_SUPPORT_DWORD] &
+ (OSC_SB_CPC_SUPPORT | OSC_SB_CPCV2_SUPPORT));
#endif
/*
capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT;
osc_sb_native_usb4_support_confirmed =
capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT;
+ osc_cpc_flexible_adr_space_confirmed =
+ capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPC_FLEXIBLE_ADR_SPACE;
}
kfree(context.ret.pointer);
}
osc_sb_native_usb4_control =
- control & ((u32 *)context.ret.pointer)[OSC_CONTROL_DWORD];
+ control & acpi_osc_ctx_get_pci_control(&context);
acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control);
acpi_bus_decode_usb_osc("USB4 _OSC: OS controls",
}
EXPORT_SYMBOL_GPL(acpi_bus_for_each_dev);
+struct acpi_dev_walk_context {
+ int (*fn)(struct acpi_device *, void *);
+ void *data;
+};
+
+static int acpi_dev_for_one_check(struct device *dev, void *context)
+{
+ struct acpi_dev_walk_context *adwc = context;
+
+ if (dev->bus != &acpi_bus_type)
+ return 0;
+
+ return adwc->fn(to_acpi_device(dev), adwc->data);
+}
+
+int acpi_dev_for_each_child(struct acpi_device *adev,
+ int (*fn)(struct acpi_device *, void *), void *data)
+{
+ struct acpi_dev_walk_context adwc = {
+ .fn = fn,
+ .data = data,
+ };
+
+ return device_for_each_child(&adev->dev, &adwc, acpi_dev_for_one_check);
+}
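A usage sketch for the new child walker (the callback and counter are illustrative, not part of this change):

static int count_one(struct acpi_device *child, void *data)
{
	int *count = data;

	(*count)++;
	return 0;	/* a nonzero return stops the walk */
}

static int count_acpi_children(struct acpi_device *adev)
{
	int count = 0;

	acpi_dev_for_each_child(adev, count_one, &count);
	return count;
}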
+
/* --------------------------------------------------------------------------
Initialization/Cleanup
-------------------------------------------------------------------------- */
{ OSC_PCI_EXPRESS_DPC_CONTROL, "DPC" },
};
+ static struct pci_osc_bit_struct cxl_osc_support_bit[] = {
+ { OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT, "CXL11PortRegAccess" },
+ { OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT, "CXL20PortDevRegAccess" },
+ { OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT, "CXLProtocolErrorReporting" },
+ { OSC_CXL_NATIVE_HP_SUPPORT, "CXLNativeHotPlug" },
+ };
+
+ static struct pci_osc_bit_struct cxl_osc_control_bit[] = {
+ { OSC_CXL_ERROR_REPORTING_CONTROL, "CXLMemErrorReporting" },
+ };
+
static void decode_osc_bits(struct acpi_pci_root *root, char *msg, u32 word,
struct pci_osc_bit_struct *table, int size)
{
ARRAY_SIZE(pci_osc_control_bit));
}
+ static void decode_cxl_osc_support(struct acpi_pci_root *root, char *msg, u32 word)
+ {
+ decode_osc_bits(root, msg, word, cxl_osc_support_bit,
+ ARRAY_SIZE(cxl_osc_support_bit));
+ }
+
+ static void decode_cxl_osc_control(struct acpi_pci_root *root, char *msg, u32 word)
+ {
+ decode_osc_bits(root, msg, word, cxl_osc_control_bit,
+ ARRAY_SIZE(cxl_osc_control_bit));
+ }
+
+ static inline bool is_pcie(struct acpi_pci_root *root)
+ {
+ return root->bridge_type == ACPI_BRIDGE_TYPE_PCIE;
+ }
+
+ static inline bool is_cxl(struct acpi_pci_root *root)
+ {
+ return root->bridge_type == ACPI_BRIDGE_TYPE_CXL;
+ }
+
static u8 pci_osc_uuid_str[] = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
+ static u8 cxl_osc_uuid_str[] = "68F2D50B-C469-4d8A-BD3D-941A103FD3FC";
- static acpi_status acpi_pci_run_osc(acpi_handle handle,
- const u32 *capbuf, u32 *retval)
+ static char *to_uuid(struct acpi_pci_root *root)
+ {
+ if (is_cxl(root))
+ return cxl_osc_uuid_str;
+ return pci_osc_uuid_str;
+ }
+
+ static int cap_length(struct acpi_pci_root *root)
+ {
+ if (is_cxl(root))
+ return sizeof(u32) * OSC_CXL_CAPABILITY_DWORDS;
+ return sizeof(u32) * OSC_PCI_CAPABILITY_DWORDS;
+ }
+
+ static acpi_status acpi_pci_run_osc(struct acpi_pci_root *root,
+ const u32 *capbuf, u32 *pci_control,
+ u32 *cxl_control)
{
struct acpi_osc_context context = {
- .uuid_str = pci_osc_uuid_str,
+ .uuid_str = to_uuid(root),
.rev = 1,
- .cap.length = 12,
+ .cap.length = cap_length(root),
.cap.pointer = (void *)capbuf,
};
acpi_status status;
- status = acpi_run_osc(handle, &context);
+ status = acpi_run_osc(root->device->handle, &context);
if (ACPI_SUCCESS(status)) {
- *retval = *((u32 *)(context.ret.pointer + 8));
+ *pci_control = acpi_osc_ctx_get_pci_control(&context);
+ if (is_cxl(root))
+ *cxl_control = acpi_osc_ctx_get_cxl_control(&context);
kfree(context.ret.pointer);
}
return status;
}
- static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root,
- u32 support,
- u32 *control)
+ static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, u32 support,
+ u32 *control, u32 cxl_support,
+ u32 *cxl_control)
{
acpi_status status;
- u32 result, capbuf[3];
+ u32 pci_result, cxl_result, capbuf[OSC_CXL_CAPABILITY_DWORDS];
support |= root->osc_support_set;
capbuf[OSC_SUPPORT_DWORD] = support;
capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set;
- status = acpi_pci_run_osc(root->device->handle, capbuf, &result);
+ if (is_cxl(root)) {
+ cxl_support |= root->osc_ext_support_set;
+ capbuf[OSC_EXT_SUPPORT_DWORD] = cxl_support;
+ capbuf[OSC_EXT_CONTROL_DWORD] = *cxl_control | root->osc_ext_control_set;
+ }
+
+ retry:
+ status = acpi_pci_run_osc(root, capbuf, &pci_result, &cxl_result);
if (ACPI_SUCCESS(status)) {
root->osc_support_set = support;
- *control = result;
+ *control = pci_result;
+ if (is_cxl(root)) {
+ root->osc_ext_support_set = cxl_support;
+ *cxl_control = cxl_result;
+ }
+ } else if (is_cxl(root)) {
+ /*
+ * CXL _OSC is optional on CXL 1.1 hosts. Fall back to PCIe _OSC
+ * upon any failure using CXL _OSC.
+ */
+ root->bridge_type = ACPI_BRIDGE_TYPE_PCIE;
+ goto retry;
}
return status;
}
* @handle: ACPI handle of a PCI root bridge (or PCIe Root Complex).
* @mask: Mask of _OSC bits to request control of, place to store control mask.
* @support: _OSC supported capability.
+ * @cxl_mask: Mask of CXL _OSC control bits, place to store control mask.
+ * @cxl_support: CXL _OSC supported capability.
*
* Run _OSC query for @mask and if that is successful, compare the returned
* mask of control bits with @req. If all of the @req bits are set in the
* _OSC bits the BIOS has granted control of, but its contents are meaningless
* on failure.
**/
- static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 support)
+ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask,
+ u32 support, u32 *cxl_mask,
+ u32 cxl_support)
{
u32 req = OSC_PCI_EXPRESS_CAPABILITY_CONTROL;
struct acpi_pci_root *root;
acpi_status status;
- u32 ctrl, capbuf[3];
+ u32 ctrl, cxl_ctrl = 0, capbuf[OSC_CXL_CAPABILITY_DWORDS];
if (!mask)
return AE_BAD_PARAMETER;
ctrl = *mask;
*mask |= root->osc_control_set;
+ if (is_cxl(root)) {
+ cxl_ctrl = *cxl_mask;
+ *cxl_mask |= root->osc_ext_control_set;
+ }
+
/* Need to check the available controls bits before requesting them. */
do {
- status = acpi_pci_query_osc(root, support, mask);
+ u32 pci_missing = 0, cxl_missing = 0;
+
+ status = acpi_pci_query_osc(root, support, mask, cxl_support,
+ cxl_mask);
if (ACPI_FAILURE(status))
return status;
- if (ctrl == *mask)
- break;
- decode_osc_control(root, "platform does not support",
- ctrl & ~(*mask));
+ if (is_cxl(root)) {
+ if (ctrl == *mask && cxl_ctrl == *cxl_mask)
+ break;
+ pci_missing = ctrl & ~(*mask);
+ cxl_missing = cxl_ctrl & ~(*cxl_mask);
+ } else {
+ if (ctrl == *mask)
+ break;
+ pci_missing = ctrl & ~(*mask);
+ }
+ if (pci_missing)
+ decode_osc_control(root, "platform does not support",
+ pci_missing);
+ if (cxl_missing)
+ decode_cxl_osc_control(root, "CXL platform does not support",
+ cxl_missing);
ctrl = *mask;
- } while (*mask);
+ cxl_ctrl = *cxl_mask;
+ } while (*mask || *cxl_mask);
/* No need to request _OSC if the control was already granted. */
- if ((root->osc_control_set & ctrl) == ctrl)
+ if ((root->osc_control_set & ctrl) == ctrl &&
+ (root->osc_ext_control_set & cxl_ctrl) == cxl_ctrl)
return AE_OK;
if ((ctrl & req) != req) {
capbuf[OSC_QUERY_DWORD] = 0;
capbuf[OSC_SUPPORT_DWORD] = root->osc_support_set;
capbuf[OSC_CONTROL_DWORD] = ctrl;
- status = acpi_pci_run_osc(handle, capbuf, mask);
+ if (is_cxl(root)) {
+ capbuf[OSC_EXT_SUPPORT_DWORD] = root->osc_ext_support_set;
+ capbuf[OSC_EXT_CONTROL_DWORD] = cxl_ctrl;
+ }
+
+ status = acpi_pci_run_osc(root, capbuf, mask, cxl_mask);
if (ACPI_FAILURE(status))
return status;
root->osc_control_set = *mask;
+ root->osc_ext_control_set = *cxl_mask;
return AE_OK;
}
return support;
}
+ /*
+ * Background on hotplug support, and making it depend on only
+ * CONFIG_HOTPLUG_PCI_PCIE vs. also considering CONFIG_MEMORY_HOTPLUG:
+ *
+ * CONFIG_ACPI_HOTPLUG_MEMORY does depend on CONFIG_MEMORY_HOTPLUG, but
+ * there is no existing _OSC for memory hotplug support. The reason is that
+ * ACPI memory hotplug requires the OS to acknowledge / coordinate with
+ * memory plug events via a scan handler. On the CXL side the equivalent
+ * would be if Linux supported the Mechanical Retention Lock [1], or
+ * otherwise had some coordination for the driver of a PCI device
+ * undergoing hotplug to be consulted on whether the hotplug should
+ * proceed or not.
+ *
+ * The concern is that if Linux says no to supporting CXL hotplug then
+ * the BIOS may say no to giving the OS hotplug control of any other PCIe
+ * device. So the question here is not whether hotplug is enabled, it's
+	 * whether it is handled natively by the OS at all, and if
+ * CONFIG_HOTPLUG_PCI_PCIE is enabled then the answer is "yes".
+ *
+ * Otherwise, the plan for CXL coordinated remove, since the kernel does
+ * not support blocking hotplug, is to require the memory device to be
+ * disabled before hotplug is attempted. When CONFIG_MEMORY_HOTPLUG is
+	 * disabled, that step will fail and the remove attempt is cancelled by the
+ * user. If that is not honored and the card is removed anyway then it
+ * does not matter if CONFIG_MEMORY_HOTPLUG is enabled or not, it will
+ * cause a crash and other badness.
+ *
+ * Therefore, just say yes to CXL hotplug and require removal to
+ * be coordinated by userspace unless and until the kernel grows better
+ * mechanisms for doing "managed" removal of devices in consultation with
+ * the driver.
+ *
+ */
+ static u32 calculate_cxl_support(void)
+ {
+ u32 support;
+
+ support = OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT;
+ if (pci_aer_available())
+ support |= OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT;
+ if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
+ support |= OSC_CXL_NATIVE_HP_SUPPORT;
+
+ return support;
+ }
+
static u32 calculate_control(void)
{
u32 control;
return control;
}
+ static u32 calculate_cxl_control(void)
+ {
+ u32 control = 0;
+
+ if (IS_ENABLED(CONFIG_MEMORY_FAILURE))
+ control |= OSC_CXL_ERROR_REPORTING_CONTROL;
+
+ return control;
+ }
+
static bool os_control_query_checks(struct acpi_pci_root *root, u32 support)
{
struct acpi_device *device = root->device;
return true;
}
- static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
- bool is_pcie)
+ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm)
{
u32 support, control = 0, requested = 0;
+ u32 cxl_support = 0, cxl_control = 0, cxl_requested = 0;
acpi_status status;
struct acpi_device *device = root->device;
acpi_handle handle = device->handle;
if (os_control_query_checks(root, support))
requested = control = calculate_control();
- status = acpi_pci_osc_control_set(handle, &control, support);
+ if (is_cxl(root)) {
+ cxl_support = calculate_cxl_support();
+ decode_cxl_osc_support(root, "OS supports", cxl_support);
+ cxl_requested = cxl_control = calculate_cxl_control();
+ }
+
+ status = acpi_pci_osc_control_set(handle, &control, support,
+ &cxl_control, cxl_support);
if (ACPI_SUCCESS(status)) {
if (control)
decode_osc_control(root, "OS now controls", control);
+ if (cxl_control)
+ decode_cxl_osc_control(root, "OS now controls",
+ cxl_control);
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) {
/*
*no_aspm = 1;
/* _OSC is optional for PCI host bridges */
- if ((status == AE_NOT_FOUND) && !is_pcie)
+ if (status == AE_NOT_FOUND && !is_pcie(root))
return;
if (control) {
decode_osc_control(root, "OS requested", requested);
decode_osc_control(root, "platform willing to grant", control);
}
+ if (cxl_control) {
+ decode_cxl_osc_control(root, "OS requested", cxl_requested);
+ decode_cxl_osc_control(root, "platform willing to grant",
+ cxl_control);
+ }
dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n",
acpi_format_exception(status));
acpi_handle handle = device->handle;
int no_aspm = 0;
bool hotadd = system_state == SYSTEM_RUNNING;
- bool is_pcie;
+ const char *acpi_hid;
root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL);
if (!root)
root->mcfg_addr = acpi_pci_root_get_mcfg_addr(handle);
- is_pcie = strcmp(acpi_device_hid(device), "PNP0A08") == 0;
- negotiate_os_control(root, &no_aspm, is_pcie);
+ acpi_hid = acpi_device_hid(root->device);
+ if (strcmp(acpi_hid, "PNP0A08") == 0)
+ root->bridge_type = ACPI_BRIDGE_TYPE_PCIE;
+ else if (strcmp(acpi_hid, "ACPI0016") == 0)
+ root->bridge_type = ACPI_BRIDGE_TYPE_CXL;
+ else
+ dev_dbg(&device->dev, "Assuming non-PCIe host bridge\n");
+
+ negotiate_os_control(root, &no_aspm);
/*
* TBD: Need PCI interface for enumeration/configuration of roots.
host_bridge->preserve_config = 1;
ACPI_FREE(obj);
+ acpi_dev_power_up_children_with_adr(device);
+
pci_scan_child_bus(bus);
pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info,
info);
return to_nd_region(to_dev(pmem)->parent);
}
-static void hwpoison_clear(struct pmem_device *pmem,
- phys_addr_t phys, unsigned int len)
+static phys_addr_t to_phys(struct pmem_device *pmem, phys_addr_t offset)
{
+ return pmem->phys_addr + offset;
+}
+
+static sector_t to_sect(struct pmem_device *pmem, phys_addr_t offset)
+{
+ return (offset - pmem->data_offset) >> SECTOR_SHIFT;
+}
+
+static phys_addr_t to_offset(struct pmem_device *pmem, sector_t sector)
+{
+ return (sector << SECTOR_SHIFT) + pmem->data_offset;
+}
+
+static void pmem_mkpage_present(struct pmem_device *pmem, phys_addr_t offset,
+ unsigned int len)
+{
+ phys_addr_t phys = to_phys(pmem, offset);
unsigned long pfn_start, pfn_end, pfn;
/* only pmem in the linear map supports HWPoison */
}
}
-static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
- phys_addr_t offset, unsigned int len)
+static void pmem_clear_bb(struct pmem_device *pmem, sector_t sector, long blks)
{
- struct device *dev = to_dev(pmem);
- sector_t sector;
- long cleared;
- blk_status_t rc = BLK_STS_OK;
+ if (blks == 0)
+ return;
+ badblocks_clear(&pmem->bb, sector, blks);
+ if (pmem->bb_state)
+ sysfs_notify_dirent(pmem->bb_state);
+}
- sector = (offset - pmem->data_offset) / 512;
+static long __pmem_clear_poison(struct pmem_device *pmem,
+ phys_addr_t offset, unsigned int len)
+{
+ phys_addr_t phys = to_phys(pmem, offset);
+ long cleared = nvdimm_clear_poison(to_dev(pmem), phys, len);
- cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
- if (cleared < len)
- rc = BLK_STS_IOERR;
- if (cleared > 0 && cleared / 512) {
- hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
- cleared /= 512;
- dev_dbg(dev, "%#llx clear %ld sector%s\n",
- (unsigned long long) sector, cleared,
- cleared > 1 ? "s" : "");
- badblocks_clear(&pmem->bb, sector, cleared);
- if (pmem->bb_state)
- sysfs_notify_dirent(pmem->bb_state);
+ if (cleared > 0) {
+ pmem_mkpage_present(pmem, offset, cleared);
+ arch_invalidate_pmem(pmem->virt_addr + offset, len);
}
+ return cleared;
+}
- arch_invalidate_pmem(pmem->virt_addr + offset, len);
+static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
+ phys_addr_t offset, unsigned int len)
+{
+ long cleared = __pmem_clear_poison(pmem, offset, len);
- return rc;
+ if (cleared < 0)
+ return BLK_STS_IOERR;
+
+ pmem_clear_bb(pmem, to_sect(pmem, offset), cleared >> SECTOR_SHIFT);
+ if (cleared < len)
+ return BLK_STS_IOERR;
+ return BLK_STS_OK;
}
static void write_pmem(void *pmem_addr, struct page *page,
sector_t sector, unsigned int len)
{
blk_status_t rc;
- phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+ phys_addr_t pmem_off = to_offset(pmem, sector);
void *pmem_addr = pmem->virt_addr + pmem_off;
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
struct page *page, unsigned int page_off,
sector_t sector, unsigned int len)
{
- blk_status_t rc = BLK_STS_OK;
- bool bad_pmem = false;
- phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+ phys_addr_t pmem_off = to_offset(pmem, sector);
void *pmem_addr = pmem->virt_addr + pmem_off;
- if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
- bad_pmem = true;
+ if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
+ blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len);
+
+ if (rc != BLK_STS_OK)
+ return rc;
+ }
- /*
- * Note that we write the data both before and after
- * clearing poison. The write before clear poison
- * handles situations where the latest written data is
- * preserved and the clear poison operation simply marks
- * the address range as valid without changing the data.
- * In this case application software can assume that an
- * interrupted write will either return the new good
- * data or an error.
- *
- * However, if pmem_clear_poison() leaves the data in an
- * indeterminate state we need to perform the write
- * after clear poison.
- */
flush_dcache_page(page);
write_pmem(pmem_addr, page, page_off, len);
- if (unlikely(bad_pmem)) {
- rc = pmem_clear_poison(pmem, pmem_off, len);
- write_pmem(pmem_addr, page, page_off, len);
- }
- return rc;
+ return BLK_STS_OK;
}
static void pmem_submit_bio(struct bio *bio)
/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
- long nr_pages, void **kaddr, pfn_t *pfn)
+ long nr_pages, enum dax_access_mode mode, void **kaddr,
+ pfn_t *pfn)
{
resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
-
- if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
- PFN_PHYS(nr_pages))))
- return -EIO;
+ sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT;
+ unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT;
+ struct badblocks *bb = &pmem->bb;
+ sector_t first_bad;
+ int num_bad;
if (kaddr)
*kaddr = pmem->virt_addr + offset;
if (pfn)
*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+ if (bb->count &&
+ badblocks_check(bb, sector, num, &first_bad, &num_bad)) {
+ long actual_nr;
+
+ if (mode != DAX_RECOVERY_WRITE)
+ return -EIO;
+
+ /*
+		 * The recovery stride is set to the kernel page size because
+		 * the underlying driver and firmware poison-clearing functions
+		 * don't appear to handle large chunks (such as 2MiB) reliably.
+ */
+ actual_nr = PHYS_PFN(
+ PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT));
+ dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n",
+ sector, nr_pages, first_bad, actual_nr);
+ if (actual_nr)
+ return actual_nr;
+ return 1;
+ }
+
/*
- * If badblocks are present, limit known good range to the
- * requested range.
+ * If badblocks are present but not in the range, limit known good range
+ * to the requested range.
*/
- if (unlikely(pmem->bb.count))
+ if (bb->count)
return nr_pages;
return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
}
}
static long pmem_dax_direct_access(struct dax_device *dax_dev,
- pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
+ pgoff_t pgoff, long nr_pages, enum dax_access_mode mode,
+ void **kaddr, pfn_t *pfn)
+{
+ struct pmem_device *pmem = dax_get_private(dax_dev);
+
+ return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
+}
+
+/*
+ * The recovery write thread starts out as a normal pwrite thread. When
+ * the filesystem is told about a potential media error in the range, it
+ * turns the normal pwrite into a dax_recovery_write.
+ *
+ * The recovery write consists of clearing the media poison, clearing the
+ * page HWPoison bit, re-enabling page-wide read-write permission, flushing
+ * the caches, and finally performing the write. A competing pread thread
+ * is held off during the recovery process since the data read back might
+ * not be valid; this is achieved by clearing the badblock records only
+ * after the recovery write completes. Competing recovery write threads
+ * are already serialized by the writer lock held by dax_iomap_rw().
+ */
+static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
{
struct pmem_device *pmem = dax_get_private(dax_dev);
+ size_t olen, len, off;
+ phys_addr_t pmem_off;
+ struct device *dev = pmem->bb.dev;
+ long cleared;
+
+ off = offset_in_page(addr);
+ len = PFN_PHYS(PFN_UP(off + bytes));
+ if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
+ return _copy_from_iter_flushcache(addr, bytes, i);
+
+ /*
+	 * A range that is not page-aligned cannot be recovered. This should not
+ * happen unless something else went wrong.
+ */
+ if (off || !PAGE_ALIGNED(bytes)) {
+ dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n",
+ addr, bytes);
+ return 0;
+ }
+
+ pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+ cleared = __pmem_clear_poison(pmem, pmem_off, len);
+ if (cleared > 0 && cleared < len) {
+ dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n",
+ cleared, len);
+ return 0;
+ }
+ if (cleared < 0) {
+ dev_dbg(dev, "poison clear failed: %ld\n", cleared);
+ return 0;
+ }
+
+ olen = _copy_from_iter_flushcache(addr, bytes, i);
+ pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
- return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
+ return olen;
}
static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
.zero_page_range = pmem_dax_zero_page_range,
+ .recovery_write = pmem_recovery_write,
};
static ssize_t write_cache_show(struct device *dev,
nvdimm_namespace_detach_btt(to_nd_btt(dev));
else {
/*
- * Note, this assumes nd_device_lock() context to not
+ * Note, this assumes device_lock() context to not
* race nd_pmem_notify()
*/
sysfs_put(pmem->bb_state);
extern struct bus_type acpi_bus_type;
int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data);
+int acpi_dev_for_each_child(struct acpi_device *adev,
+ int (*fn)(struct acpi_device *, void *), void *data);
/*
* Events
int acpi_bus_update_power(acpi_handle handle, int *state_p);
int acpi_device_update_power(struct acpi_device *device, int *state_p);
bool acpi_bus_power_manageable(acpi_handle handle);
+void acpi_dev_power_up_children_with_adr(struct acpi_device *adev);
int acpi_device_power_add_dependent(struct acpi_device *adev,
struct device *dev);
void acpi_device_power_remove_dependent(struct acpi_device *adev,
int acpi_bind_one(struct device *dev, struct acpi_device *adev);
int acpi_unbind_one(struct device *dev);
+ enum acpi_bridge_type {
+ ACPI_BRIDGE_TYPE_PCIE = 1,
+ ACPI_BRIDGE_TYPE_CXL,
+ };
+
struct acpi_pci_root {
struct acpi_device * device;
struct pci_bus *bus;
u16 segment;
+ int bridge_type;
struct resource secondary; /* downstream bus range */
- u32 osc_support_set; /* _OSC state of support bits */
- u32 osc_control_set; /* _OSC state of control bits */
+ u32 osc_support_set; /* _OSC state of support bits */
+ u32 osc_control_set; /* _OSC state of control bits */
+ u32 osc_ext_support_set; /* _OSC state of extended support bits */
+ u32 osc_ext_control_set; /* _OSC state of extended control bits */
phys_addr_t mcfg_addr;
};
acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
- /* Indexes into _OSC Capabilities Buffer (DWORDs 2 & 3 are device-specific) */
+ /* Number of _OSC capability DWORDS depends on bridge type */
+ #define OSC_PCI_CAPABILITY_DWORDS 3
+ #define OSC_CXL_CAPABILITY_DWORDS 5
+
+ /* Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific) */
#define OSC_QUERY_DWORD 0 /* DWORD 1 */
#define OSC_SUPPORT_DWORD 1 /* DWORD 2 */
#define OSC_CONTROL_DWORD 2 /* DWORD 3 */
+ #define OSC_EXT_SUPPORT_DWORD 3 /* DWORD 4 */
+ #define OSC_EXT_CONTROL_DWORD 4 /* DWORD 5 */
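For a CXL host bridge the capability buffer therefore spans five DWORDs. A hedged sketch of the layout, using only constants introduced in this series (the support value is a placeholder):

	u32 capbuf[OSC_CXL_CAPABILITY_DWORDS] = {
		[OSC_QUERY_DWORD]       = OSC_QUERY_ENABLE,
		[OSC_SUPPORT_DWORD]     = 0,	/* PCI support bits */
		[OSC_CONTROL_DWORD]     = OSC_PCI_EXPRESS_CAPABILITY_CONTROL,
		[OSC_EXT_SUPPORT_DWORD] = OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT,
		[OSC_EXT_CONTROL_DWORD] = OSC_CXL_ERROR_REPORTING_CONTROL,
	};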
/* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */
#define OSC_QUERY_ENABLE 0x00000001 /* input */
#define OSC_SB_OSLPI_SUPPORT 0x00000100
#define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000
#define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00002000
+#define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000
#define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000
#define OSC_SB_PRM_SUPPORT 0x00200000
extern bool osc_pc_lpi_support_confirmed;
extern bool osc_sb_native_usb4_support_confirmed;
extern bool osc_sb_cppc_not_supported;
+extern bool osc_cpc_flexible_adr_space_confirmed;
/* USB4 Capabilities */
#define OSC_USB_USB3_TUNNELING 0x00000001
#define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020
#define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080
+ /* CXL _OSC: Capabilities DWORD 4: Support Field */
+ #define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT 0x00000001
+ #define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT 0x00000002
+ #define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT 0x00000004
+ #define OSC_CXL_NATIVE_HP_SUPPORT 0x00000008
+
+ /* CXL _OSC: Capabilities DWORD 5: Control Field */
+ #define OSC_CXL_ERROR_REPORTING_CONTROL 0x00000001
+
+ static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context)
+ {
+ u32 *ret = context->ret.pointer;
+
+ return ret[OSC_CONTROL_DWORD];
+ }
+
+ static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context)
+ {
+ u32 *ret = context->ret.pointer;
+
+ return ret[OSC_EXT_CONTROL_DWORD];
+ }
+
#define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002
#define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004
#define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006
static inline void acpi_unregister_wakeup_handler(
bool (*wakeup)(void *context), void *context) { }
+ struct acpi_osc_context;
+ static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context)
+ {
+ return 0;
+ }
+
+ static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context)
+ {
+ return 0;
+ }
+
#endif /* !CONFIG_ACPI */
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
struct task_struct;
-/* for sysctl */
-extern int prove_locking;
-extern int lock_stat;
-
#ifdef CONFIG_LOCKDEP
#include <linux/linkage.h>
struct lock_class_key *key, unsigned int subclass,
unsigned long ip);
+ #define lock_set_novalidate_class(l, n, i) \
+ lock_set_class(l, n, &__lockdep_no_validate__, 0, i)
+
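A hypothetical use of the new helper, re-keying an already held lock to the no-validate class (the structure and field names are invented for illustration):

struct foo {
	struct mutex lock;
};

static void foo_lock_novalidate(struct foo *foo)
{
	mutex_lock(&foo->lock);
	/* Opt the held lock out of full lockdep validation. */
	lock_set_novalidate_class(&foo->lock.dep_map, "&foo->lock", _THIS_IP_);
}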
static inline void lock_set_subclass(struct lockdep_map *lock,
unsigned int subclass, unsigned long ip)
{
# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
# define lock_release(l, i) do { } while (0)
# define lock_downgrade(l, i) do { } while (0)
- # define lock_set_class(l, n, k, s, i) do { } while (0)
+ # define lock_set_class(l, n, key, s, i) do { (void)(key); } while (0)
+ # define lock_set_novalidate_class(l, n, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_init() do { } while (0)
# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
}
#endif /* CONFIG_VT_CONSOLE_SLEEP */
+ #ifdef CONFIG_CXL_SUSPEND
+ bool cxl_mem_active(void);
+ #else
+ static inline bool cxl_mem_active(void)
+ {
+ return false;
+ }
+ #endif
+
/*
* Device power management
*/
#ifdef CONFIG_PM
#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
- runtime_resume_fn, idle_fn, sec) \
+ runtime_resume_fn, idle_fn, sec, ns) \
_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
runtime_resume_fn, idle_fn); \
- _EXPORT_SYMBOL(name, sec)
+ __EXPORT_SYMBOL(name, sec, ns)
#else
#define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \
- runtime_resume_fn, idle_fn, sec) \
+ runtime_resume_fn, idle_fn, sec, ns) \
static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \
resume_fn, runtime_suspend_fn, \
runtime_resume_fn, idle_fn)
_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL)
#define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
- _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "")
+ _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", "")
#define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
- _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl")
+ _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", "")
+#define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \
+ _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", #ns)
+#define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \
+ _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", #ns)
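A usage sketch for the new namespaced variant (the driver callbacks and the namespace are invented for illustration):

static int foo_suspend(struct device *dev) { return 0; }
static int foo_resume(struct device *dev) { return 0; }

EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume, FOO_PM);
/* Modules using foo_pm_ops must also declare: MODULE_IMPORT_NS(FOO_PM); */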
/* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */
#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
char *s = buf;
suspend_state_t i;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+ if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+ continue;
if (mem_sleep_states[i]) {
const char *label = mem_sleep_states[i];
else
s += sprintf(s, "%s ", label);
}
+ }
/* Convert the last space to a newline if needed. */
if (s != buf)
}
__setup("pm_debug_messages", pm_debug_messages_setup);
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (!pm_debug_messages_on)
- return;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (defer)
- printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
- else
- printk(KERN_DEBUG "PM: %pV", &vaf);
-
- va_end(args);
-}
-
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
larger and slower, but it gives very useful debugging information
in case of kernel bugs. (precise oopses/stacktraces/warnings)
+config OBJTOOL
+ bool
+
config STACK_VALIDATION
bool "Compile-time stack metadata validation"
- depends on HAVE_STACK_VALIDATION
+ depends on HAVE_STACK_VALIDATION && UNWINDER_FRAME_POINTER
+ select OBJTOOL
default n
help
- Add compile-time checks to validate stack metadata, including frame
- pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure
- that runtime stack traces are more reliable.
-
- This is also a prerequisite for generation of ORC unwind data, which
- is needed for CONFIG_UNWINDER_ORC.
+ Validate frame pointer rules at compile-time. This helps ensure that
+ runtime stack traces are more reliable.
For more information, see
tools/objtool/Documentation/stack-validation.txt.
-config VMLINUX_VALIDATION
+config NOINSTR_VALIDATION
bool
- depends on STACK_VALIDATION && DEBUG_ENTRY
+ depends on HAVE_NOINSTR_VALIDATION && DEBUG_ENTRY
+ select OBJTOOL
default y
config VMLINUX_MAP
help
Debug objects boot parameter default value
-config DEBUG_SLAB
- bool "Debug slab memory allocations"
- depends on DEBUG_KERNEL && SLAB
- help
- Say Y here to have the kernel do limited verification on memory
- allocation as well as poisoning memory on free to catch use of freed
- memory. This can make kmalloc/kfree-intensive workloads much slower.
-
-config SLUB_DEBUG_ON
- bool "SLUB debugging on by default"
- depends on SLUB && SLUB_DEBUG
- default n
- help
- Boot with debugging on by default. SLUB boots by default with
- the runtime debug capabilities switched off. Enabling this is
- equivalent to specifying the "slub_debug" parameter on boot.
- There is no support for more fine grained debug control like
- possible with slub_debug=xxx. SLUB debugging may be switched
- off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
- "slub_debug=-".
-
-config SLUB_STATS
- default n
- bool "Enable SLUB performance statistics"
- depends on SLUB && SYSFS
- help
- SLUB statistics are useful to debug SLUBs allocation behavior in
- order find ways to optimize the allocator. This should never be
- enabled for production use since keeping statistics slows down
- the allocator by a few percentage points. The slabinfo command
- supports the determination of the most active slabs to figure
- out which slabs are relevant to a particular load.
- Try running: slabinfo -DA
-
config HAVE_DEBUG_KMEMLEAK
bool
Say N if unsure.
-config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
- int
- depends on SOFTLOCKUP_DETECTOR
- range 0 1
- default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
- default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
-
config HARDLOCKUP_DETECTOR_PERF
bool
select SOFTLOCKUP_DETECTOR
Say N if unsure.
-config BOOTPARAM_HARDLOCKUP_PANIC_VALUE
- int
- depends on HARDLOCKUP_DETECTOR
- range 0 1
- default 0 if !BOOTPARAM_HARDLOCKUP_PANIC
- default 1 if BOOTPARAM_HARDLOCKUP_PANIC
-
config DETECT_HUNG_TASK
bool "Detect Hung Tasks"
depends on DEBUG_KERNEL
Say N if unsure.
-config BOOTPARAM_HUNG_TASK_PANIC_VALUE
- int
- depends on DETECT_HUNG_TASK
- range 0 1
- default 0 if !BOOTPARAM_HUNG_TASK_PANIC
- default 1 if BOOTPARAM_HUNG_TASK_PANIC
-
config WQ_WATCHDOG
bool "Detect Workqueue Stalls"
depends on DEBUG_KERNEL
include the IPI handler function currently executing (if any)
and relevant stack traces.
- choice
- prompt "Lock debugging: prove subsystem device_lock() correctness"
- depends on PROVE_LOCKING
- help
- For subsystems that have instrumented their usage of the device_lock()
- with nested annotations, enable lock dependency checking. The locking
- hierarchy 'subclass' identifiers are not compatible across
- sub-systems, so only one can be enabled at a time.
-
- config PROVE_NVDIMM_LOCKING
- bool "NVDIMM"
- depends on LIBNVDIMM
- help
- Enable lockdep to validate nd_device_lock() usage.
-
- config PROVE_CXL_LOCKING
- bool "CXL"
- depends on CXL_BUS
- help
- Enable lockdep to validate cxl_device_lock() usage.
-
- endchoice
-
endmenu # lock debugging
config TRACE_IRQFLAGS
so architecture maintainers really need to do what they can
to get the CRNG seeded sooner after the system is booted.
However, since users cannot do anything actionable to
- address this, by default the kernel will issue only a single
- warning for the first use of unseeded randomness.
+ address this, by default this option is disabled.
Say Y here if you want to receive warnings for all uses of
unseeded randomness. This will be of use primarily for
bool "Code coverage for fuzzing"
depends on ARCH_HAS_KCOV
depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS
- depends on !ARCH_WANTS_NO_INSTR || STACK_VALIDATION || \
+ depends on !ARCH_WANTS_NO_INSTR || HAVE_NOINSTR_HACK || \
GCC_VERSION >= 120000 || CLANG_VERSION >= 130000
select DEBUG_FS
select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC
+ select OBJTOOL if HAVE_NOINSTR_HACK
help
KCOV exposes kernel code coverage information in a form suitable
for coverage-guided fuzzing (randomized testing).
If unsure, say N.
config KPROBES_SANITY_TEST
- tristate "Kprobes sanity tests"
+ tristate "Kprobes sanity tests" if !KUNIT_ALL_TESTS
depends on DEBUG_KERNEL
depends on KPROBES
depends on KUNIT
+ default KUNIT_ALL_TESTS
help
This option provides for testing basic kprobes functionality on
boot. Samples of kprobe and kretprobe are inserted and
If unsure, say N.
config BITFIELD_KUNIT
- tristate "KUnit test bitfield functions at runtime"
+ tristate "KUnit test bitfield functions at runtime" if !KUNIT_ALL_TESTS
depends on KUNIT
+ default KUNIT_ALL_TESTS
help
Enable this option to test the bitfield functions at boot.
optimized versions. If unsure, say N.
config RESOURCE_KUNIT_TEST
- tristate "KUnit test for resource API"
+ tristate "KUnit test for resource API" if !KUNIT_ALL_TESTS
depends on KUNIT
+ default KUNIT_ALL_TESTS
help
This builds the resource API unit test.
Tests the logic of API provided by resource.c and ioport.h.
If unsure, say N.
config CMDLINE_KUNIT_TEST
- tristate "KUnit test for cmdline API"
+ tristate "KUnit test for cmdline API" if !KUNIT_ALL_TESTS
depends on KUNIT
+ default KUNIT_ALL_TESTS
help
This builds the cmdline API unit test.
Tests the logic of API provided by cmdline.c.
If unsure, say N.
config BITS_TEST
- tristate "KUnit test for bits.h"
+ tristate "KUnit test for bits.h" if !KUNIT_ALL_TESTS
depends on KUNIT
+ default KUNIT_ALL_TESTS
help
This builds the bits unit test.
Tests the logic of macros defined in bits.h.