Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author     Linus Torvalds <[email protected]>
           Thu, 15 Dec 2022 19:12:21 +0000 (11:12 -0800)
committer  Linus Torvalds <[email protected]>
           Thu, 15 Dec 2022 19:12:21 +0000 (11:12 -0800)
Pull kvm updates from Paolo Bonzini:
 "ARM64:

   - Enable the per-vcpu dirty-ring tracking mechanism, together with an
     option to keep the good old dirty log around for pages that are
     dirtied by something other than a vcpu.

   - Switch to the relaxed parallel fault handling, using RCU to delay
     page table reclaim and giving better performance under load.

   - Relax the MTE ABI, allowing a VMM to use the MAP_SHARED mapping
     option, which multi-process VMMs such as crosvm rely on (see merge
     commit 382b5b87a97d: "Fix a number of issues with MTE, such as
     races on the tags being initialised vs the PG_mte_tagged flag as
     well as the lack of support for VM_SHARED when KVM is involved.
     Patches from Catalin Marinas and Peter Collingbourne").

   - Merge the pKVM shadow vcpu state tracking that allows the
     hypervisor to have its own view of a vcpu, keeping that state
     private.

   - Add support for the PMUv3p5 architecture revision, bringing support
     for 64bit counters on systems that support it, and fix the
     not-quite-compliant CHAIN-ed counter support for the machines that
     actually exist out there.

   - Fix a handful of minor issues around 52bit VA/PA support (64kB
     pages only), as a precursor to the upcoming support for 4kB and
     16kB pages.

   - Pick a small set of documentation and spelling fixes, because no
     good merge window would be complete without those.

  s390:

   - Second batch of the lazy destroy patches

   - First batch of KVM changes for kernel virtual != physical address
     support

   - Removal of an unused function

  x86:

   - Allow compiling out SMM support

   - Cleanup and documentation of SMM state save area format

   - Preserve interrupt shadow in SMM state save area

   - Respond to generic signals during slow page faults

   - Fixes and optimizations for the non-executable huge page errata
     fix.

   - Reprogram all performance counters on PMU filter change

   - Cleanups to Hyper-V emulation and tests

   - Process Hyper-V TLB flushes from a nested guest (i.e. from an L2
     guest running on top of an L1 Hyper-V hypervisor)

   - Advertise several new Intel features

   - x86 Xen-for-KVM:

      - Allow the Xen runstate information to cross a page boundary

      - Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured

      - Add support for 32-bit guests in SCHEDOP_poll

   - Notable x86 fixes and cleanups:

      - One-off fixes for various emulation flows (SGX, VMXON, NRIPS=0).

      - Reinstate IBPB on emulated VM-Exit that was incorrectly dropped
        a few years back when eliminating unnecessary barriers when
        switching between vmcs01 and vmcs02.

      - Clean up vmread_error_trampoline() to make it more obvious that
        params must be passed on the stack, even for x86-64.

      - Let userspace set all supported bits in MSR_IA32_FEAT_CTL
        irrespective of the current guest CPUID.

      - Fudge around a race with TSC refinement that results in KVM
        incorrectly thinking a guest needs TSC scaling when running on a
        CPU with a constant TSC, but no hardware-enumerated TSC
        frequency.

      - Advertise (on AMD) that the SMM_CTL MSR is not supported

      - Remove unnecessary exports

  Generic:

   - Support for responding to signals during page faults; introduces
     new FOLL_INTERRUPTIBLE flag that was reviewed by mm folks

  Selftests:

   - Fix an inverted check in the access tracking perf test, and restore
     support for asserting that there aren't too many idle pages when
     running on bare metal.

   - Fix build errors that occur in certain setups (unsure exactly what
     is unique about the problematic setup) due to glibc overriding
     static_assert() to a variant that requires a custom message.

   - Introduce actual atomics for clear/set_bit() in selftests

   - Add support for pinning vCPUs in dirty_log_perf_test.

   - Rename the so-called "perf_util" framework to "memstress".

   - Add a lightweight pseudo RNG for guest use, and use it to randomize
     the access pattern and write vs. read percentage in the memstress
     tests.

   - Add a common ucall implementation; code dedup and pre-work for
     running SEV (and beyond) guests in selftests.

   - Provide a common constructor and arch hook, which will eventually
     be used by x86 to automatically select the right hypercall (AMD vs.
     Intel).

   - A bunch of added/enabled/fixed selftests for ARM64, covering
     memslots, breakpoints, stage-2 faults and access tracking.

   - x86-specific selftest changes:

      - Clean up x86's page table management.

      - Clean up and enhance the "smaller maxphyaddr" test, and add a
        related test to cover generic emulation failure.

      - Clean up the nEPT support checks.

      - Add X86_PROPERTY_* framework to retrieve multi-bit CPUID values.

      - Fix an ordering issue in the AMX test introduced by recent
        conversions to use kvm_cpu_has(), and harden the code to guard
        against similar bugs in the future. Anything that triggers
        caching of KVM's supported CPUID, kvm_cpu_has() in this case,
        effectively hides opt-in XSAVE features if the caching occurs
        before the test opts in via prctl().

  Documentation:

   - Remove deleted ioctls from documentation

   - Clean up the docs for the x86 MSR filter.

   - Various fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (361 commits)
  KVM: x86: Add proper ReST tables for userspace MSR exits/flags
  KVM: selftests: Allocate ucall pool from MEM_REGION_DATA
  KVM: arm64: selftests: Align VA space allocator with TTBR0
  KVM: arm64: Fix benign bug with incorrect use of VA_BITS
  KVM: arm64: PMU: Fix period computation for 64bit counters with 32bit overflow
  KVM: x86: Advertise that the SMM_CTL MSR is not supported
  KVM: x86: remove unnecessary exports
  KVM: selftests: Fix spelling mistake "probabalistic" -> "probabilistic"
  tools: KVM: selftests: Convert clear/set_bit() to actual atomics
  tools: Drop "atomic_" prefix from atomic test_and_set_bit()
  tools: Drop conflicting non-atomic test_and_{clear,set}_bit() helpers
  KVM: selftests: Use non-atomic clear/set bit helpers in KVM tests
  perf tools: Use dedicated non-atomic clear/set bit helpers
  tools: Take @bit as an "unsigned long" in {clear,set}_bit() helpers
  KVM: arm64: selftests: Enable single-step without a "full" ucall()
  KVM: x86: fix APICv/x2AVIC disabled when vm reboot by itself
  KVM: Remove stale comment about KVM_REQ_UNHALT
  KVM: Add missing arch for KVM_CREATE_DEVICE and KVM_{SET,GET}_DEVICE_ATTR
  KVM: Reference to kvm_userspace_memory_region in doc and comments
  KVM: Delete all references to removed KVM_SET_MEMORY_ALIAS ioctl
  ...

31 files changed:
Documentation/virt/kvm/api.rst
MAINTAINERS
arch/arm64/Kconfig
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/pgtable.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/image-vars.h
arch/arm64/mm/fault.c
arch/s390/kernel/entry.S
arch/s390/kvm/vsie.c
arch/s390/mm/gmap.c
arch/s390/mm/init.c
arch/x86/events/intel/lbr.c
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/hyperv-tlfs.h
arch/x86/kernel/kvm.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/reverse_cpuid.h
arch/x86/kvm/svm/svm.c
arch/x86/kvm/x86.c
drivers/s390/crypto/vfio_ap_ops.c
include/asm-generic/hyperv-tlfs.h
include/linux/kvm_host.h
include/linux/mm.h
include/linux/page-flags.h
mm/Kconfig
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
virt/kvm/kvm_main.c

diff --combined Documentation/virt/kvm/api.rst
index 896914e3a8475a12fe3b841c736566726ba47157,778c6460d1de637d69e24410cd53c096d4999226..0dd5d8733dd583ff77a4ef9072831e12e289a3b1
@@@ -272,18 -272,6 +272,6 @@@ the VCPU file descriptor can be mmap-ed
    KVM_CAP_DIRTY_LOG_RING, see section 8.3.
  
  
- 4.6 KVM_SET_MEMORY_REGION
- -------------------------
- :Capability: basic
- :Architectures: all
- :Type: vm ioctl
- :Parameters: struct kvm_memory_region (in)
- :Returns: 0 on success, -1 on error
- This ioctl is obsolete and has been removed.
  4.7 KVM_CREATE_VCPU
  -------------------
  
@@@ -368,17 -356,6 +356,6 @@@ see the description of the capability
  Note that the Xen shared info page, if configured, shall always be assumed
  to be dirty. KVM will not explicitly mark it such.
  
- 4.9 KVM_SET_MEMORY_ALIAS
- ------------------------
- :Capability: basic
- :Architectures: x86
- :Type: vm ioctl
- :Parameters: struct kvm_memory_alias (in)
- :Returns: 0 (success), -1 (error)
- This ioctl is obsolete and has been removed.
  
  4.10 KVM_RUN
  ------------
@@@ -1332,7 -1309,7 +1309,7 @@@ yet and must be cleared on entry
        __u64 userspace_addr; /* start of the userspace allocated memory */
    };
  
-   /* for kvm_memory_region::flags */
+   /* for kvm_userspace_memory_region::flags */
    #define KVM_MEM_LOG_DIRTY_PAGES     (1UL << 0)
    #define KVM_MEM_READONLY    (1UL << 1)
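
For illustration, a minimal userspace sketch of registering a memslot with dirty
logging enabled, assuming the uAPI definitions from <linux/kvm.h>; the VM fd,
slot number and addresses are placeholders and error handling is omitted::

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Hypothetical helper: back [gpa, gpa+size) with @hva and track dirty pages. */
    static int set_dirty_logged_slot(int vm_fd, __u32 slot, __u64 gpa,
                                     __u64 size, void *hva)
    {
            struct kvm_userspace_memory_region region;

            memset(&region, 0, sizeof(region));
            region.slot = slot;
            region.flags = KVM_MEM_LOG_DIRTY_PAGES;   /* enable dirty tracking */
            region.guest_phys_addr = gpa;
            region.memory_size = size;
            region.userspace_addr = (unsigned long)hva;

            return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
    }
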
  
@@@ -1377,10 -1354,6 +1354,6 @@@ the memory region are automatically ref
  mmap() that affects the region will be made visible immediately.  Another
  example is madvise(MADV_DROP).
  
- It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
- The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
- allocation and is deprecated.
  
  4.36 KVM_SET_TSS_ADDR
  ---------------------
@@@ -3293,6 -3266,7 +3266,7 @@@ valid entries found
  ----------------------
  
  :Capability: KVM_CAP_DEVICE_CTRL
+ :Architectures: all
  :Type: vm ioctl
  :Parameters: struct kvm_create_device (in/out)
  :Returns: 0 on success, -1 on error
@@@ -3333,6 -3307,7 +3307,7 @@@ number
  :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
               KVM_CAP_VCPU_ATTRIBUTES for vcpu device
               KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set)
+ :Architectures: x86, arm64, s390
  :Type: device ioctl, vm ioctl, vcpu ioctl
  :Parameters: struct kvm_device_attr
  :Returns: 0 on success, -1 on error
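
A minimal sketch of how userspace typically drives KVM_SET_DEVICE_ATTR, assuming
the struct kvm_device_attr layout from <linux/kvm.h>; the group/attr values and
the target fd are device-specific placeholders::

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* @group and @attr are device specific; @val points to the value to set. */
    static int set_device_attr(int dev_fd, __u32 group, __u64 attr, __u64 *val)
    {
            struct kvm_device_attr da = {
                    .group = group,
                    .attr  = attr,
                    .addr  = (unsigned long)val,   /* userspace pointer to the payload */
            };

            return ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &da);
    }
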
@@@ -4104,80 -4079,71 +4079,71 @@@ flags values for ``struct kvm_msr_filte
  ``KVM_MSR_FILTER_READ``
  
    Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
-   indicates that a read should immediately fail, while a 1 indicates that
-   a read for a particular MSR should be handled regardless of the default
+   indicates that read accesses should be denied, while a 1 indicates that
+   a read for a particular MSR should be allowed regardless of the default
    filter action.
  
  ``KVM_MSR_FILTER_WRITE``
  
    Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
-   indicates that a write should immediately fail, while a 1 indicates that
-   a write for a particular MSR should be handled regardless of the default
+   indicates that write accesses should be denied, while a 1 indicates that
+   a write for a particular MSR should be allowed regardless of the default
    filter action.
  
- ``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
-   Filter both read and write accesses to MSRs using the given bitmap. A 0
-   in the bitmap indicates that both reads and writes should immediately fail,
-   while a 1 indicates that reads and writes for a particular MSR are not
-   filtered by this range.
  flags values for ``struct kvm_msr_filter``:
  
  ``KVM_MSR_FILTER_DEFAULT_ALLOW``
  
    If no filter range matches an MSR index that is getting accessed, KVM will
-   fall back to allowing access to the MSR.
+   allow accesses to all MSRs by default.
  
  ``KVM_MSR_FILTER_DEFAULT_DENY``
  
    If no filter range matches an MSR index that is getting accessed, KVM will
-   fall back to rejecting access to the MSR. In this mode, all MSRs that should
-   be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+ deny accesses to all MSRs by default.
+
+ This ioctl allows userspace to define up to 16 bitmaps of MSR ranges to deny
+ guest MSR accesses that would normally be allowed by KVM.  If an MSR is not
+ covered by a specific range, the "default" filtering behavior applies.  Each
+ bitmap range covers MSRs from [base .. base+nmsrs).
  
- This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
- specify whether a certain MSR access should be explicitly filtered for or not.
+ If an MSR access is denied by userspace, the resulting KVM behavior depends on
+ whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is
+ enabled.  If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace
+ on denied accesses, i.e. userspace effectively intercepts the MSR access.  If
+ KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest
+ on denied accesses.
  
- If this ioctl has never been invoked, MSR accesses are not guarded and the
- default KVM in-kernel emulation behavior is fully preserved.
+ If an MSR access is allowed by userspace, KVM will emulate and/or virtualize
+ the access in accordance with the vCPU model.  Note, KVM may still ultimately
+ inject a #GP if an access is allowed by userspace, e.g. if KVM doesn't support
+ the MSR, or to follow architectural behavior for the MSR.
+
+ By default, KVM operates in KVM_MSR_FILTER_DEFAULT_ALLOW mode with no MSR range
+ filters.
  
  Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
  filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
  an error.
  
- As soon as the filtering is in place, every MSR access is processed through
- the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
- x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
- and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
- register.
  .. warning::
-    MSR accesses coming from nested vmentry/vmexit are not filtered.
+    MSR accesses as part of nested VM-Enter/VM-Exit are not filtered.
     This includes both writes to individual VMCS fields and reads/writes
     through the MSR lists pointed to by the VMCS.
  
- If a bit is within one of the defined ranges, read and write accesses are
- guarded by the bitmap's value for the MSR index if the kind of access
- is included in the ``struct kvm_msr_filter_range`` flags.  If no range
- cover this particular access, the behavior is determined by the flags
- field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
- and ``KVM_MSR_FILTER_DEFAULT_DENY``.
- Each bitmap range specifies a range of MSRs to potentially allow access on.
- The range goes from MSR index [base .. base+nmsrs]. The flags field
- indicates whether reads, writes or both reads and writes are filtered
- by setting a 1 bit in the bitmap for the corresponding MSR index.
- If an MSR access is not permitted through the filtering, it generates a
- #GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
- allows user space to deflect and potentially handle various MSR accesses
- into user space.
+    x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that
+    cover any x2APIC MSRs).
  
  Note, invoking this ioctl while a vCPU is running is inherently racy.  However,
  KVM does guarantee that vCPUs will see either the previous filter or the new
  filter, e.g. MSRs with identical settings in both the old and new filter will
  have deterministic behavior.
  
+ Similarly, if userspace wishes to intercept on denied accesses,
+ KVM_MSR_EXIT_REASON_FILTER must be enabled before activating any filters, and
+ left enabled until after all filters are deactivated.  Failure to do so may
+ result in KVM injecting a #GP instead of exiting to userspace.
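
As an illustration of the bitmap semantics described above, a hedged userspace
sketch that denies guest reads of a single MSR while keeping the default-allow
policy; it assumes the uAPI structs from <linux/kvm.h>, and msr_index/vm_fd are
placeholders::

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int deny_msr_read(int vm_fd, __u32 msr_index)
    {
            __u8 bitmap[8] = { 0 };        /* bit 0 clear => reads of msr_index denied */
            struct kvm_msr_filter filter;

            memset(&filter, 0, sizeof(filter));
            filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;
            filter.ranges[0].flags  = KVM_MSR_FILTER_READ;
            filter.ranges[0].base   = msr_index;
            filter.ranges[0].nmsrs  = 1;
            filter.ranges[0].bitmap = bitmap;

            /* Denied reads exit to userspace only if KVM_MSR_EXIT_REASON_FILTER
             * was enabled via KVM_CAP_X86_USER_SPACE_MSR; otherwise KVM injects #GP. */
            return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
    }
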
  4.98 KVM_CREATE_SPAPR_TCE_64
  ----------------------------
  
@@@ -5163,10 -5129,13 +5129,13 @@@ KVM_PV_ENABL
    =====      =============================
  
  KVM_PV_DISABLE
-   Deregister the VM from the Ultravisor and reclaim the memory that
-   had been donated to the Ultravisor, making it usable by the kernel
-   again.  All registered VCPUs are converted back to non-protected
-   ones.
+   Deregister the VM from the Ultravisor and reclaim the memory that had
+   been donated to the Ultravisor, making it usable by the kernel again.
+   All registered VCPUs are converted back to non-protected ones. If a
+   previous protected VM had been prepared for asynchronous teardown with
+   KVM_PV_ASYNC_CLEANUP_PREPARE and not subsequently torn down with
+   KVM_PV_ASYNC_CLEANUP_PERFORM, it will be torn down in this call
+   together with the current protected VM.
  
  KVM_PV_VM_SET_SEC_PARMS
    Pass the image header from VM memory to the Ultravisor in
@@@ -5289,6 -5258,36 +5258,36 @@@ KVM_PV_DUM
      authentication tag all of which are needed to decrypt the dump at a
      later time.
  
+ KVM_PV_ASYNC_CLEANUP_PREPARE
+   :Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE
+   Prepare the current protected VM for asynchronous teardown. Most
+   resources used by the current protected VM will be set aside for a
+   subsequent asynchronous teardown. The current protected VM will then
+   resume execution immediately as non-protected. There can be at most
+   one protected VM prepared for asynchronous teardown at any time. If
+   a protected VM had already been prepared for teardown without
+   subsequently calling KVM_PV_ASYNC_CLEANUP_PERFORM, this call will
+   fail. In that case, the userspace process should issue a normal
+   KVM_PV_DISABLE. The resources set aside with this call will need to
+   be cleaned up with a subsequent call to KVM_PV_ASYNC_CLEANUP_PERFORM
+   or KVM_PV_DISABLE, otherwise they will be cleaned up when KVM
+   terminates. KVM_PV_ASYNC_CLEANUP_PREPARE can be called again as soon
+   as cleanup starts, i.e. before KVM_PV_ASYNC_CLEANUP_PERFORM finishes.
+
+ KVM_PV_ASYNC_CLEANUP_PERFORM
+   :Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE
+   Tear down the protected VM previously prepared for teardown with
+   KVM_PV_ASYNC_CLEANUP_PREPARE. The resources that had been set aside
+   will be freed during the execution of this command. This PV command
+   should ideally be issued by userspace from a separate thread. If a
+   fatal signal is received (or the process terminates naturally), the
+   command will terminate immediately without completing, and the normal
+   KVM shutdown procedure will take care of cleaning up all remaining
+   protected VMs, including the ones whose teardown was interrupted by
+   process termination.
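
A hedged sketch of the intended call sequence, assuming struct kvm_pv_cmd and
KVM_S390_PV_COMMAND from <linux/kvm.h>; vm_fd and the threading are placeholders::

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int pv_cmd(int vm_fd, __u32 cmd)
    {
            struct kvm_pv_cmd pv = { .cmd = cmd };

            return ioctl(vm_fd, KVM_S390_PV_COMMAND, &pv);
    }

    /* Reboot path: set the old protected VM aside without blocking.  */
    /*     pv_cmd(vm_fd, KVM_PV_ASYNC_CLEANUP_PREPARE);               */
    /* Worker thread, later: free the resources that were set aside.  */
    /*     pv_cmd(vm_fd, KVM_PV_ASYNC_CLEANUP_PERFORM);               */
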
  4.126 KVM_XEN_HVM_SET_ATTR
  --------------------------
  
        union {
                __u8 long_mode;
                __u8 vector;
+               __u8 runstate_update_flag;
                struct {
                        __u64 gfn;
                } shared_info;
@@@ -5383,6 -5383,14 +5383,14 @@@ KVM_XEN_ATTR_TYPE_XEN_VERSIO
    event channel delivery, so responding within the kernel without
    exiting to userspace is beneficial.
  
+ KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG
+   This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+   support for KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG. It enables the
+   XEN_RUNSTATE_UPDATE flag which allows guest vCPUs to safely read
+   other vCPUs' vcpu_runstate_info. Xen guests enable this feature via
+   the VM_ASST_TYPE_runstate_update_flag of the HYPERVISOR_vm_assist
+   hypercall.
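
A minimal VMM-side sketch of turning the attribute on, assuming struct
kvm_xen_hvm_attr from <linux/kvm.h>; vm_fd is a placeholder and error handling
is omitted::

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int xen_enable_runstate_update_flag(int vm_fd)
    {
            struct kvm_xen_hvm_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG;
            attr.u.runstate_update_flag = 1;   /* typically set after the guest's vm_assist call */

            return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
    }
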
  4.127 KVM_XEN_HVM_GET_ATTR
  --------------------------
  
@@@ -6440,31 -6448,35 +6448,35 @@@ if it decides to decode and emulate th
  
  Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is
  enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code
- will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR
+ may instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR
  exit for writes.
  
- The "reason" field specifies why the MSR trap occurred. User space will only
receive MSR exit traps when a particular reason was requested during through
+ The "reason" field specifies why the MSR interception occurred. Userspace will
only receive MSR exits when a particular reason was requested during through
  ENABLE_CAP. Currently valid exit reasons are:
  
-       KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM
-       KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits
-       KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER
+ ============================ ========================================
+  KVM_MSR_EXIT_REASON_UNKNOWN access to MSR that is unknown to KVM
+  KVM_MSR_EXIT_REASON_INVAL   access to invalid MSRs or reserved bits
+  KVM_MSR_EXIT_REASON_FILTER  access blocked by KVM_X86_SET_MSR_FILTER
+ ============================ ========================================
  
- For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest
- wants to read. To respond to this request with a successful read, user space
+ For KVM_EXIT_X86_RDMSR, the "index" field tells userspace which MSR the guest
+ wants to read. To respond to this request with a successful read, userspace
  writes the respective data into the "data" field and must continue guest
  execution to ensure the read data is transferred into guest register state.
  
- If the RDMSR request was unsuccessful, user space indicates that with a "1" in
+ If the RDMSR request was unsuccessful, userspace indicates that with a "1" in
  the "error" field. This will inject a #GP into the guest when the VCPU is
  executed again.
  
- For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest
- wants to write. Once finished processing the event, user space must continue
- vCPU execution. If the MSR write was unsuccessful, user space also sets the
+ For KVM_EXIT_X86_WRMSR, the "index" field tells userspace which MSR the guest
+ wants to write. Once finished processing the event, userspace must continue
+ vCPU execution. If the MSR write was unsuccessful, userspace also sets the
  "error" field to "1".
  
+ See KVM_X86_SET_MSR_FILTER for details on the interaction with MSR filtering.
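
To make the flow concrete, a hedged fragment of a VMM run loop that services
these exits, assuming the kvm_run::msr layout from <linux/kvm.h>; my_msr and
my_val are hypothetical::

    #include <linux/kvm.h>

    /* Emulate reads of one hypothetical MSR; fail everything else back as #GP. */
    static void handle_msr_exit(struct kvm_run *run, __u32 my_msr, __u64 my_val)
    {
            switch (run->exit_reason) {
            case KVM_EXIT_X86_RDMSR:
                    if (run->msr.index == my_msr) {
                            run->msr.data = my_val;   /* value seen by the guest */
                            run->msr.error = 0;
                    } else {
                            run->msr.error = 1;       /* #GP on the next KVM_RUN */
                    }
                    break;
            case KVM_EXIT_X86_WRMSR:
                    /* Accept (and discard) writes to my_msr, #GP everything else. */
                    run->msr.error = (run->msr.index != my_msr);
                    break;
            }
    }
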
  ::
  
  
@@@ -7213,13 -7225,14 +7225,13 @@@ veto the transition
  :Parameters: args[0] is the maximum poll time in nanoseconds
  :Returns: 0 on success; -1 on error
  
 -This capability overrides the kvm module parameter halt_poll_ns for the
 -target VM.
 -
 -VCPU polling allows a VCPU to poll for wakeup events instead of immediately
 -scheduling during guest halts. The maximum time a VCPU can spend polling is
 -controlled by the kvm module parameter halt_poll_ns. This capability allows
 -the maximum halt time to specified on a per-VM basis, effectively overriding
 -the module parameter for the target VM.
 +KVM_CAP_HALT_POLL overrides the kvm.halt_poll_ns module parameter to set the
 +maximum halt-polling time for all vCPUs in the target VM. This capability can
 +be invoked at any time and any number of times to dynamically change the
 +maximum halt-polling time.
 +
 +See Documentation/virt/kvm/halt-polling.rst for more information on halt
 +polling.
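
For example, a hedged sketch of overriding the polling limit for one VM via
KVM_ENABLE_CAP; vm_fd and the chosen value are placeholders::

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int set_vm_halt_poll_ns(int vm_fd, __u64 poll_ns)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_HALT_POLL;
            cap.args[0] = poll_ns;   /* e.g. 200000 for a 200us cap */

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
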
  
  7.21 KVM_CAP_X86_USER_SPACE_MSR
  -------------------------------
  :Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report
  :Returns: 0 on success; -1 on error
  
- This capability enables trapping of #GP invoking RDMSR and WRMSR instructions
- into user space.
+ This capability allows userspace to intercept RDMSR and WRMSR instructions if
+ access to an MSR is denied.  By default, KVM injects #GP on denied accesses.
  
  When a guest requests to read or write an MSR, KVM may not implement all MSRs
  that are relevant to a respective system. It also does not differentiate by
  CPU type.
  
- To allow more fine grained control over MSR handling, user space may enable
+ To allow more fine grained control over MSR handling, userspace may enable
  this capability. With it enabled, MSR accesses that match the mask specified in
- args[0] and trigger a #GP event inside the guest by KVM will instead trigger
- KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
- can then handle to implement model specific MSR handling and/or user notifications
- to inform a user that an MSR was not handled.
+ args[0] and would trigger a #GP inside the guest will instead trigger
+ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications.  Userspace
+ can then implement model specific MSR handling and/or user notifications
+ to inform a user that an MSR was not emulated/virtualized by KVM.
+ The valid mask flags are:
+ ============================ ===============================================
+  KVM_MSR_EXIT_REASON_UNKNOWN intercept accesses to unknown (to KVM) MSRs
+  KVM_MSR_EXIT_REASON_INVAL   intercept accesses that are architecturally
+                              invalid according to the vCPU model and/or mode
+  KVM_MSR_EXIT_REASON_FILTER  intercept accesses that are denied by userspace
+                              via KVM_X86_SET_MSR_FILTER
+ ============================ ===============================================
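
A hedged sketch of enabling the capability so that filter-denied accesses exit
to userspace (see also KVM_X86_SET_MSR_FILTER); vm_fd is a placeholder::

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_msr_filter_exits(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_X86_USER_SPACE_MSR,
                    .args = { KVM_MSR_EXIT_REASON_FILTER },   /* mask of reasons to intercept */
            };

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
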
  
  7.22 KVM_CAP_X86_BUS_LOCK_EXIT
  -------------------------------
@@@ -7384,8 -7407,9 +7406,9 @@@ hibernation of the host; however the VM
  tags as appropriate if the VM is migrated.
  
  When this capability is enabled all memory in memslots must be mapped as
- not-shareable (no MAP_SHARED), attempts to create a memslot with a
- MAP_SHARED mmap will result in an -EINVAL return.
+ ``MAP_ANONYMOUS`` or with a RAM-based file mapping (``tmpfs``, ``memfd``);
+ attempts to create a memslot with an invalid mmap will result in an
+ -EINVAL return.
  
  When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to
  perform a bulk copy of tags to/from the guest.
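
A hedged arm64-only sketch of that bulk tag copy, assuming struct
kvm_arm_copy_mte_tags and the KVM_ARM_TAGS_FROM_GUEST flag from the arm64 uAPI
headers; the buffer sizing (one tag byte per 16-byte granule) and vm_fd are
placeholders::

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static long copy_tags_from_guest(int vm_fd, __u64 ipa, __u64 len, void *buf)
    {
            struct kvm_arm_copy_mte_tags copy = {
                    .guest_ipa = ipa,                     /* page-aligned guest IPA */
                    .length    = len,                     /* multiple of the page size */
                    .addr      = buf,                     /* tag buffer in userspace */
                    .flags     = KVM_ARM_TAGS_FROM_GUEST, /* use ..._TO_GUEST on restore */
            };

            /* Returns the number of bytes copied, or a negative error. */
            return ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, &copy);
    }
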
@@@ -7901,7 -7925,7 +7924,7 @@@ KVM_EXIT_X86_WRMSR exit notifications
  This capability indicates that KVM supports that accesses to user defined MSRs
  may be rejected. With this capability exposed, KVM exports new VM ioctl
  KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR
- ranges that KVM should reject access to.
+ ranges that KVM should deny access to.
  
  In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to
  trap and emulate MSRs that are outside of the scope of KVM as well as
@@@ -7920,7 -7944,7 +7943,7 @@@ regardless of what has actually been ex
  8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
  ----------------------------------------------------------
  
- :Architectures: x86
+ :Architectures: x86, arm64
  :Parameters: args[0] - size of the dirty log ring
  
  KVM is capable of tracking dirty memory using ring buffers that are
@@@ -8002,13 -8026,6 +8025,6 @@@ flushing is done by the KVM_GET_DIRTY_L
  needs to kick the vcpu out of KVM_RUN using a signal.  The resulting
  vmexit ensures that all dirty GFNs are flushed to the dirty rings.
  
- NOTE: the capability KVM_CAP_DIRTY_LOG_RING and the corresponding
- ioctl KVM_RESET_DIRTY_RINGS are mutual exclusive to the existing ioctls
- KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG.  After enabling
- KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual
- machine will switch to ring-buffer dirty page tracking and further
- KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
  NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that
  should be exposed by weakly ordered architecture, in order to indicate
  the additional memory ordering requirements imposed on userspace when
@@@ -8017,6 -8034,33 +8033,33 @@@ Architecture with TSO-like ordering (su
  expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL
  to userspace.
  
+ After enabling the dirty rings, userspace needs to detect the
+ KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP capability to see whether the ring
+ structures can be backed by per-slot bitmaps. When this capability is
+ advertised, the architecture can dirty guest pages without a vcpu/ring
+ context, so some of the dirty information will still be maintained in
+ the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP can't be
+ enabled if KVM_CAP_DIRTY_LOG_RING_ACQ_REL hasn't been enabled, or if
+ any memslot has already been created.
+
+ Note that the bitmap here is only a backup of the ring structure. The
+ use of the ring and bitmap combination is only beneficial if there is
+ only a very small amount of memory that is dirtied out of vcpu/ring
+ context. Otherwise, the stand-alone per-slot bitmap mechanism needs to
+ be considered.
+
+ To collect dirty bits in the backup bitmap, userspace can use the same
+ KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all
+ the generation of the dirty bits is done in a single pass. Collecting
+ the dirty bitmap should be the very last thing that the VMM does before
+ considering the state as complete. The VMM needs to ensure that the dirty
+ state is final and avoid missing dirty pages from another ioctl ordered
+ after the bitmap collection.
+
+ NOTE: One example of using the backup bitmap is saving arm64 vgic/its
+ tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on
+ KVM device "kvm-arm-vgic-its" when dirty ring is enabled.
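
A hedged sketch of that final collection pass over one slot's backup bitmap,
assuming struct kvm_dirty_log from <linux/kvm.h>; the slot number and bitmap
sizing (one bit per page in the slot) are placeholders::

    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void *collect_backup_bitmap(int vm_fd, __u32 slot, size_t bitmap_bytes)
    {
            struct kvm_dirty_log log;
            void *bitmap = calloc(1, bitmap_bytes);

            if (!bitmap)
                    return NULL;

            memset(&log, 0, sizeof(log));
            log.slot = slot;
            log.dirty_bitmap = bitmap;

            if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log)) {
                    free(bitmap);
                    return NULL;
            }
            return bitmap;   /* set bits were dirtied outside vcpu/ring context */
    }
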
  8.30 KVM_CAP_XEN_HVM
  --------------------
  
  This capability indicates the features that Xen supports for hosting Xen
  PVHVM guests. Valid flags are::
  
-   #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR    (1 << 0)
-   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL  (1 << 1)
-   #define KVM_XEN_HVM_CONFIG_SHARED_INFO      (1 << 2)
-   #define KVM_XEN_HVM_CONFIG_RUNSTATE         (1 << 3)
-   #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL    (1 << 4)
-   #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND      (1 << 5)
+   #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR            (1 << 0)
+   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL          (1 << 1)
+   #define KVM_XEN_HVM_CONFIG_SHARED_INFO              (1 << 2)
+   #define KVM_XEN_HVM_CONFIG_RUNSTATE                 (1 << 3)
+   #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL            (1 << 4)
+   #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND              (1 << 5)
+   #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG     (1 << 6)
  
  The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
  ioctl is available, for the guest to set its hypercall page.
@@@ -8062,6 -8107,18 +8106,18 @@@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UP
  related to event channel delivery, timers, and the XENVER_version
  interception.
  
+ The KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG flag indicates that KVM supports
+ the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute in the KVM_XEN_HVM_SET_ATTR
+ and KVM_XEN_HVM_GET_ATTR ioctls. This controls whether KVM will set the
+ XEN_RUNSTATE_UPDATE flag in guest memory mapped vcpu_runstate_info during
+ updates of the runstate information. Note that versions of KVM which support
+ the RUNSTATE feature above, but not the RUNSTATE_UPDATE_FLAG feature, will
+ always set the XEN_RUNSTATE_UPDATE flag when updating the guest structure,
+ which is perhaps counterintuitive. When this flag is advertised, KVM will
+ behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless
+ specifically enabled (by the guest making the hypercall, causing the VMM
+ to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute).
+
  8.31 KVM_CAP_PPC_MULTITCE
  -------------------------
  
diff --combined MAINTAINERS
index b454c20a5c88ab923327f4e9171ad5ca54c27c73,89672a59c0c3a107a4026c32b71192a3101d7e03..096ae475e21cd95018290cb9c83e17898a8b5ba2
@@@ -312,13 -312,6 +312,13 @@@ L:       [email protected]
  S:    Maintained
  F:    drivers/counter/104-quad-8.c
  
 +ACCES IDIO-16 GPIO LIBRARY
 +M:    William Breathitt Gray <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    drivers/gpio/gpio-idio-16.c
 +F:    drivers/gpio/gpio-idio-16.h
 +
  ACCES PCI-IDIO-16 GPIO DRIVER
  M:    William Breathitt Gray <[email protected]>
  L:    [email protected]
@@@ -782,24 -775,6 +782,24 @@@ T:       git git://linuxtv.org/media_tree.gi
  F:    Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
  F:    drivers/media/platform/sunxi/sun4i-csi/
  
 +ALLWINNER A31 CSI DRIVER
 +M:    Yong Deng <[email protected]>
 +M:    Paul Kocialkowski <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git git://linuxtv.org/media_tree.git
 +F:    Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
 +F:    drivers/media/platform/sunxi/sun6i-csi/
 +
 +ALLWINNER A31 ISP DRIVER
 +M:    Paul Kocialkowski <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git git://linuxtv.org/media_tree.git
 +F:    Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml
 +F:    drivers/staging/media/sunxi/sun6i-isp/
 +F:    drivers/staging/media/sunxi/sun6i-isp/uapi/sun6i-isp-config.h
 +
  ALLWINNER A31 MIPI CSI-2 BRIDGE DRIVER
  M:    Paul Kocialkowski <[email protected]>
  L:    [email protected]
@@@ -1118,16 -1093,6 +1118,16 @@@ S:    Maintaine
  F:    Documentation/hid/amd-sfh*
  F:    drivers/hid/amd-sfh-hid/
  
 +AMLOGIC DDR PMU DRIVER
 +M:    Jiucheng Xu <[email protected]>
 +L:    [email protected]
 +S:    Supported
 +W:    http://www.amlogic.com
 +F:    Documentation/admin-guide/perf/meson-ddr-pmu.rst
 +F:    Documentation/devicetree/bindings/perf/amlogic,g12-ddr-pmu.yaml
 +F:    drivers/perf/amlogic/
 +F:    include/soc/amlogic/
 +
  AMPHION VPU CODEC V4L2 DRIVER
  M:    Ming Qian <[email protected]>
  M:    Shijie Qin <[email protected]>
@@@ -1720,7 -1685,7 +1720,7 @@@ M:      Miquel Raynal <miquel.raynal@bootlin
  M:    Naga Sureshkumar Relli <[email protected]>
  L:    [email protected] (moderated for non-subscribers)
  S:    Maintained
 -F:    Documentation/devicetree/bindings/memory-controllers/arm,pl353-smc.yaml
 +F:    Documentation/devicetree/bindings/memory-controllers/arm,pl35x-smc.yaml
  F:    drivers/memory/pl353-smc.c
  
  ARM PRIMECELL CLCD PL110 DRIVER
@@@ -1932,14 -1897,12 +1932,14 @@@ T:   git https://github.com/AsahiLinux/li
  F:    Documentation/devicetree/bindings/arm/apple.yaml
  F:    Documentation/devicetree/bindings/arm/apple/*
  F:    Documentation/devicetree/bindings/clock/apple,nco.yaml
 +F:    Documentation/devicetree/bindings/cpufreq/apple,cluster-cpufreq.yaml
  F:    Documentation/devicetree/bindings/dma/apple,admac.yaml
  F:    Documentation/devicetree/bindings/i2c/apple,i2c.yaml
  F:    Documentation/devicetree/bindings/interrupt-controller/apple,*
  F:    Documentation/devicetree/bindings/iommu/apple,dart.yaml
  F:    Documentation/devicetree/bindings/iommu/apple,sart.yaml
  F:    Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml
 +F:    Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml
  F:    Documentation/devicetree/bindings/nvme/apple,nvme-ans.yaml
  F:    Documentation/devicetree/bindings/nvmem/apple,efuses.yaml
  F:    Documentation/devicetree/bindings/pci/apple,pcie.yaml
@@@ -1947,9 -1910,7 +1947,9 @@@ F:      Documentation/devicetree/bindings/pi
  F:    Documentation/devicetree/bindings/power/apple*
  F:    Documentation/devicetree/bindings/watchdog/apple,wdt.yaml
  F:    arch/arm64/boot/dts/apple/
 +F:    drivers/bluetooth/hci_bcm4377.c
  F:    drivers/clk/clk-apple-nco.c
 +F:    drivers/cpufreq/apple-soc-cpufreq.c
  F:    drivers/dma/apple-admac.c
  F:    drivers/i2c/busses/i2c-pasemi-core.c
  F:    drivers/i2c/busses/i2c-pasemi-platform.c
@@@ -2236,7 -2197,7 +2236,7 @@@ M:      Wei Xu <[email protected]
  L:    [email protected] (moderated for non-subscribers)
  S:    Supported
  W:    http://www.hisilicon.com
 -T:    git git://github.com/hisilicon/linux-hisi.git
 +T:    git https://github.com/hisilicon/linux-hisi.git
  F:    arch/arm/boot/dts/hi3*
  F:    arch/arm/boot/dts/hip*
  F:    arch/arm/boot/dts/hisi*
@@@ -2311,6 -2272,8 +2311,6 @@@ F:      drivers/clocksource/timer-ixp4xx.
  F:    drivers/crypto/ixp4xx_crypto.c
  F:    drivers/gpio/gpio-ixp4xx.c
  F:    drivers/irqchip/irq-ixp4xx.c
 -F:    include/linux/irqchip/irq-ixp4xx.h
 -F:    include/linux/platform_data/timer-ixp4xx.h
  
  ARM/INTEL KEEMBAY ARCHITECTURE
  M:    Paul J. Murphy <[email protected]>
@@@ -2378,8 -2341,6 +2378,8 @@@ M:      Gregory Clement <gregory.clement@boo
  L:    [email protected] (moderated for non-subscribers)
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git
 +F:    Documentation/devicetree/bindings/arm/marvell/marvell,dove.txt
 +F:    Documentation/devicetree/bindings/arm/marvell/marvell,orion5x.txt
  F:    Documentation/devicetree/bindings/soc/dove/
  F:    arch/arm/boot/dts/dove*
  F:    arch/arm/boot/dts/orion5x*
@@@ -2396,7 -2357,6 +2396,7 @@@ M:      Sebastian Hesselbarth <sebastian.hes
  L:    [email protected] (moderated for non-subscribers)
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git
 +F:    Documentation/devicetree/bindings/arm/marvell/
  F:    arch/arm/boot/dts/armada*
  F:    arch/arm/boot/dts/kirkwood*
  F:    arch/arm/configs/mvebu_*_defconfig
@@@ -2479,7 -2439,6 +2479,7 @@@ L:      [email protected]
  S:    Supported
  T:    git git://github.com/microchip-ung/linux-upstream.git
  F:    arch/arm64/boot/dts/microchip/
 +F:    drivers/net/ethernet/microchip/vcap/
  F:    drivers/pinctrl/pinctrl-microchip-sgpio.c
  N:    sparx5
  
@@@ -2661,7 -2620,7 +2661,7 @@@ W:      http://www.armlinux.org.uk
  ARM/QUALCOMM SUPPORT
  M:    Andy Gross <[email protected]>
  M:    Bjorn Andersson <[email protected]>
 -R:    Konrad Dybcio <konrad.dybcio@somainline.org>
 +R:    Konrad Dybcio <konrad.dybcio@linaro.org>
  L:    [email protected]
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git
@@@ -2732,7 -2691,7 +2732,7 @@@ F:      arch/arm/boot/dts/rtd
  F:    arch/arm/mach-realtek/
  F:    arch/arm64/boot/dts/realtek/
  
 -ARM/RENESAS ARCHITECTURE
 +ARM/RISC-V/RENESAS ARCHITECTURE
  M:    Geert Uytterhoeven <[email protected]>
  M:    Magnus Damm <[email protected]>
  L:    [email protected]
@@@ -2740,6 -2699,7 +2740,6 @@@ S:      Supporte
  Q:    http://patchwork.kernel.org/project/linux-renesas-soc/list/
  C:    irc://irc.libera.chat/renesas-soc
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git next
 -F:    Documentation/devicetree/bindings/arm/renesas.yaml
  F:    Documentation/devicetree/bindings/hwinfo/renesas,prr.yaml
  F:    Documentation/devicetree/bindings/soc/renesas/
  F:    arch/arm/boot/dts/emev2*
@@@ -2753,7 -2713,6 +2753,7 @@@ F:      arch/arm/configs/shmobile_defconfi
  F:    arch/arm/include/debug/renesas-scif.S
  F:    arch/arm/mach-shmobile/
  F:    arch/arm64/boot/dts/renesas/
 +F:    arch/riscv/boot/dts/renesas/
  F:    drivers/soc/renesas/
  F:    include/linux/soc/renesas/
  
@@@ -4850,7 -4809,7 +4850,7 @@@ R:      Jeff Layton <[email protected]
  L:    [email protected]
  S:    Supported
  W:    http://ceph.com/
 -T:    git git://github.com/ceph/ceph-client.git
 +T:    git https://github.com/ceph/ceph-client.git
  F:    include/linux/ceph/
  F:    include/linux/crush/
  F:    net/ceph/
@@@ -4862,7 -4821,7 +4862,7 @@@ R:      Jeff Layton <[email protected]
  L:    [email protected]
  S:    Supported
  W:    http://ceph.com/
 -T:    git git://github.com/ceph/ceph-client.git
 +T:    git https://github.com/ceph/ceph-client.git
  F:    Documentation/filesystems/ceph.rst
  F:    fs/ceph/
  
@@@ -4952,7 -4911,7 +4952,7 @@@ F:      drivers/platform/chrome
  
  CHROMEOS EC CODEC DRIVER
  M:    Cheng-Yi Chiang <[email protected]>
 -M:    Tzung-Bi Shih <tzungbi@google.com>
 +M:    Tzung-Bi Shih <tzungbi@kernel.org>
  R:    Guenter Roeck <[email protected]>
  L:    [email protected]
  S:    Maintained
@@@ -4982,12 -4941,6 +4982,12 @@@ S:    Maintaine
  F:    drivers/platform/chrome/cros_usbpd_notify.c
  F:    include/linux/platform_data/cros_usbpd_notify.h
  
 +CHROMEOS HPS DRIVER
 +M:    Dan Callaghan <[email protected]>
 +R:    Sami Kyöstilä <[email protected]>
 +S:    Maintained
 +F:    drivers/platform/chrome/cros_hps_i2c.c
 +
  CHRONTEL CH7322 CEC DRIVER
  M:    Joe Tessler <[email protected]>
  L:    [email protected]
@@@ -5346,7 -5299,7 +5346,7 @@@ M:      Johannes Weiner <[email protected]
  M:    Michal Hocko <[email protected]>
  M:    Roman Gushchin <[email protected]>
  M:    Shakeel Butt <[email protected]>
 -R:    Muchun Song <[email protected]>
 +R:    Muchun Song <[email protected]>
  L:    [email protected]
  L:    [email protected]
  S:    Maintained
@@@ -5549,6 -5502,14 +5549,6 @@@ M:     Jaya Kumar <[email protected]
  S:    Maintained
  F:    sound/pci/cs5535audio/
  
 -CSI DRIVERS FOR ALLWINNER V3s
 -M:    Yong Deng <[email protected]>
 -L:    [email protected]
 -S:    Maintained
 -T:    git git://linuxtv.org/media_tree.git
 -F:    Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
 -F:    drivers/media/platform/sunxi/sun6i-csi/
 -
  CTU CAN FD DRIVER
  M:    Pavel Pisa <[email protected]>
  M:    Ondrej Ille <[email protected]>
@@@ -5624,6 -5585,8 +5624,6 @@@ F:      drivers/scsi/cxgbi/cxgb3
  
  CXGB4 CRYPTO DRIVER (chcr)
  M:    Ayush Sawal <[email protected]>
 -M:    Vinay Kumar Yadav <[email protected]>
 -M:    Rohit Maheshwari <[email protected]>
  L:    [email protected]
  S:    Supported
  W:    http://www.chelsio.com
@@@ -5631,6 -5594,8 +5631,6 @@@ F:      drivers/crypto/chelsi
  
  CXGB4 INLINE CRYPTO DRIVER
  M:    Ayush Sawal <[email protected]>
 -M:    Vinay Kumar Yadav <[email protected]>
 -M:    Rohit Maheshwari <[email protected]>
  L:    [email protected]
  S:    Supported
  W:    http://www.chelsio.com
  S:    Maintained
  F:    drivers/platform/x86/dell/dell-wmi-descriptor.c
  
 +DELL WMI DDV DRIVER
 +M:    Armin Wolf <[email protected]>
 +S:    Maintained
 +F:    Documentation/ABI/testing/debugfs-dell-wmi-ddv
 +F:    Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv
 +F:    drivers/platform/x86/dell/dell-wmi-ddv.c
 +
  DELL WMI SYSMAN DRIVER
  M:    Divya Bharathi <[email protected]>
  M:    Prasanth Ksr <[email protected]>
@@@ -6075,12 -6033,11 +6075,12 @@@ F:   include/net/devlink.
  F:    include/uapi/linux/devlink.h
  F:    net/core/devlink.c
  
 -DH ELECTRONICS IMX6 DHCOM BOARD SUPPORT
 +DH ELECTRONICS IMX6 DHCOM/DHCOR BOARD SUPPORT
  M:    Christoph Niedermaier <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    arch/arm/boot/dts/imx6*-dhcom-*
 +F:    arch/arm/boot/dts/imx6*-dhcor-*
  
  DH ELECTRONICS STM32MP1 DHCOM/DHCOR BOARD SUPPORT
  M:    Marek Vasut <[email protected]>
@@@ -6372,7 -6329,6 +6372,7 @@@ F:      drivers/net/ethernet/freescale/dpaa2
  F:    drivers/net/ethernet/freescale/dpaa2/Makefile
  F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-eth*
  F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-mac*
 +F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk*
  F:    drivers/net/ethernet/freescale/dpaa2/dpkg.h
  F:    drivers/net/ethernet/freescale/dpaa2/dpmac*
  F:    drivers/net/ethernet/freescale/dpaa2/dpni*
@@@ -6550,12 -6506,6 +6550,12 @@@ S:    Orphan / Obsolet
  F:    drivers/gpu/drm/i810/
  F:    include/uapi/drm/i810_drm.h
  
 +DRM DRIVER FOR JADARD JD9365DA-H3 MIPI-DSI LCD PANELS
 +M:    Jagan Teki <[email protected]>
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/display/panel/jadard,jd9365da-h3.yaml
 +F:    drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c
 +
  DRM DRIVER FOR LOGICVC DISPLAY CONTROLLER
  M:    Paul Kocialkowski <[email protected]>
  S:    Supported
  S:    Maintained
  T:    git git://anongit.freedesktop.org/drm/drm-misc
  F:    drivers/gpu/drm/drm_aperture.c
 +F:    drivers/gpu/drm/tiny/ofdrm.c
  F:    drivers/gpu/drm/tiny/simpledrm.c
  F:    drivers/video/aperture.c
 +F:    drivers/video/nomodeset.c
  F:    include/drm/drm_aperture.h
  F:    include/linux/aperture.h
 +F:    include/video/nomodeset.h
  
  DRM DRIVER FOR SIS VIDEO CARDS
  S:    Orphan / Obsolete
@@@ -6880,15 -6827,6 +6880,15 @@@ F:    include/drm/drm
  F:    include/linux/vga*
  F:    include/uapi/drm/drm*
  
 +DRM COMPUTE ACCELERATORS DRIVERS AND FRAMEWORK
 +M:    Oded Gabbay <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +C:    irc://irc.oftc.net/dri-devel
 +T:    git https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/accel.git
 +F:    Documentation/accel/
 +F:    drivers/accel/
 +
  DRM DRIVERS FOR ALLWINNER A10
  M:    Maxime Ripard <[email protected]>
  M:    Chen-Yu Tsai <[email protected]>
@@@ -7177,7 -7115,7 +7177,7 @@@ F:      drivers/gpu/drm/ttm
  F:    include/drm/ttm/
  
  DRM GPU SCHEDULER
 -M:    Andrey Grodzovsky <andrey.grodzovsky@amd.com>
 +M:    Luben Tuikov <luben.tuikov@amd.com>
  L:    [email protected]
  S:    Maintained
  T:    git git://anongit.freedesktop.org/drm/drm-misc
@@@ -7425,9 -7363,9 +7425,9 @@@ F:      drivers/edac/thunderx_edac
  
  EDAC-CORE
  M:    Borislav Petkov <[email protected]>
 -M:    Mauro Carvalho Chehab <[email protected]>
  M:    Tony Luck <[email protected]>
  R:    James Morse <[email protected]>
 +R:    Mauro Carvalho Chehab <[email protected]>
  R:    Robert Richter <[email protected]>
  L:    [email protected]
  S:    Supported
@@@ -7544,7 -7482,8 +7544,7 @@@ S:      Maintaine
  F:    drivers/edac/pnd2_edac.[ch]
  
  EDAC-QCOM
 -M:    Channagoud Kadabi <[email protected]>
 -M:    Venkata Narendra Kumar Gutta <[email protected]>
 +M:    Manivannan Sadhasivam <[email protected]>
  L:    [email protected]
  L:    [email protected]
  S:    Maintained
@@@ -7745,7 -7684,6 +7745,7 @@@ ETAS ES58X CAN/USB DRIVE
  M:    Vincent Mailhol <[email protected]>
  L:    [email protected]
  S:    Maintained
 +F:    Documentation/networking/devlink/etas_es58x.rst
  F:    drivers/net/can/usb/etas_es58x/
  
  ETHERNET BRIDGE
@@@ -7851,6 -7789,7 +7851,6 @@@ F:      Documentation/admin-guide/efi-stub.r
  F:    arch/*/include/asm/efi.h
  F:    arch/*/kernel/efi.c
  F:    arch/arm/boot/compressed/efi-header.S
 -F:    arch/arm64/kernel/efi-entry.S
  F:    arch/x86/platform/efi/
  F:    drivers/firmware/efi/
  F:    include/linux/efi*.h
@@@ -7896,7 -7835,6 +7896,7 @@@ M:      Chao Yu <[email protected]
  L:    [email protected]
  S:    Maintained
  W:    https://f2fs.wiki.kernel.org/
 +B:    https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git
  F:    Documentation/ABI/testing/sysfs-fs-f2fs
  F:    Documentation/filesystems/f2fs.rst
@@@ -8113,8 -8051,6 +8113,8 @@@ S:      Supporte
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
  F:    include/linux/fortify-string.h
  F:    lib/fortify_kunit.c
 +F:    lib/memcpy_kunit.c
 +F:    lib/strscpy_kunit.c
  F:    lib/test_fortify/*
  F:    scripts/test_fortify.sh
  K:    \b__NO_FORTIFY\b
@@@ -8251,10 -8187,7 +8251,10 @@@ S:    Maintaine
  F:    drivers/i2c/busses/i2c-cpm.c
  
  FREESCALE IMX / MXC FEC DRIVER
 -M:    Joakim Zhang <[email protected]>
 +M:    Wei Fang <[email protected]>
 +R:    Shenwei Wang <[email protected]>
 +R:    Clark Wang <[email protected]>
 +R:    NXP Linux Team <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    Documentation/devicetree/bindings/net/fsl,fec.yaml
@@@ -8669,8 -8602,8 +8669,8 @@@ F:      include/asm-generic
  F:    include/uapi/asm-generic/
  
  GENERIC PHY FRAMEWORK
 -M:    Kishon Vijay Abraham I <[email protected]>
  M:    Vinod Koul <[email protected]>
 +M:    Kishon Vijay Abraham I <[email protected]>
  L:    [email protected]
  S:    Supported
  Q:    https://patchwork.kernel.org/project/linux-phy/list/
@@@ -8813,7 -8746,6 +8813,7 @@@ GPIO IR Transmitte
  M:    Sean Young <[email protected]>
  L:    [email protected]
  S:    Maintained
 +F:    Documentation/devicetree/bindings/leds/irled/gpio-ir-tx.yaml
  F:    drivers/media/rc/gpio-ir-tx.c
  
  GPIO MOCKUP DRIVER
@@@ -9237,13 -9169,6 +9237,13 @@@ W:    http://www.highpoint-tech.co
  F:    Documentation/scsi/hptiop.rst
  F:    drivers/scsi/hptiop.c
  
 +HIMAX HX83112B TOUCHSCREEN SUPPORT
 +M:    Job Noorman <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/input/touchscreen/himax,hx83112b.yaml
 +F:    drivers/input/touchscreen/himax_hx83112b.c
 +
  HIPPI
  M:    Jes Sorensen <[email protected]>
  L:    [email protected]
@@@ -9273,7 -9198,6 +9273,7 @@@ HISILICON GPIO DRIVE
  M:    Jay Fang <[email protected]>
  L:    [email protected]
  S:    Maintained
 +F:    Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml
  F:    drivers/gpio/gpio-hisi.c
  
  HISILICON HIGH PERFORMANCE RSA ENGINE DRIVER (HPRE)
@@@ -9324,7 -9248,7 +9324,7 @@@ F:      drivers/misc/hisi_hikey_usb.
  
  HISILICON PMU DRIVER
  M:    Shaokun Zhang <[email protected]>
 -M:    Qi Liu <liuqi115@huawei.com>
 +M:    Jonathan Cameron <jonathan.cameron@huawei.com>
  S:    Supported
  W:    http://www.hisilicon.com
  F:    Documentation/admin-guide/perf/hisi-pcie-pmu.rst
@@@ -9373,7 -9297,7 +9373,7 @@@ F:      Documentation/devicetree/bindings/in
  F:    drivers/infiniband/hw/hns/
  
  HISILICON SAS Controller
 -M:    John Garry <john.garry@huawei.com>
 +M:    Xiang Chen <chenxiang66@hisilicon.com>
  S:    Supported
  W:    http://www.hisilicon.com
  F:    Documentation/devicetree/bindings/scsi/hisilicon-sas.txt
@@@ -9417,7 -9341,7 +9417,7 @@@ S:      Maintaine
  F:    drivers/crypto/hisilicon/trng/trng.c
  
  HISILICON V3XX SPI NOR FLASH Controller Driver
 -M:    John Garry <john.garry@huawei.com>
 +M:    Jay Fang <f.fangjian@huawei.com>
  S:    Maintained
  W:    http://www.hisilicon.com
  F:    drivers/spi/spi-hisi-sfc-v3xx.c
@@@ -9442,7 -9366,7 +9442,7 @@@ F:      drivers/net/wireless/intersil/hostap
  HP COMPAQ TC1100 TABLET WMI EXTRAS DRIVER
  L:    [email protected]
  S:    Orphan
 -F:    drivers/platform/x86/tc1100-wmi.c
 +F:    drivers/platform/x86/hp/tc1100-wmi.c
  
  HPET: High Precision Event Timers driver
  M:    Clemens Ladisch <[email protected]>
@@@ -9512,15 -9436,14 +9512,15 @@@ F:   Documentation/devicetree/bindings/ii
  F:    drivers/iio/humidity/hts221*
  
  HUAWEI ETHERNET DRIVER
 +M:    Cai Huoqing <[email protected]>
  L:    [email protected]
 -S:    Orphan
 +S:    Maintained
  F:    Documentation/networking/device_drivers/ethernet/huawei/hinic.rst
  F:    drivers/net/ethernet/huawei/hinic/
  
  HUGETLB SUBSYSTEM
  M:    Mike Kravetz <[email protected]>
 -M:    Muchun Song <[email protected]>
 +M:    Muchun Song <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    Documentation/ABI/testing/sysfs-kernel-mm-hugepages
@@@ -9584,6 -9507,7 +9584,6 @@@ F:      drivers/media/i2c/hi847.
  Hyper-V/Azure CORE AND DRIVERS
  M:    "K. Y. Srinivasan" <[email protected]>
  M:    Haiyang Zhang <[email protected]>
 -M:    Stephen Hemminger <[email protected]>
  M:    Wei Liu <[email protected]>
  M:    Dexuan Cui <[email protected]>
  L:    [email protected]
@@@ -9617,7 -9541,6 +9617,7 @@@ F:      include/asm-generic/hyperv-tlfs.
  F:    include/asm-generic/mshyperv.h
  F:    include/clocksource/hyperv_timer.h
  F:    include/linux/hyperv.h
 +F:    include/net/mana
  F:    include/uapi/linux/hyperv.h
  F:    net/vmw_vsock/hyperv_transport.c
  F:    tools/hv/
@@@ -9781,7 -9704,8 +9781,7 @@@ F:      Documentation/devicetree/bindings/i3
  F:    drivers/i3c/master/i3c-master-cdns.c
  
  I3C DRIVER FOR SYNOPSYS DESIGNWARE
 -M:    Vitor Soares <[email protected]>
 -S:    Maintained
 +S:    Orphan
  F:    Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml
  F:    drivers/i3c/master/dw*
  
@@@ -10103,11 -10027,6 +10103,11 @@@ F: Documentation/hwmon/ina2xx.rs
  F:    drivers/hwmon/ina2xx.c
  F:    include/linux/platform_data/ina2xx.h
  
 +INDEX OF FURTHER KERNEL DOCUMENTATION
 +M:    Carlos Bilbao <[email protected]>
 +S:    Maintained
 +F:    Documentation/process/kernel-docs.rst
 +
  INDUSTRY PACK SUBSYSTEM (IPACK)
  M:    Samuel Iglesias Gonsalvez <[email protected]>
  M:    Jens Taprogge <[email protected]>
@@@ -10137,7 -10056,6 +10137,7 @@@ F:   drivers/infiniband
  F:    include/rdma/
  F:    include/trace/events/ib_mad.h
  F:    include/trace/events/ib_umad.h
 +F:    include/trace/misc/rdma.h
  F:    include/uapi/linux/if_infiniband.h
  F:    include/uapi/rdma/
  F:    samples/bpf/ibumad_kern.c
@@@ -10311,7 -10229,6 +10311,7 @@@ Q:   http://patchwork.freedesktop.org/pro
  B:    https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs
  C:    irc://irc.oftc.net/intel-gfx
  T:    git git://anongit.freedesktop.org/drm-intel
 +F:    Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon
  F:    Documentation/gpu/i915.rst
  F:    drivers/gpu/drm/i915/
  F:    include/drm/i915*
@@@ -10371,7 -10288,7 +10371,7 @@@ T:   git https://github.com/intel/gvt-lin
  F:    drivers/gpu/drm/i915/gvt/
  
  INTEL HID EVENT DRIVER
 -M:    Alex Hung <alex.hung@canonical.com>
 +M:    Alex Hung <alexhung@gmail.com>
  L:    [email protected]
  S:    Maintained
  F:    drivers/platform/x86/intel/hid.c
@@@ -10803,18 -10720,6 +10803,18 @@@ F: drivers/iommu/dma-iommu.
  F:    drivers/iommu/iova.c
  F:    include/linux/iova.h
  
 +IOMMUFD
 +M:    Jason Gunthorpe <[email protected]>
 +M:    Kevin Tian <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git
 +F:    Documentation/userspace-api/iommufd.rst
 +F:    drivers/iommu/iommufd/
 +F:    include/linux/iommufd.h
 +F:    include/uapi/linux/iommufd.h
 +F:    tools/testing/selftests/iommu/
 +
  IOMMU SUBSYSTEM
  M:    Joerg Roedel <[email protected]>
  M:    Will Deacon <[email protected]>
@@@ -10994,13 -10899,6 +10994,13 @@@ F: drivers/isdn/Makefil
  F:    drivers/isdn/hardware/
  F:    drivers/isdn/mISDN/
  
 +ISOFS FILESYSTEM
 +M:    Jan Kara <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    Documentation/filesystems/isofs.rst
 +F:    fs/isofs/
 +
  IT87 HARDWARE MONITORING DRIVER
  M:    Jean Delvare <[email protected]>
  L:    [email protected]
@@@ -11062,9 -10960,9 +11062,9 @@@ F:   drivers/hwmon/jc42.
  JFS FILESYSTEM
  M:    Dave Kleikamp <[email protected]>
  L:    [email protected]
 -S:    Maintained
 +S:    Odd Fixes
  W:    http://jfs.sourceforge.net/
 -T:    git git://github.com/kleikamp/linux-shaggy.git
 +T:    git https://github.com/kleikamp/linux-shaggy.git
  F:    Documentation/admin-guide/jfs.rst
  F:    fs/jfs/
  
@@@ -11138,7 -11036,6 +11138,7 @@@ KCONFI
  M:    Masahiro Yamada <[email protected]>
  L:    [email protected]
  S:    Maintained
 +Q:    https://patchwork.kernel.org/project/linux-kbuild/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kconfig
  F:    Documentation/kbuild/kconfig*
  F:    scripts/Kconfig.include
@@@ -11196,12 -11093,10 +11196,12 @@@ F:        fs/autofs
  
  KERNEL BUILD + files below scripts/ (unless maintained elsewhere)
  M:    Masahiro Yamada <[email protected]>
 -M:    Michal Marek <[email protected]>
 +R:    Nathan Chancellor <[email protected]>
  R:    Nick Desaulniers <[email protected]>
 +R:    Nicolas Schier <[email protected]>
  L:    [email protected]
  S:    Maintained
 +Q:    https://patchwork.kernel.org/project/linux-kbuild/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git
  F:    Documentation/kbuild/
  F:    Makefile
@@@ -11219,8 -11114,6 +11219,8 @@@ M:   Kees Cook <[email protected]
  L:    [email protected]
  S:    Supported
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
 +F:    Documentation/ABI/testing/sysfs-kernel-oops_count
 +F:    Documentation/ABI/testing/sysfs-kernel-warn_count
  F:    include/linux/overflow.h
  F:    include/linux/randomize_kstack.h
  F:    mm/usercopy.c
@@@ -11239,18 -11132,11 +11239,18 @@@ L:        [email protected]
  S:    Supported
  W:    http://nfs.sourceforge.net/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git
 +F:    fs/exportfs/
  F:    fs/lockd/
  F:    fs/nfs_common/
  F:    fs/nfsd/
  F:    include/linux/lockd/
  F:    include/linux/sunrpc/
 +F:    include/trace/events/rpcgss.h
 +F:    include/trace/events/rpcrdma.h
 +F:    include/trace/events/sunrpc.h
 +F:    include/trace/misc/fs.h
 +F:    include/trace/misc/nfs.h
 +F:    include/trace/misc/sunrpc.h
  F:    include/uapi/linux/nfsd/
  F:    include/uapi/linux/sunrpc/
  F:    net/sunrpc/
@@@ -11438,6 -11324,16 +11438,16 @@@ F: arch/x86/kvm/svm/hyperv.
  F:    arch/x86/kvm/svm/svm_onhyperv.*
  F:    arch/x86/kvm/vmx/evmcs.*
  
+ KVM X86 Xen (KVM/Xen)
+ M:    David Woodhouse <[email protected]>
+ M:    Paul Durrant <[email protected]>
+ M:    Sean Christopherson <[email protected]>
+ M:    Paolo Bonzini <[email protected]>
+ L:    [email protected]
+ S:    Supported
+ T:    git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
+ F:    arch/x86/kvm/xen.*
+
  KERNFS
  M:    Greg Kroah-Hartman <[email protected]>
  M:    Tejun Heo <[email protected]>
@@@ -11972,7 -11868,7 +11982,7 @@@ M:   Eric Piel <[email protected]
  S:    Maintained
  F:    Documentation/misc-devices/lis3lv02d.rst
  F:    drivers/misc/lis3lv02d/
 -F:    drivers/platform/x86/hp_accel.c
 +F:    drivers/platform/x86/hp/hp_accel.c
  
  LIST KUNIT TEST
  M:    David Gow <[email protected]>
@@@ -12127,21 -12023,6 +12137,21 @@@ F: drivers/*/*loongarch
  F:    Documentation/loongarch/
  F:    Documentation/translations/zh_CN/loongarch/
  
 +LOONGSON-2 SOC SERIES GUTS DRIVER
 +M:    Yinbo Zhu <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/hwinfo/loongson,ls2k-chipid.yaml
 +F:    drivers/soc/loongson/loongson2_guts.c
 +
 +LOONGSON-2 SOC SERIES PINCTRL DRIVER
 +M:    zhanghongchen <[email protected]>
 +M:    Yinbo Zhu <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/pinctrl/loongson,ls2k-pinctrl.yaml
 +F:    drivers/pinctrl/pinctrl-loongson2.c
 +
  LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
  M:    Sathya Prakash <[email protected]>
  M:    Sreekanth Reddy <[email protected]>
@@@ -12219,7 -12100,7 +12229,7 @@@ M:   Alexey Kodanev <alexey.kodanev@oracl
  L:    [email protected] (subscribers-only)
  S:    Maintained
  W:    http://linux-test-project.github.io/
 -T:    git git://github.com/linux-test-project/ltp.git
 +T:    git https://github.com/linux-test-project/ltp.git
  
  LYNX 28G SERDES PHY DRIVER
  M:    Ioana Ciornei <[email protected]>
@@@ -12355,6 -12236,7 +12365,6 @@@ F:   arch/mips/boot/dts/img/pistachio
  
  MARVELL 88E6XXX ETHERNET SWITCH FABRIC DRIVER
  M:    Andrew Lunn <[email protected]>
 -M:    Vivien Didelot <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    Documentation/devicetree/bindings/net/dsa/marvell.txt
@@@ -12444,7 -12326,7 +12454,7 @@@ M:   Marcin Wojtas <[email protected]
  M:    Russell King <[email protected]>
  L:    [email protected]
  S:    Maintained
 -F:    Documentation/devicetree/bindings/net/marvell-pp2.txt
 +F:    Documentation/devicetree/bindings/net/marvell,pp2.yaml
  F:    drivers/net/ethernet/marvell/mvpp2/
  
  MARVELL MWIFIEX WIRELESS DRIVER
@@@ -12492,7 -12374,7 +12502,7 @@@ F:   Documentation/networking/device_driv
  F:    drivers/net/ethernet/marvell/octeontx2/af/
  
  MARVELL PRESTERA ETHERNET SWITCH DRIVER
 -M:    Taras Chornyi <t[email protected]>
 +M:    Taras Chornyi <t[email protected]>
  S:    Supported
  W:    https://github.com/Marvell-switching/switchdev-prestera
  F:    drivers/net/ethernet/marvell/prestera/
@@@ -12854,7 -12736,7 +12864,7 @@@ F:   Documentation/admin-guide/media/imx7
  F:    Documentation/devicetree/bindings/media/nxp,imx-mipi-csi2.yaml
  F:    Documentation/devicetree/bindings/media/nxp,imx7-csi.yaml
  F:    drivers/media/platform/nxp/imx-mipi-csis.c
 -F:    drivers/staging/media/imx/imx7-media-csi.c
 +F:    drivers/media/platform/nxp/imx7-media-csi.c
  
  MEDIA DRIVERS FOR HELENE
  M:    Abylay Ospan <[email protected]>
@@@ -13051,7 -12933,6 +13061,7 @@@ M:   Felix Fietkau <[email protected]
  M:    John Crispin <[email protected]>
  M:    Sean Wang <[email protected]>
  M:    Mark Lee <[email protected]>
 +M:    Lorenzo Bianconi <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    drivers/net/ethernet/mediatek/
@@@ -13423,20 -13304,10 +13433,20 @@@ F:        include/linux/memory_hotplug.
  F:    include/linux/mm.h
  F:    include/linux/mmzone.h
  F:    include/linux/pagewalk.h
 -F:    include/linux/vmalloc.h
  F:    mm/
  F:    tools/testing/selftests/vm/
  
 +VMALLOC
 +M:    Andrew Morton <[email protected]>
 +R:    Uladzislau Rezki <[email protected]>
 +R:    Christoph Hellwig <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +W:    http://www.linux-mm.org
 +T:    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
 +F:    include/linux/vmalloc.h
 +F:    mm/vmalloc.c
 +
  MEMORY HOT(UN)PLUG
  M:    David Hildenbrand <[email protected]>
  M:    Oscar Salvador <[email protected]>
@@@ -13524,7 -13395,7 +13534,7 @@@ MESON NAND CONTROLLER DRIVER FOR AMLOGI
  M:    Liang Yang <[email protected]>
  L:    [email protected]
  S:    Maintained
 -F:    Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt
 +F:    Documentation/devicetree/bindings/mtd/amlogic,meson-nand.yaml
  F:    drivers/mtd/nand/raw/meson_*
  
  MESON VIDEO DECODER DRIVER FOR AMLOGIC SOCS
@@@ -13605,7 -13476,7 +13615,7 @@@ M:   Eugen Hristev <eugen.hristev@microch
  L:    [email protected]
  S:    Supported
  F:    Documentation/devicetree/bindings/media/microchip,csi2dc.yaml
 -F:    drivers/media/platform/atmel/microchip-csi2dc.c
 +F:    drivers/media/platform/microchip/microchip-csi2dc.c
  
  MICROCHIP ECC DRIVER
  M:    Tudor Ambarus <[email protected]>
  S:    Supported
  F:    Documentation/devicetree/bindings/media/atmel,isc.yaml
  F:    Documentation/devicetree/bindings/media/microchip,xisc.yaml
 -F:    drivers/media/platform/atmel/atmel-isc*
 -F:    drivers/media/platform/atmel/atmel-sama*-isc*
 +F:    drivers/staging/media/deprecated/atmel/atmel-isc*
 +F:    drivers/staging/media/deprecated/atmel/atmel-sama*-isc*
 +F:    drivers/media/platform/microchip/microchip-isc*
 +F:    drivers/media/platform/microchip/microchip-sama*-isc*
  F:    include/linux/atmel-isc-media.h
  
  MICROCHIP ISI DRIVER
@@@ -13768,12 -13637,6 +13778,12 @@@ S: Supporte
  F:    drivers/misc/atmel-ssc.c
  F:    include/linux/atmel-ssc.h
  
 +MICROCHIP SOC DRIVERS
 +M:    Conor Dooley <[email protected]>
 +S:    Supported
 +T:    git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/
 +F:    drivers/soc/microchip/
 +
  MICROCHIP USB251XB DRIVER
  M:    Richard Leitner <[email protected]>
  L:    [email protected]
@@@ -13818,15 -13681,6 +13828,15 @@@ F: drivers/scsi/smartpqi/smartpqi*.[ch
  F:    include/linux/cciss*.h
  F:    include/uapi/linux/cciss*.h
  
 +MICROSOFT MANA RDMA DRIVER
 +M:    Long Li <[email protected]>
 +M:    Ajay Sharma <[email protected]>
 +L:    [email protected]
 +S:    Supported
 +F:    drivers/infiniband/hw/mana/
 +F:    include/net/mana
 +F:    include/uapi/rdma/mana-abi.h
 +
  MICROSOFT SURFACE AGGREGATOR TABLET-MODE SWITCH
  M:    Maximilian Luz <[email protected]>
  L:    [email protected]
@@@ -14102,7 -13956,6 +14112,7 @@@ F:   include/uapi/linux/meye.
  
  MOTORCOMM PHY DRIVER
  M:    Peter Geis <[email protected]>
 +M:    Frank <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    drivers/net/phy/motorcomm.c
@@@ -14481,6 -14334,7 +14491,6 @@@ F:   drivers/net/wireless
  
  NETWORKING [DSA]
  M:    Andrew Lunn <[email protected]>
 -M:    Vivien Didelot <[email protected]>
  M:    Florian Fainelli <[email protected]>
  M:    Vladimir Oltean <[email protected]>
  S:    Maintained
@@@ -15337,13 -15191,6 +15347,13 @@@ S: Maintaine
  T:    git git://linuxtv.org/media_tree.git
  F:    drivers/media/i2c/ov08d10.c
  
 +OMNIVISION OV08X40 SENSOR DRIVER
 +M:    Jason Chen <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git git://linuxtv.org/media_tree.git
 +F:    drivers/media/i2c/ov08x40.c
 +
  OMNIVISION OV13858 SENSOR DRIVER
  M:    Sakari Ailus <[email protected]>
  L:    [email protected]
@@@ -15382,14 -15229,6 +15392,14 @@@ S: Maintaine
  T:    git git://linuxtv.org/media_tree.git
  F:    drivers/media/i2c/ov2740.c
  
 +OMNIVISION OV4689 SENSOR DRIVER
 +M:    Mikhail Rudenko <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git git://linuxtv.org/media_tree.git
 +F:    Documentation/devicetree/bindings/media/i2c/ovti,ov4689.yaml
 +F:    drivers/media/i2c/ov4689.c
 +
  OMNIVISION OV5640 SENSOR DRIVER
  M:    Steve Longerbeam <[email protected]>
  L:    [email protected]
@@@ -15514,12 -15353,6 +15524,12 @@@ S: Maintaine
  F:    drivers/mtd/nand/onenand/
  F:    include/linux/mtd/onenand*.h
  
 +ONEXPLAYER FAN DRIVER
 +M:    Joaquín Ignacio Aramendía <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    drivers/hwmon/oxp-sensors.c
 +
  ONION OMEGA2+ BOARD
  M:    Harvey Hunt <[email protected]>
  L:    [email protected]
@@@ -16123,7 -15956,6 +16133,7 @@@ Q:   https://patchwork.kernel.org/project
  B:    https://bugzilla.kernel.org
  C:    irc://irc.oftc.net/linux-pci
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git
 +F:    Documentation/devicetree/bindings/pci/
  F:    drivers/pci/controller/
  F:    drivers/pci/pci-bridge-emul.c
  F:    drivers/pci/pci-bridge-emul.h
@@@ -16230,7 -16062,7 +16240,7 @@@ F:   Documentation/devicetree/bindings/pc
  F:    drivers/pci/controller/*microchip*
  
  PCIE DRIVER FOR QUALCOMM MSM
 -M:    Stanimir Varbanov <[email protected]>
 +M:    Manivannan Sadhasivam <[email protected]>
  L:    [email protected]
  L:    [email protected]
  S:    Maintained
@@@ -16320,8 -16152,7 +16330,8 @@@ F:   include/linux/peci-cpu.
  F:    include/linux/peci.h
  
  PENSANDO ETHERNET DRIVERS
 -M:    Shannon Nelson <[email protected]>
 +M:    Shannon Nelson <[email protected]>
 +M:    Brett Creeley <[email protected]>
  M:    [email protected]
  L:    [email protected]
  S:    Supported
@@@ -16479,7 -16310,7 +16489,7 @@@ M:   Sean Wang <[email protected]
  L:    [email protected] (moderated for non-subscribers)
  S:    Maintained
  F:    Documentation/devicetree/bindings/pinctrl/mediatek,mt65xx-pinctrl.yaml
 -F:    Documentation/devicetree/bindings/pinctrl/mediatek,mt6797-pinctrl.yaml
 +F:    Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml
  F:    Documentation/devicetree/bindings/pinctrl/mediatek,mt7622-pinctrl.yaml
  F:    Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml
  F:    drivers/pinctrl/mediatek/
@@@ -16552,6 -16383,13 +16562,6 @@@ S:  Supporte
  F:    Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml
  F:    drivers/input/keyboard/pinephone-keyboard.c
  
 -PKTCDVD DRIVER
 -M:    [email protected]
 -S:    Orphan
 -F:    drivers/block/pktcdvd.c
 -F:    include/linux/pktcdvd.h
 -F:    include/uapi/linux/pktcdvd.h
 -
  PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
  M:    Tomasz Duszynski <[email protected]>
  S:    Maintained
@@@ -16829,10 -16667,10 +16839,10 @@@ F:        net/psampl
  
  PSTORE FILESYSTEM
  M:    Kees Cook <[email protected]>
 -M:    Anton Vorontsov <[email protected]>
 -M:    Colin Cross <ccross@android.com>
 -M:    Tony Luck <[email protected]>
 -S:    Maintained
 +R:    Tony Luck <[email protected]>
 +R:    Guilherme G. Piccoli <gpiccoli@igalia.com>
 +L:    [email protected]
 +S:    Supported
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/pstore
  F:    Documentation/admin-guide/ramoops.rst
  F:    Documentation/admin-guide/pstore-blk.rst
@@@ -16879,6 -16717,7 +16889,6 @@@ M:   Hans Verkuil <[email protected]
  L:    [email protected]
  S:    Maintained
  T:    git git://linuxtv.org/media_tree.git
 -F:    Documentation/admin-guide/media/pulse8-cec.rst
  F:    drivers/media/cec/usb/pulse8/
  
  PURELIFI PLFXLC DRIVER
@@@ -16909,7 -16748,6 +16919,7 @@@ PWM IR Transmitte
  M:    Sean Young <[email protected]>
  L:    [email protected]
  S:    Maintained
 +F:    Documentation/devicetree/bindings/leds/irled/pwm-ir-tx.yaml
  F:    drivers/media/rc/pwm-ir-tx.c
  
  PWM SUBSYSTEM
@@@ -16974,7 -16812,7 +16984,7 @@@ M:   Srinivas Kandagatla <srinivas.kandag
  M:    Banajit Goswami <[email protected]>
  L:    [email protected] (moderated for non-subscribers)
  S:    Supported
 -F:    Documentation/devicetree/bindings/soc/qcom/qcom,apr.yaml
 +F:    Documentation/devicetree/bindings/soc/qcom/qcom,apr*
  F:    Documentation/devicetree/bindings/sound/qcom,*
  F:    drivers/soc/qcom/apr.c
  F:    include/dt-bindings/sound/qcom,wcd9335.h
@@@ -17332,8 -17170,7 +17342,8 @@@ F:   Documentation/devicetree/bindings/th
  F:    drivers/thermal/qcom/
  
  QUALCOMM VENUS VIDEO ACCELERATOR DRIVER
 -M:    Stanimir Varbanov <[email protected]>
 +M:    Stanimir Varbanov <[email protected]>
 +M:    Vikash Garodia <[email protected]>
  L:    [email protected]
  L:    [email protected]
  S:    Maintained
@@@ -17398,7 -17235,7 +17408,7 @@@ R:   Dongsheng Yang <dongsheng.yang@easys
  L:    [email protected]
  S:    Supported
  W:    http://ceph.com/
 -T:    git git://github.com/ceph/ceph-client.git
 +T:    git https://github.com/ceph/ceph-client.git
  F:    Documentation/ABI/testing/sysfs-bus-rbd
  F:    drivers/block/rbd.c
  F:    drivers/block/rbd_types.h
@@@ -17651,8 -17488,10 +17661,8 @@@ S:  Maintaine
  F:    drivers/net/wireless/realtek/rtw89/
  
  REDPINE WIRELESS DRIVER
 -M:    Amitkumar Karwar <[email protected]>
 -M:    Siva Rebbagondla <[email protected]>
  L:    [email protected]
 -S:    Maintained
 +S:    Orphan
  F:    drivers/net/wireless/rsi/
  
  REGISTER MAP ABSTRACTION
@@@ -17897,7 -17736,7 +17907,7 @@@ F:   arch/riscv
  N:    riscv
  K:    riscv
  
 -RISC-V/MICROCHIP POLARFIRE SOC SUPPORT
 +RISC-V MICROCHIP FPGA SUPPORT
  M:    Conor Dooley <[email protected]>
  M:    Daire McNamara <[email protected]>
  L:    [email protected]
@@@ -17915,26 -17754,17 +17925,26 @@@ F:        Documentation/devicetree/bindings/us
  F:    arch/riscv/boot/dts/microchip/
  F:    drivers/char/hw_random/mpfs-rng.c
  F:    drivers/clk/microchip/clk-mpfs.c
 -F:    drivers/i2c/busses/i2c-microchip-core.c
 +F:    drivers/i2c/busses/i2c-microchip-corei2c.c
  F:    drivers/mailbox/mailbox-mpfs.c
  F:    drivers/pci/controller/pcie-microchip-host.c
  F:    drivers/reset/reset-mpfs.c
  F:    drivers/rtc/rtc-mpfs.c
 -F:    drivers/soc/microchip/
 +F:    drivers/soc/microchip/mpfs-sys-controller.c
  F:    drivers/spi/spi-microchip-core-qspi.c
  F:    drivers/spi/spi-microchip-core.c
  F:    drivers/usb/musb/mpfs.c
  F:    include/soc/microchip/mpfs.h
  
 +RISC-V MISC SOC SUPPORT
 +M:    Conor Dooley <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +Q:    https://patchwork.kernel.org/project/linux-riscv/list/
 +T:    git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/
 +F:    Documentation/devicetree/bindings/riscv/
 +F:    arch/riscv/boot/dts/
 +
  RNBD BLOCK DRIVERS
  M:    Md. Haris Iqbal <[email protected]>
  M:    Jack Wang <[email protected]>
@@@ -17950,13 -17780,6 +17960,13 @@@ F: Documentation/ABI/*/sysfs-driver-hid
  F:    drivers/hid/hid-roccat*
  F:    include/linux/hid-roccat*
  
 +ROCKCHIP CRYPTO DRIVERS
 +M:    Corentin Labbe <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml
 +F:    drivers/crypto/rockchip/
 +
  ROCKCHIP I2S TDM DRIVER
  M:    Nicolas Frattaroli <[email protected]>
  L:    [email protected]
  S:    Maintained
  F:    drivers/video/fbdev/savage/
  
 -S390
 +S390 ARCHITECTURE
  M:    Heiko Carstens <[email protected]>
  M:    Vasily Gorbik <[email protected]>
  M:    Alexander Gordeev <[email protected]>
  S:    Supported
  F:    drivers/s390/net/
  
 +S390 MM
 +M:    Alexander Gordeev <[email protected]>
 +M:    Gerald Schaefer <[email protected]>
 +L:    [email protected]
 +S:    Supported
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git
 +F:    arch/s390/include/asm/pgtable.h
 +F:    arch/s390/mm
 +
  S390 PCI SUBSYSTEM
  M:    Niklas Schnelle <[email protected]>
  M:    Gerald Schaefer <[email protected]>
@@@ -18673,7 -18487,6 +18683,7 @@@ K:   \bsecure_computin
  K:    \bTIF_SECCOMP\b
  
  SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) Broadcom BRCMSTB DRIVER
 +M:    Kamal Dasu <[email protected]>
  M:    Al Cooper <[email protected]>
  R:    Broadcom internal kernel review list <[email protected]>
  L:    [email protected]
@@@ -18684,7 -18497,6 +18694,7 @@@ SECURE DIGITAL HOST CONTROLLER INTERFAC
  M:    Adrian Hunter <[email protected]>
  L:    [email protected]
  S:    Supported
 +F:    Documentation/devicetree/bindings/mmc/sdhci-common.yaml
  F:    drivers/mmc/host/sdhci*
  
  SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) MICROCHIP DRIVER
@@@ -18979,6 -18791,7 +18989,6 @@@ M:   Palmer Dabbelt <[email protected]
  M:    Paul Walmsley <[email protected]>
  L:    [email protected]
  S:    Supported
 -T:    git https://github.com/sifive/riscv-linux.git
  N:    sifive
  K:    [^@]sifive
  
@@@ -18997,13 -18810,6 +19007,13 @@@ S: Maintaine
  F:    Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml
  F:    drivers/dma/sf-pdma/
  
 +SIFIVE SOC DRIVERS
 +M:    Conor Dooley <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/
 +F:    drivers/soc/sifive/
 +
  SILEAD TOUCHSCREEN DRIVER
  M:    Hans de Goede <[email protected]>
  L:    [email protected]
@@@ -19070,7 -18876,7 +19080,7 @@@ M:   Jason A. Donenfeld <[email protected]
  S:    Maintained
  F:    include/linux/siphash.h
  F:    lib/siphash.c
 -F:    lib/test_siphash.c
 +F:    lib/siphash_kunit.c
  
  SIS 190 ETHERNET DRIVER
  M:    Francois Romieu <[email protected]>
@@@ -19094,7 -18900,7 +19104,7 @@@ F:   drivers/video/fbdev/sis
  F:    include/video/sisfb.h
  
  SIS I2C TOUCHSCREEN DRIVER
 -M:    Mika Penttilä <mika.penttila@nextfour.com>
 +M:    Mika Penttilä <mpenttil@redhat.com>
  L:    [email protected]
  S:    Maintained
  F:    Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt
@@@ -19237,7 -19043,7 +19247,7 @@@ M:   Jassi Brar <[email protected]
  M:    Ilias Apalodimas <[email protected]>
  L:    [email protected]
  S:    Maintained
 -F:    Documentation/devicetree/bindings/net/socionext-netsec.txt
 +F:    Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml
  F:    drivers/net/ethernet/socionext/netsec.c
  
  SOCIONEXT (SNI) Synquacer SPI DRIVER
@@@ -19245,7 -19051,7 +19255,7 @@@ M:   Masahisa Kojima <masahisa.kojima@lin
  M:    Jassi Brar <[email protected]>
  L:    [email protected]
  S:    Maintained
 -F:    Documentation/devicetree/bindings/spi/spi-synquacer.txt
 +F:    Documentation/devicetree/bindings/spi/socionext,synquacer-spi.yaml
  F:    drivers/spi/spi-synquacer.c
  
  SOCIONEXT SYNQUACER I2C DRIVER
@@@ -19392,7 -19198,7 +19402,7 @@@ M:   Manivannan Sadhasivam <manivannan.sa
  L:    [email protected]
  S:    Maintained
  T:    git git://linuxtv.org/media_tree.git
 -F:    Documentation/devicetree/bindings/media/i2c/imx290.txt
 +F:    Documentation/devicetree/bindings/media/i2c/sony,imx290.yaml
  F:    drivers/media/i2c/imx290.c
  
  SONY IMX319 SENSOR DRIVER
@@@ -19541,11 -19347,6 +19551,11 @@@ W: https://linuxtv.or
  Q:    http://patchwork.linuxtv.org/project/linux-media/list/
  F:    drivers/media/dvb-frontends/sp2*
  
 +SPANISH DOCUMENTATION
 +M:    Carlos Bilbao <[email protected]>
 +S:    Maintained
 +F:    Documentation/translations/sp_SP/
 +
  SPARC + UltraSPARC (sparc/sparc64)
  M:    "David S. Miller" <[email protected]>
  L:    [email protected]
@@@ -19689,7 -19490,7 +19699,7 @@@ M:   Sylvain Petinot <sylvain.petinot@fos
  L:    [email protected]
  S:    Maintained
  T:    git git://linuxtv.org/media_tree.git
 -F:    Documentation/devicetree/bindings/media/i2c/st,st-mipid02.txt
 +F:    Documentation/devicetree/bindings/media/i2c/st,st-mipid02.yaml
  F:    drivers/media/i2c/st-mipid02.c
  
  ST STM32 I2C/SMBUS DRIVER
@@@ -19712,16 -19513,6 +19722,16 @@@ S: Maintaine
  F:    Documentation/hwmon/stpddc60.rst
  F:    drivers/hwmon/pmbus/stpddc60.c
  
 +ST VGXY61 DRIVER
 +M:    Benjamin Mugnier <[email protected]>
 +M:    Sylvain Petinot <[email protected]>
 +L:    [email protected]
 +S:    Maintained
 +T:    git git://linuxtv.org/media_tree.git
 +F:    Documentation/devicetree/bindings/media/i2c/st,st-vgxy61.yaml
 +F:    Documentation/userspace-api/media/drivers/st-vgxy61.rst
 +F:    drivers/media/i2c/st-vgxy61.c
 +
  ST VL53L0X ToF RANGER(I2C) IIO DRIVER
  M:    Song Qiang <[email protected]>
  L:    [email protected]
@@@ -19737,7 -19528,6 +19747,7 @@@ S:   Supporte
  F:    Documentation/process/stable-kernel-rules.rst
  
  STAGING - ATOMISP DRIVER
 +M:    Hans de Goede <[email protected]>
  M:    Mauro Carvalho Chehab <[email protected]>
  R:    Sakari Ailus <[email protected]>
  L:    [email protected]
@@@ -19821,11 -19611,6 +19831,11 @@@ M: Ion Badulescu <[email protected]
  S:    Odd Fixes
  F:    drivers/net/ethernet/adaptec/starfire*
  
 +STARFIVE DEVICETREES
 +M:    Emil Renner Berthing <[email protected]>
 +S:    Maintained
 +F:    arch/riscv/boot/dts/starfive/
 +
  STARFIVE JH7100 CLOCK DRIVERS
  M:    Emil Renner Berthing <[email protected]>
  S:    Maintained
@@@ -19947,13 -19732,6 +19957,13 @@@ W: https://sunplus.atlassian.net/wiki/s
  F:    Documentation/devicetree/bindings/net/sunplus,sp7021-emac.yaml
  F:    drivers/net/ethernet/sunplus/
  
 +SUNPLUS MMC DRIVER
 +M:    Tony Huang <[email protected]>
 +M:    Li-hao Kuo <[email protected]>
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/mmc/sunplus,mmc.yaml
 +F:    drivers/mmc/host/sunplus-mmc.c
 +
  SUNPLUS OCOTP DRIVER
  M:    Vincent Shih <[email protected]>
  S:    Maintained
@@@ -20205,7 -19983,6 +20215,7 @@@ F:   drivers/clk/clk-sc[mp]i.
  F:    drivers/cpufreq/sc[mp]i-cpufreq.c
  F:    drivers/firmware/arm_scmi/
  F:    drivers/firmware/arm_scpi.c
 +F:    drivers/powercap/arm_scmi_powercap.c
  F:    drivers/regulator/scmi-regulator.c
  F:    drivers/reset/reset-scmi.c
  F:    include/linux/sc[mp]i_protocol.h
@@@ -20540,7 -20317,7 +20550,7 @@@ M:   Chris Zankel <[email protected]
  M:    Max Filippov <[email protected]>
  L:    [email protected]
  S:    Maintained
 -T:    git git://github.com/czankel/xtensa-linux.git
 +T:    git https://github.com/jcmvbkbc/linux-xtensa.git
  F:    arch/xtensa/
  F:    drivers/irqchip/irq-xtensa-*
  
@@@ -20890,6 -20667,7 +20900,6 @@@ W:   https://wireless.wiki.kernel.org/en/
  W:    https://wireless.wiki.kernel.org/en/users/Drivers/wl1251
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/luca/wl12xx.git
  F:    drivers/net/wireless/ti/
 -F:    include/linux/wl12xx.h
  
  TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER
  M:    John Stultz <[email protected]>
@@@ -21953,12 -21731,6 +21963,12 @@@ F: include/linux/virtio*.
  F:    include/uapi/linux/virtio_*.h
  F:    tools/virtio/
  
 +VISL VIRTUAL STATELESS DECODER DRIVER
 +M:    Daniel Almeida <[email protected]>
 +L:    [email protected]
 +S:    Supported
 +F:    drivers/media/test-drivers/visl
 +
  IFCVF VIRTIO DATA PATH ACCELERATOR
  R:    Zhu Lingshan <[email protected]>
  F:    drivers/vdpa/ifcvf/
diff --combined arch/arm64/Kconfig
index 7cb7d635fbcc8c9d2df237ffaf80f8d47eefbe66,cd93d07384256a65b33031b9cb3046461a1ded29..cf6d1cd8b6dc5dbc3b0797eeffeebc2e6c5f1350
@@@ -1,7 -1,6 +1,7 @@@
  # SPDX-License-Identifier: GPL-2.0-only
  config ARM64
        def_bool y
 +      select ACPI_APMT if ACPI
        select ACPI_CCA_REQUIRED if ACPI
        select ACPI_GENERIC_GSI if ACPI
        select ACPI_GTDT if ACPI
@@@ -32,7 -31,6 +32,7 @@@
        select ARCH_HAS_KCOV
        select ARCH_HAS_KEEPINITRD
        select ARCH_HAS_MEMBARRIER_SYNC_CORE
 +      select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
        select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select ARCH_HAS_PTE_DEVMAP
        select ARCH_HAS_PTE_SPECIAL
        select CPU_PM if (SUSPEND || CPU_IDLE)
        select CRC32
        select DCACHE_WORD_ACCESS
 +      select DYNAMIC_FTRACE if FUNCTION_TRACER
        select DMA_DIRECT_REMAP
        select EDAC_SUPPORT
        select FRAME_POINTER
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_CONTIGUOUS
        select HAVE_DYNAMIC_FTRACE
 +      select HAVE_DYNAMIC_FTRACE_WITH_ARGS \
 +              if $(cc-option,-fpatchable-function-entry=2)
        select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
 -              if DYNAMIC_FTRACE_WITH_REGS
 +              if DYNAMIC_FTRACE_WITH_ARGS
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select HAVE_FAST_GUP
        select HAVE_FTRACE_MCOUNT_RECORD
        help
          ARM 64-bit (AArch64) Linux support.
  
 -config CLANG_SUPPORTS_DYNAMIC_FTRACE_WITH_REGS
 +config CLANG_SUPPORTS_DYNAMIC_FTRACE_WITH_ARGS
        def_bool CC_IS_CLANG
        # https://github.com/ClangBuiltLinux/linux/issues/1507
        depends on AS_IS_GNU || (AS_IS_LLVM && (LD_IS_LLD || LD_VERSION >= 23600))
 -      select HAVE_DYNAMIC_FTRACE_WITH_REGS
 +      select HAVE_DYNAMIC_FTRACE_WITH_ARGS
  
 -config GCC_SUPPORTS_DYNAMIC_FTRACE_WITH_REGS
 +config GCC_SUPPORTS_DYNAMIC_FTRACE_WITH_ARGS
        def_bool CC_IS_GCC
        depends on $(cc-option,-fpatchable-function-entry=2)
 -      select HAVE_DYNAMIC_FTRACE_WITH_REGS
 +      select HAVE_DYNAMIC_FTRACE_WITH_ARGS
  
  config 64BIT
        def_bool y
@@@ -375,9 -370,6 +375,9 @@@ config KASAN_SHADOW_OFFSE
        default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
        default 0xffffffffffffffff
  
 +config UNWIND_TABLES
 +      bool
 +
  source "arch/arm64/Kconfig.platforms"
  
  menu "Kernel Features"
@@@ -972,22 -964,6 +972,22 @@@ config ARM64_ERRATUM_245716
  
          If unsure, say Y.
  
 +config ARM64_ERRATUM_2645198
 +      bool "Cortex-A715: 2645198: Workaround possible [ESR|FAR]_ELx corruption"
 +      default y
 +      help
 +        This option adds the workaround for ARM Cortex-A715 erratum 2645198.
 +
 +        If a Cortex-A715 CPU sees a page mapping's permissions change from executable
 +        to non-executable, it may corrupt the ESR_ELx and FAR_ELx registers on the
 +        next instruction abort caused by a permission fault.
 +
 +        Only user space performs executable to non-executable permission transitions,
 +        via the mprotect() system call. Work around the problem by doing a break-before-make
 +        TLB invalidation for all changes to executable user space mappings.
 +
 +        If unsure, say Y.
 +
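The break-before-make sequence referred to above never lets the TLB observe two different valid translations for the same address: the old entry is invalidated and flushed before the new one is written. A self-contained sketch of that ordering, using illustrative stand-ins for the page-table type and helpers (the real arm64 workaround instead hooks ptep_modify_prot_start()/ptep_modify_prot_commit(), declared in the pgtable.h hunk later in this diff):

#include <stdint.h>

typedef uint64_t pte_t;                  /* stand-in for the kernel's pte_t */

static void write_pte(volatile pte_t *ptep, pte_t val) { *ptep = val; }
static void flush_tlb_entry(volatile pte_t *ptep) { (void)ptep; /* dsb + tlbi + dsb on real hardware */ }

/* Break-before-make: invalidate and flush before installing the new permissions. */
static void bbm_change_prot(volatile pte_t *ptep, pte_t new_pte)
{
        write_pte(ptep, 0);              /* break: make the entry invalid */
        flush_tlb_entry(ptep);           /* drop any cached translation */
        write_pte(ptep, new_pte);        /* make: install the new (non-executable) mapping */
}

int main(void)
{
        pte_t pte = 0x0040000000000703;  /* arbitrary "valid, executable" encoding for the demo */

        bbm_change_prot(&pte, pte | (1ULL << 54));   /* e.g. set a no-execute bit */
        return pte == 0;                 /* returns 0: the final entry is valid */
}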
  config CAVIUM_ERRATUM_22375
        bool "Cavium erratum 22375, 24313"
        default y
@@@ -1738,6 -1714,7 +1738,6 @@@ config ARM64_LSE_ATOMIC
  
  config ARM64_USE_LSE_ATOMICS
        bool "Atomic instructions"
 -      depends on JUMP_LABEL
        default y
        help
          As part of the Large System Extensions, ARMv8.1 introduces new
@@@ -1839,7 -1816,7 +1839,7 @@@ config ARM64_PTR_AUTH_KERNE
        # which is only understood by binutils starting with version 2.33.1.
        depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
        depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
 -      depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
 +      depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_ARGS)
        help
          If the compiler supports the -mbranch-protection or
          -msign-return-address flag (e.g. GCC 7 or later), then this option
          disabled with minimal loss of protection.
  
          This feature works with FUNCTION_GRAPH_TRACER option only if
 -        DYNAMIC_FTRACE_WITH_REGS is enabled.
 +        DYNAMIC_FTRACE_WITH_ARGS is enabled.
  
  config CC_HAS_BRANCH_PROT_PAC_RET
        # GCC 9 or later, clang 8 or later
@@@ -1947,7 -1924,7 +1947,7 @@@ config ARM64_BTI_KERNE
        depends on !CC_IS_GCC
        # https://github.com/llvm/llvm-project/commit/a88c722e687e6780dcd6a58718350dc76fcc4cc9
        depends on !CC_IS_CLANG || CLANG_VERSION >= 120000
 -      depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
 +      depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_ARGS)
        help
          Build the kernel with Branch Target Identification annotations
          and enable enforcement of this for kernel code. When this option
@@@ -1988,6 -1965,7 +1988,7 @@@ config ARM64_MT
        depends on ARM64_PAN
        select ARCH_HAS_SUBPAGE_FAULTS
        select ARCH_USES_HIGH_VMA_FLAGS
+       select ARCH_USES_PG_ARCH_X
        help
          Memory Tagging (part of the ARMv8.5 Extensions) provides
          architectural support for run-time, always-on detection of
@@@ -2168,14 -2146,17 +2169,14 @@@ config STACKPROTECTOR_PER_TAS
        def_bool y
        depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_SYSREG
  
 -# The GPIO number here must be sorted by descending number. In case of
 -# a multiplatform kernel, we just want the highest value required by the
 -# selected platforms.
 -config ARCH_NR_GPIO
 -        int
 -        default 2048 if ARCH_APPLE
 -        default 0
 -        help
 -          Maximum number of GPIOs in the system.
 -
 -          If unsure, leave the default value.
 +config UNWIND_PATCH_PAC_INTO_SCS
 +      bool "Enable shadow call stack dynamically using code patching"
 +      # needs Clang with https://reviews.llvm.org/D111780 incorporated
 +      depends on CC_IS_CLANG && CLANG_VERSION >= 150000
 +      depends on ARM64_PTR_AUTH_KERNEL && CC_HAS_BRANCH_PROT_PAC_RET
 +      depends on SHADOW_CALL_STACK
 +      select UNWIND_TABLES
 +      select DYNAMIC_SCS
  
  endmenu # "Kernel Features"
  
diff --combined arch/arm64/include/asm/kvm_host.h
index fd34ab155d0b7c599613b848b4fed6cff8542a00,001c8abe87fc85d60b9b70f9d18e608bcd306b2f..35a159d131b5f862c16b52842ef6b0ba8088fb00
@@@ -73,6 -73,63 +73,63 @@@ u32 __attribute_const__ kvm_target_cpu(
  int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
  void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
  
+ struct kvm_hyp_memcache {
+       phys_addr_t head;
+       unsigned long nr_pages;
+ };
+
+ static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
+                                    phys_addr_t *p,
+                                    phys_addr_t (*to_pa)(void *virt))
+ {
+       *p = mc->head;
+       mc->head = to_pa(p);
+       mc->nr_pages++;
+ }
+
+ static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc,
+                                    void *(*to_va)(phys_addr_t phys))
+ {
+       phys_addr_t *p = to_va(mc->head);
+
+       if (!mc->nr_pages)
+               return NULL;
+       mc->head = *p;
+       mc->nr_pages--;
+       return p;
+ }
+
+ static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc,
+                                      unsigned long min_pages,
+                                      void *(*alloc_fn)(void *arg),
+                                      phys_addr_t (*to_pa)(void *virt),
+                                      void *arg)
+ {
+       while (mc->nr_pages < min_pages) {
+               phys_addr_t *p = alloc_fn(arg);
+
+               if (!p)
+                       return -ENOMEM;
+               push_hyp_memcache(mc, p, to_pa);
+       }
+       return 0;
+ }
+
+ static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc,
+                                      void (*free_fn)(void *virt, void *arg),
+                                      void *(*to_va)(phys_addr_t phys),
+                                      void *arg)
+ {
+       while (mc->nr_pages)
+               free_fn(pop_hyp_memcache(mc, to_va), arg);
+ }
+
+ void free_hyp_memcache(struct kvm_hyp_memcache *mc);
+ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages);
+
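The helpers above thread a free list through the cached pages themselves: each page's first word stores the physical address of the next page, so only a head pointer and a count are needed. A minimal userspace sketch of the same pattern, with identity phys/virt conversions and malloc() standing in for the hypervisor's page allocator (none of these names are kernel API):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct memcache {
        uintptr_t head;                 /* "physical" address of the first cached page */
        unsigned long nr_pages;
};

static uintptr_t to_pa(void *virt) { return (uintptr_t)virt; }  /* identity for the demo */
static void *to_va(uintptr_t phys) { return (void *)phys; }

static void push_page(struct memcache *mc, uintptr_t *page)
{
        *page = mc->head;               /* link the old head into the new page */
        mc->head = to_pa(page);
        mc->nr_pages++;
}

static void *pop_page(struct memcache *mc)
{
        uintptr_t *page;

        if (!mc->nr_pages)
                return NULL;
        page = to_va(mc->head);
        mc->head = *page;               /* unlink */
        mc->nr_pages--;
        return page;
}

int main(void)
{
        struct memcache mc = { 0, 0 };

        for (int i = 0; i < 4; i++) {   /* "top up" the cache with four pages */
                uintptr_t *page = malloc(4096);

                if (page)
                        push_page(&mc, page);
        }
        printf("cached %lu pages\n", mc.nr_pages);
        while (mc.nr_pages)             /* drain and free the cache again */
                free(pop_page(&mc));
        return 0;
}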
  struct kvm_vmid {
        atomic64_t id;
  };
@@@ -115,6 -172,13 +172,13 @@@ struct kvm_smccc_features 
        unsigned long vendor_hyp_bmap;
  };
  
+ typedef unsigned int pkvm_handle_t;
+
+ struct kvm_protected_vm {
+       pkvm_handle_t handle;
+       struct kvm_hyp_memcache teardown_mc;
+ };
+
  struct kvm_arch {
        struct kvm_s2_mmu mmu;
  
  
        u8 pfr0_csv2;
        u8 pfr0_csv3;
+       struct {
+               u8 imp:4;
+               u8 unimp:4;
+       } dfr0_pmuver;
  
        /* Hypercall features firmware registers' descriptor */
        struct kvm_smccc_features smccc_feat;
+
+       /*
+        * For an untrusted host VM, 'pkvm.handle' is used to lookup
+        * the associated pKVM instance in the hypervisor.
+        */
+       struct kvm_protected_vm pkvm;
  };
  
  struct kvm_vcpu_fault_info {
@@@ -306,18 -380,8 +380,18 @@@ struct vcpu_reset_state 
  struct kvm_vcpu_arch {
        struct kvm_cpu_context ctxt;
  
 -      /* Guest floating point state */
 +      /*
 +       * Guest floating point state
 +       *
 +       * The architecture has two main floating point extensions,
 +       * the original FPSIMD and SVE.  These have overlapping
 +       * register views, with the FPSIMD V registers occupying the
 +       * low 128 bits of the SVE Z registers.  When the core
 +       * floating point code saves the register state of a task it
 +       * records which view it saved in fp_type.
 +       */
        void *sve_state;
 +      enum fp_type fp_type;
        unsigned int sve_max_vl;
        u64 svcr;
  
@@@ -925,8 -989,6 +999,6 @@@ int kvm_set_ipa_limit(void)
  #define __KVM_HAVE_ARCH_VM_ALLOC
  struct kvm *kvm_arch_alloc_vm(void);
  
- int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
  static inline bool kvm_vm_is_protected(struct kvm *kvm)
  {
        return false;
diff --combined arch/arm64/include/asm/pgtable.h
index b3faf7582a53fddbac2ab078731613a151bb9af3,8735ac1a1e326572f6c838b056336e0f7892b2ef..6914add66bcf8857633cea33e1cbc905ba184475
@@@ -77,11 -77,11 +77,11 @@@ extern unsigned long empty_zero_page[PA
  static inline phys_addr_t __pte_to_phys(pte_t pte)
  {
        return (pte_val(pte) & PTE_ADDR_LOW) |
 -              ((pte_val(pte) & PTE_ADDR_HIGH) << 36);
 +              ((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
  }
  static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
  {
 -      return (phys | (phys >> 36)) & PTE_ADDR_MASK;
 +      return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PTE_ADDR_MASK;
  }
  #else
  #define __pte_to_phys(pte)    (pte_val(pte) & PTE_ADDR_MASK)
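For concreteness: with 64kB pages and 52-bit physical addresses, PA bits [51:48] are stored in PTE bits [15:12], so moving between the two encodings is a shift by 48 - 12 = 36, which is what PTE_ADDR_HIGH_SHIFT now names instead of the hard-coded 36 above. A standalone round-trip check (the mask values are assumptions matching that 64kB layout, not copied from the kernel headers):

#include <assert.h>
#include <stdint.h>

#define PTE_ADDR_LOW        0x0000ffffffff0000ULL   /* PTE/PA bits [47:16] */
#define PTE_ADDR_HIGH       0x000000000000f000ULL   /* PTE bits [15:12] carry PA bits [51:48] */
#define PTE_ADDR_HIGH_SHIFT 36
#define PTE_ADDR_MASK       (PTE_ADDR_LOW | PTE_ADDR_HIGH)

static uint64_t pte_to_phys(uint64_t pte)
{
        return (pte & PTE_ADDR_LOW) | ((pte & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
}

static uint64_t phys_to_pte_val(uint64_t phys)
{
        return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PTE_ADDR_MASK;
}

int main(void)
{
        uint64_t pa = 0x000f123456780000ULL;        /* a 52-bit, 64kB-aligned physical address */

        assert(pte_to_phys(phys_to_pte_val(pa)) == pa);  /* the packing round-trips losslessly */
        return 0;
}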
@@@ -609,6 -609,7 +609,6 @@@ extern pgd_t init_pg_dir[PTRS_PER_PGD]
  extern pgd_t init_pg_end[];
  extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
  extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
 -extern pgd_t idmap_pg_end[];
  extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
  extern pgd_t reserved_pg_dir[PTRS_PER_PGD];
  
@@@ -862,12 -863,12 +862,12 @@@ static inline bool pte_user_accessible_
  
  static inline bool pmd_user_accessible_page(pmd_t pmd)
  {
 -      return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
 +      return pmd_leaf(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
  }
  
  static inline bool pud_user_accessible_page(pud_t pud)
  {
 -      return pud_present(pud) && pud_user(pud);
 +      return pud_leaf(pud) && pud_user(pud);
  }
  #endif
  
@@@ -1020,6 -1021,8 +1020,6 @@@ static inline pmd_t pmdp_establish(stru
   */
  #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
  
 -extern int kern_addr_valid(unsigned long addr);
 -
  #ifdef CONFIG_ARM64_MTE
  
  #define __HAVE_ARCH_PREPARE_TO_SWAP
@@@ -1046,8 -1049,8 +1046,8 @@@ static inline void arch_swap_invalidate
  #define __HAVE_ARCH_SWAP_RESTORE
  static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
  {
-       if (system_supports_mte() && mte_restore_tags(entry, &folio->page))
-               set_bit(PG_mte_tagged, &folio->flags);
+       if (system_supports_mte())
+               mte_restore_tags(entry, &folio->page);
  }
  
  #endif /* CONFIG_ARM64_MTE */
@@@ -1093,15 -1096,6 +1093,15 @@@ static inline bool pud_sect_supported(v
  }
  
  
 +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
 +#define ptep_modify_prot_start ptep_modify_prot_start
 +extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 +                                  unsigned long addr, pte_t *ptep);
 +
 +#define ptep_modify_prot_commit ptep_modify_prot_commit
 +extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
 +                                  unsigned long addr, pte_t *ptep,
 +                                  pte_t old_pte, pte_t new_pte);
  #endif /* !__ASSEMBLY__ */
  
  #endif /* __ASM_PGTABLE_H */
diff --combined arch/arm64/kernel/cpufeature.c
index 7e76e1fda2a1f8ac62ed56b1fede327aa262c3e6,8ab262d7a26aa2ea8c391ee179e062458bae7a0b..a77315b338e6113c4525ce8a24bae485ea869541
@@@ -212,8 -212,6 +212,8 @@@ static const struct arm64_ftr_bits ftr_
  };
  
  static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
 +      ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0),
 +      ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0),
@@@ -2076,8 -2074,10 +2076,10 @@@ static void cpu_enable_mte(struct arm64
         * Clear the tags in the zero page. This needs to be done via the
         * linear map which has the Tagged attribute.
         */
-       if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags))
+       if (try_page_mte_tagging(ZERO_PAGE(0))) {
                mte_clear_page_tags(lm_alias(empty_zero_page));
+               set_page_mte_tagged(ZERO_PAGE(0));
+       }
  
        kasan_init_hw_tags_cpu();
  }
@@@ -2103,11 -2103,6 +2105,11 @@@ static void cpu_trap_el0_impdef(const s
        sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP);
  }
  
 +static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused)
 +{
 +      set_pstate_dit(1);
 +}
 +
  /* Internal helper functions to match cpu capability type */
  static bool
  cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@@ -2671,18 -2666,6 +2673,18 @@@ static const struct arm64_cpu_capabilit
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_trap_el0_impdef,
        },
 +      {
 +              .desc = "Data independent timing control (DIT)",
 +              .capability = ARM64_HAS_DIT,
 +              .type = ARM64_CPUCAP_SYSTEM_FEATURE,
 +              .sys_reg = SYS_ID_AA64PFR0_EL1,
 +              .sign = FTR_UNSIGNED,
 +              .field_pos = ID_AA64PFR0_EL1_DIT_SHIFT,
 +              .field_width = 4,
 +              .min_field_value = ID_AA64PFR0_EL1_DIT_IMP,
 +              .matches = has_cpuid_feature,
 +              .cpu_enable = cpu_enable_dit,
 +      },
        {},
  };
  
@@@ -2791,7 -2774,6 +2793,7 @@@ static const struct arm64_cpu_capabilit
        HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_EL1_AT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT),
  #ifdef CONFIG_ARM64_SVE
        HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_EL1_SVE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR0_EL1_SVE_IMP, CAP_HWCAP, KERNEL_HWCAP_SVE),
 +      HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SVEver_SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1),
        HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SVEver_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2),
        HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES),
        HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL),
  #endif /* CONFIG_ARM64_MTE */
        HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
        HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
 +      HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_CSSC_IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC),
 +      HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_RPRFM_IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM),
        HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
        HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_WFxT_IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT),
  #ifdef CONFIG_ARM64_SME
@@@ -3457,22 -3437,35 +3459,22 @@@ int do_emulate_mrs(struct pt_regs *regs
        return rc;
  }
  
 -static int emulate_mrs(struct pt_regs *regs, u32 insn)
 +bool try_emulate_mrs(struct pt_regs *regs, u32 insn)
  {
        u32 sys_reg, rt;
  
 +      if (compat_user_mode(regs) || !aarch64_insn_is_mrs(insn))
 +              return false;
 +
        /*
         * sys_reg values are defined as used in mrs/msr instruction.
         * shift the imm value to get the encoding.
         */
        sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5;
        rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
 -      return do_emulate_mrs(regs, sys_reg, rt);
 +      return do_emulate_mrs(regs, sys_reg, rt) == 0;
  }
  
 -static struct undef_hook mrs_hook = {
 -      .instr_mask = 0xffff0000,
 -      .instr_val  = 0xd5380000,
 -      .pstate_mask = PSR_AA32_MODE_MASK,
 -      .pstate_val = PSR_MODE_EL0t,
 -      .fn = emulate_mrs,
 -};
 -
 -static int __init enable_mrs_emulation(void)
 -{
 -      register_undef_hook(&mrs_hook);
 -      return 0;
 -}
 -
 -core_initcall(enable_mrs_emulation);
 -
  enum mitigation_state arm64_get_meltdown_state(void)
  {
        if (__meltdown_safe)
diff --combined arch/arm64/kernel/image-vars.h
index f31130ba02331060fb394fb27e5d9b2cd75cc60e,e3f88b5836a20bdb9e6263cd17812c49e3909cd9..d0e9bb5c91fccad6c22c16d7ab3e24559a85b2b1
@@@ -10,6 -10,7 +10,6 @@@
  #error This file should only be included in vmlinux.lds.S
  #endif
  
 -PROVIDE(__efistub_kernel_size         = _edata - _text);
  PROVIDE(__efistub_primary_entry_offset        = primary_entry - _text);
  
  /*
   * linked at. The routines below are all implemented in assembler in a
   * position independent manner
   */
 -PROVIDE(__efistub_memcmp              = __pi_memcmp);
 -PROVIDE(__efistub_memchr              = __pi_memchr);
 -PROVIDE(__efistub_strlen              = __pi_strlen);
 -PROVIDE(__efistub_strnlen             = __pi_strnlen);
 -PROVIDE(__efistub_strcmp              = __pi_strcmp);
 -PROVIDE(__efistub_strncmp             = __pi_strncmp);
 -PROVIDE(__efistub_strrchr             = __pi_strrchr);
  PROVIDE(__efistub_dcache_clean_poc    = __pi_dcache_clean_poc);
  
  PROVIDE(__efistub__text                       = _text);
@@@ -63,12 -71,6 +63,6 @@@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler)
  /* Vectors installed by hyp-init on reset HVC. */
  KVM_NVHE_ALIAS(__hyp_stub_vectors);
  
- /* Kernel symbol used by icache_is_vpipt(). */
- KVM_NVHE_ALIAS(__icache_flags);
- /* VMID bits set by the KVM VMID allocator */
- KVM_NVHE_ALIAS(kvm_arm_vmid_bits);
  /* Static keys which are set if a vGIC trap should be handled in hyp. */
  KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
  KVM_NVHE_ALIAS(vgic_v3_cpuif_trap);
@@@ -84,9 -86,6 +78,6 @@@ KVM_NVHE_ALIAS(gic_nonsecure_priorities
  KVM_NVHE_ALIAS(__start___kvm_ex_table);
  KVM_NVHE_ALIAS(__stop___kvm_ex_table);
  
- /* Array containing bases of nVHE per-CPU memory regions. */
- KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
  /* PMU available static key */
  #ifdef CONFIG_HW_PERF_EVENTS
  KVM_NVHE_ALIAS(kvm_arm_pmu_available);
@@@ -103,12 -102,6 +94,6 @@@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcp
  KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
  #endif
  
- /* Kernel memory sections */
- KVM_NVHE_ALIAS(__start_rodata);
- KVM_NVHE_ALIAS(__end_rodata);
- KVM_NVHE_ALIAS(__bss_start);
- KVM_NVHE_ALIAS(__bss_stop);
  /* Hyp memory sections */
  KVM_NVHE_ALIAS(__hyp_idmap_text_start);
  KVM_NVHE_ALIAS(__hyp_idmap_text_end);
diff --combined arch/arm64/mm/fault.c
index 3eb2825d08cffc7335fd64e643fbed105541a424,0b1c102b89c90d06e3f2efe70eb790129cf82ad3..596f46dabe4ef2f16f5d10b05e5f9983878961bb
@@@ -354,11 -354,6 +354,11 @@@ static bool is_el1_mte_sync_tag_check_f
        return false;
  }
  
 +static bool is_translation_fault(unsigned long esr)
 +{
 +      return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
 +}
 +
  static void __do_kernel_fault(unsigned long addr, unsigned long esr,
                              struct pt_regs *regs)
  {
        } else if (addr < PAGE_SIZE) {
                msg = "NULL pointer dereference";
        } else {
 -              if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
 +              if (is_translation_fault(esr) &&
 +                  kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
                        return;
  
                msg = "paging request";
@@@ -943,6 -937,8 +943,8 @@@ struct page *alloc_zeroed_user_highpage
  
  void tag_clear_highpage(struct page *page)
  {
+       /* Newly allocated page, shouldn't have been tagged yet */
+       WARN_ON_ONCE(!try_page_mte_tagging(page));
        mte_zero_clear_page_tags(page_address(page));
-       set_bit(PG_mte_tagged, &page->flags);
+       set_page_mte_tagged(page);
  }
diff --combined arch/s390/kernel/entry.S
index e0d11f3adfccd05dfec8fef7dc2b920eef67cf62,12e1773a94a47997301c03c6dbe1ebb92f3da5f2..0f423e9df09565b20c3442309e5d031ee55fdc16
@@@ -122,6 -122,24 +122,6 @@@ _LPP_OFFSET       = __LC_LP
                    "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82
        .endm
  
 -      /*
 -       * The CHKSTG macro jumps to the provided label in case the
 -       * machine check interruption code reports one of unrecoverable
 -       * storage errors:
 -       * - Storage error uncorrected
 -       * - Storage key error uncorrected
 -       * - Storage degradation with Failing-storage-address validity
 -       */
 -      .macro CHKSTG errlabel
 -      TSTMSK  __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR)
 -      jnz     \errlabel
 -      TSTMSK  __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD
 -      jz      .Loklabel\@
 -      TSTMSK  __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR
 -      jnz     \errlabel
 -.Loklabel\@:
 -      .endm
 -
  #if IS_ENABLED(CONFIG_KVM)
        /*
         * The OUTSIDE macro jumps to the provided label in case the value
@@@ -207,18 -225,20 +207,20 @@@ ENDPROC(__switch_to
  
  #if IS_ENABLED(CONFIG_KVM)
  /*
-  * sie64a calling convention:
-  * %r2 pointer to sie control block
-  * %r3 guest register save area
+  * __sie64a calling convention:
+  * %r2 pointer to sie control block phys
+  * %r3 pointer to sie control block virt
+  * %r4 guest register save area
   */
- ENTRY(sie64a)
+ ENTRY(__sie64a)
        stmg    %r6,%r14,__SF_GPRS(%r15)        # save kernel registers
        lg      %r12,__LC_CURRENT
-       stg     %r2,__SF_SIE_CONTROL(%r15)      # save control block pointer
-       stg     %r3,__SF_SIE_SAVEAREA(%r15)     # save guest register save area
+       stg     %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical..
+       stg     %r3,__SF_SIE_CONTROL(%r15)      # ...and virtual addresses
+       stg     %r4,__SF_SIE_SAVEAREA(%r15)     # save guest register save area
        xc      __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
        mvc     __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
-       lmg     %r0,%r13,0(%r3)                 # load guest gprs 0-13
+       lmg     %r0,%r13,0(%r4)                 # load guest gprs 0-13
        lg      %r14,__LC_GMAP                  # get gmap pointer
        ltgr    %r14,%r14
        jz      .Lsie_gmap
        jnz     .Lsie_skip
        TSTMSK  __LC_CPU_FLAGS,_CIF_FPU
        jo      .Lsie_skip                      # exit if fp/vx regs changed
+       lg      %r14,__SF_SIE_CONTROL_PHYS(%r15)        # get sie block phys addr
        BPEXIT  __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
  .Lsie_entry:
        sie     0(%r14)
        BPOFF
        BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
  .Lsie_skip:
+       lg      %r14,__SF_SIE_CONTROL(%r15)     # get control block pointer
        ni      __SIE_PROG0C+3(%r14),0xfe       # no longer in SIE
        lctlg   %c1,%c1,__LC_KERNEL_ASCE        # load primary asce
  .Lsie_done:
  # some program checks are suppressing. C code (e.g. do_protection_exception)
  # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
  # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
- # Other instructions between sie64a and .Lsie_done should not cause program
+ # Other instructions between __sie64a and .Lsie_done should not cause program
  # interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
  .Lrewind_pad6:
        nopr    7
@@@ -275,8 -297,8 +279,8 @@@ sie_exit
        EX_TABLE(.Lrewind_pad4,.Lsie_fault)
        EX_TABLE(.Lrewind_pad2,.Lsie_fault)
        EX_TABLE(sie_exit,.Lsie_fault)
- ENDPROC(sie64a)
- EXPORT_SYMBOL(sie64a)
+ ENDPROC(__sie64a)
+ EXPORT_SYMBOL(__sie64a)
  EXPORT_SYMBOL(sie_exit)
  #endif
  
@@@ -355,7 -377,7 +359,7 @@@ ENTRY(pgm_check_handler
        j       3f                      # -> fault in user space
  .Lpgm_skip_asce:
  #if IS_ENABLED(CONFIG_KVM)
-       # cleanup critical section for program checks in sie64a
+       # cleanup critical section for program checks in __sie64a
        OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f
        SIEEXIT
        lghi    %r10,_PIF_GUEST_FAULT
@@@ -528,18 -550,26 +532,18 @@@ ENTRY(mcck_int_handler
  3:    TSTMSK  __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
        jno     .Lmcck_panic
        tmhh    %r8,0x0001              # interrupting from user ?
 -      jnz     6f
 +      jnz     .Lmcck_user
        TSTMSK  __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
        jno     .Lmcck_panic
  #if IS_ENABLED(CONFIG_KVM)
 -      OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f
 +      OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack
        OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f
        oi      __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
 -      j       5f
 -4:    CHKSTG  .Lmcck_panic
 -5:    larl    %r14,.Lstosm_tmp
 -      stosm   0(%r14),0x04            # turn dat on, keep irqs off
 -      BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
 +4:    BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
        SIEEXIT
        j       .Lmcck_stack
  #endif
 -6:    CHKSTG  .Lmcck_panic
 -      larl    %r14,.Lstosm_tmp
 -      stosm   0(%r14),0x04            # turn dat on, keep irqs off
 -      tmhh    %r8,0x0001              # interrupting from user ?
 -      jz      .Lmcck_stack
 +.Lmcck_user:
        BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
  .Lmcck_stack:
        lg      %r15,__LC_MCCK_STACK
diff --combined arch/s390/kvm/vsie.c
index ace2541ababd383448ff654f917b66983f3b33a4,0e9d020d70932754cf0662521800a2bf75e14c6a..b6a0219e470a4a4ba4609c5fb2131a06d9ddc0c7
@@@ -546,10 -546,8 +546,10 @@@ static int shadow_scb(struct kvm_vcpu *
        if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
                scb_s->eca |= scb_o->eca & ECA_CEI;
        /* Epoch Extension */
 -      if (test_kvm_facility(vcpu->kvm, 139))
 +      if (test_kvm_facility(vcpu->kvm, 139)) {
                scb_s->ecd |= scb_o->ecd & ECD_MEF;
 +              scb_s->epdx = scb_o->epdx;
 +      }
  
        /* etoken */
        if (test_kvm_facility(vcpu->kvm, 156))
@@@ -656,7 -654,7 +656,7 @@@ static int pin_guest_page(struct kvm *k
        page = gfn_to_page(kvm, gpa_to_gfn(gpa));
        if (is_error_page(page))
                return -EINVAL;
-       *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+       *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
        return 0;
  }
  
@@@ -871,7 -869,7 +871,7 @@@ static int pin_scb(struct kvm_vcpu *vcp
                WARN_ON_ONCE(rc);
                return 1;
        }
-       vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+       vsie_page->scb_o = phys_to_virt(hpa);
        return 0;
  }
  
diff --combined arch/s390/mm/gmap.c
index 8947451ae0210f35b7a584e4c3c7854972b95d88,2ccfcc8a3863ee30c8389ac39e4e9a2a4750021b..74e1d873dce050fae2c1bc282498371ba3f981bb
@@@ -72,7 -72,7 +72,7 @@@ static struct gmap *gmap_alloc(unsigne
                goto out_free;
        page->index = 0;
        list_add(&page->lru, &gmap->crst_list);
-       table = (unsigned long *) page_to_phys(page);
+       table = page_to_virt(page);
        crst_table_init(table, etype);
        gmap->table = table;
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
@@@ -311,12 -311,12 +311,12 @@@ static int gmap_alloc_table(struct gma
        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
        if (!page)
                return -ENOMEM;
-       new = (unsigned long *) page_to_phys(page);
+       new = page_to_virt(page);
        crst_table_init(new, init);
        spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
-               *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
+               *table = __pa(new) | _REGION_ENTRY_LENGTH |
                        (*table & _REGION_ENTRY_TYPE_MASK);
                page->index = gaddr;
                page = NULL;
  static unsigned long __gmap_segment_gaddr(unsigned long *entry)
  {
        struct page *page;
 -      unsigned long offset, mask;
 +      unsigned long offset;
  
        offset = (unsigned long) entry / sizeof(unsigned long);
        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
 -      mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
 -      page = virt_to_page((void *)((unsigned long) entry & mask));
 +      page = pmd_pgtable_page((pmd_t *) entry);
        return page->index + offset;
  }
  
@@@ -556,7 -557,7 +556,7 @@@ int __gmap_link(struct gmap *gmap, unsi
                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
                                     gaddr & _REGION1_MASK))
                        return -ENOMEM;
-               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
                                     gaddr & _REGION2_MASK))
                        return -ENOMEM;
-               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
                                     gaddr & _REGION3_MASK))
                        return -ENOMEM;
-               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
        /* Walk the parent mm page table */
@@@ -812,7 -813,7 +812,7 @@@ static inline unsigned long *gmap_table
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
-               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION2:
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
-               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION3:
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
-               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_SEGMENT:
                table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
-               table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+               table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
                table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
        }
        return table;
@@@ -1149,7 -1150,7 +1149,7 @@@ int gmap_read_table(struct gmap *gmap, 
                        if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
                                address = pte_val(pte) & PAGE_MASK;
                                address += gaddr & ~PAGE_MASK;
-                               *val = *(unsigned long *) address;
+                               *val = *(unsigned long *)__va(address);
                                set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
                                /* Do *NOT* clear the _PAGE_INVALID bit! */
                                rc = 0;
@@@ -1334,7 -1335,8 +1334,8 @@@ static void __gmap_unshadow_pgt(struct 
   */
  static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
  {
-       unsigned long sto, *ste, *pgt;
+       unsigned long *ste;
+       phys_addr_t sto, pgt;
        struct page *page;
  
        BUG_ON(!gmap_is_shadow(sg));
        if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
-       sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
+       sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
        gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
-       pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+       pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
        *ste = _SEGMENT_ENTRY_EMPTY;
-       __gmap_unshadow_pgt(sg, raddr, pgt);
+       __gmap_unshadow_pgt(sg, raddr, __va(pgt));
        /* Free page table */
-       page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+       page = phys_to_page(pgt);
        list_del(&page->lru);
        page_table_free_pgste(page);
  }
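The freeing paths in these hunks replace pfn_to_page(__pa(x) >> PAGE_SHIFT) with phys_to_page(x) once the table origin is carried as phys_addr_t; the identity they rely on, written out:

	/*
	 * For a page-aligned physical address p (the table origin),
	 *	phys_to_page(p) == pfn_to_page(p >> PAGE_SHIFT)
	 * and when the origin used to be held as a virtual pointer v,
	 *	phys_to_page(__pa(v)) == pfn_to_page(__pa(v) >> PAGE_SHIFT)
	 * which is exactly the before/after of the lines above.
	 */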
  static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
                                unsigned long *sgt)
  {
-       unsigned long *pgt;
        struct page *page;
+       phys_addr_t pgt;
        int i;
  
        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
                if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
                        continue;
-               pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+               pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
                sgt[i] = _SEGMENT_ENTRY_EMPTY;
-               __gmap_unshadow_pgt(sg, raddr, pgt);
+               __gmap_unshadow_pgt(sg, raddr, __va(pgt));
                /* Free page table */
-               page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+               page = phys_to_page(pgt);
                list_del(&page->lru);
                page_table_free_pgste(page);
        }
   */
  static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
  {
-       unsigned long r3o, *r3e, *sgt;
+       unsigned long r3o, *r3e;
+       phys_addr_t sgt;
        struct page *page;
  
        BUG_ON(!gmap_is_shadow(sg));
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
        r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
-       gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
-       sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+       gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
+       sgt = *r3e & _REGION_ENTRY_ORIGIN;
        *r3e = _REGION3_ENTRY_EMPTY;
-       __gmap_unshadow_sgt(sg, raddr, sgt);
+       __gmap_unshadow_sgt(sg, raddr, __va(sgt));
        /* Free segment table */
-       page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+       page = phys_to_page(sgt);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
  }
  static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r3t)
  {
-       unsigned long *sgt;
        struct page *page;
+       phys_addr_t sgt;
        int i;
  
        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
                if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
-               sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+               sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
                r3t[i] = _REGION3_ENTRY_EMPTY;
-               __gmap_unshadow_sgt(sg, raddr, sgt);
+               __gmap_unshadow_sgt(sg, raddr, __va(sgt));
                /* Free segment table */
-               page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+               page = phys_to_page(sgt);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
   */
  static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
  {
-       unsigned long r2o, *r2e, *r3t;
+       unsigned long r2o, *r2e;
+       phys_addr_t r3t;
        struct page *page;
  
        BUG_ON(!gmap_is_shadow(sg));
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
        r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
-       gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
-       r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+       gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
+       r3t = *r2e & _REGION_ENTRY_ORIGIN;
        *r2e = _REGION2_ENTRY_EMPTY;
-       __gmap_unshadow_r3t(sg, raddr, r3t);
+       __gmap_unshadow_r3t(sg, raddr, __va(r3t));
        /* Free region 3 table */
-       page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+       page = phys_to_page(r3t);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
  }
  static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r2t)
  {
-       unsigned long *r3t;
+       phys_addr_t r3t;
        struct page *page;
        int i;
  
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
                if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
-               r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+               r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
                r2t[i] = _REGION2_ENTRY_EMPTY;
-               __gmap_unshadow_r3t(sg, raddr, r3t);
+               __gmap_unshadow_r3t(sg, raddr, __va(r3t));
                /* Free region 3 table */
-               page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+               page = phys_to_page(r3t);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
   */
  static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
  {
-       unsigned long r1o, *r1e, *r2t;
+       unsigned long r1o, *r1e;
        struct page *page;
+       phys_addr_t r2t;
  
        BUG_ON(!gmap_is_shadow(sg));
        r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
        r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
-       gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
-       r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+       gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
+       r2t = *r1e & _REGION_ENTRY_ORIGIN;
        *r1e = _REGION1_ENTRY_EMPTY;
-       __gmap_unshadow_r2t(sg, raddr, r2t);
+       __gmap_unshadow_r2t(sg, raddr, __va(r2t));
        /* Free region 2 table */
-       page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+       page = phys_to_page(r2t);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
  }
  static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r1t)
  {
-       unsigned long asce, *r2t;
+       unsigned long asce;
        struct page *page;
+       phys_addr_t r2t;
        int i;
  
        BUG_ON(!gmap_is_shadow(sg));
-       asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+       asce = __pa(r1t) | _ASCE_TYPE_REGION1;
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
                if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
-               r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
-               __gmap_unshadow_r2t(sg, raddr, r2t);
+               r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
+               __gmap_unshadow_r2t(sg, raddr, __va(r2t));
                /* Clear entry and flush translation r1t -> r2t */
                gmap_idte_one(asce, raddr);
                r1t[i] = _REGION1_ENTRY_EMPTY;
                /* Free region 2 table */
-               page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+               page = phys_to_page(r2t);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
@@@ -1572,7 -1578,7 +1577,7 @@@ static void gmap_unshadow(struct gmap *
        sg->removed = 1;
        gmap_call_notifier(sg, 0, -1UL);
        gmap_flush_tlb(sg);
-       table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+       table = __va(sg->asce & _ASCE_ORIGIN);
        switch (sg->asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_REGION1:
                __gmap_unshadow_r1t(sg, 0, table);
@@@ -1747,7 -1753,8 +1752,8 @@@ int gmap_shadow_r2t(struct gmap *sg, un
                    int fake)
  {
        unsigned long raddr, origin, offset, len;
-       unsigned long *s_r2t, *table;
+       unsigned long *table;
+       phys_addr_t s_r2t;
        struct page *page;
        int rc;
  
        page->index = r2t & _REGION_ENTRY_ORIGIN;
        if (fake)
                page->index |= GMAP_SHADOW_FAKE_TABLE;
-       s_r2t = (unsigned long *) page_to_phys(page);
+       s_r2t = page_to_phys(page);
        /* Install shadow region second table */
        spin_lock(&sg->guest_table_lock);
        table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
                rc = -EAGAIN;           /* Race with shadow */
                goto out_free;
        }
-       crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+       crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
        /* mark as invalid as long as the parent table is not protected */
-       *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+       *table = s_r2t | _REGION_ENTRY_LENGTH |
                 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
        if (sg->edat_level >= 1)
                *table |= (r2t & _REGION_ENTRY_PROTECT);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 4);
-               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
-                             (unsigned long) s_r2t)
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
                        rc = -EAGAIN;           /* Race with unshadow */
                else
                        *table &= ~_REGION_ENTRY_INVALID;
@@@ -1831,7 -1837,8 +1836,8 @@@ int gmap_shadow_r3t(struct gmap *sg, un
                    int fake)
  {
        unsigned long raddr, origin, offset, len;
-       unsigned long *s_r3t, *table;
+       unsigned long *table;
+       phys_addr_t s_r3t;
        struct page *page;
        int rc;
  
        page->index = r3t & _REGION_ENTRY_ORIGIN;
        if (fake)
                page->index |= GMAP_SHADOW_FAKE_TABLE;
-       s_r3t = (unsigned long *) page_to_phys(page);
+       s_r3t = page_to_phys(page);
        /* Install shadow region third table */
        spin_lock(&sg->guest_table_lock);
        table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
                rc = -EAGAIN;           /* Race with shadow */
                goto out_free;
        }
-       crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+       crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
        /* mark as invalid as long as the parent table is not protected */
-       *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+       *table = s_r3t | _REGION_ENTRY_LENGTH |
                 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
        if (sg->edat_level >= 1)
                *table |= (r3t & _REGION_ENTRY_PROTECT);
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 3);
-               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
-                             (unsigned long) s_r3t)
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
                        rc = -EAGAIN;           /* Race with unshadow */
                else
                        *table &= ~_REGION_ENTRY_INVALID;
@@@ -1915,7 -1921,8 +1920,8 @@@ int gmap_shadow_sgt(struct gmap *sg, un
                    int fake)
  {
        unsigned long raddr, origin, offset, len;
-       unsigned long *s_sgt, *table;
+       unsigned long *table;
+       phys_addr_t s_sgt;
        struct page *page;
        int rc;
  
        page->index = sgt & _REGION_ENTRY_ORIGIN;
        if (fake)
                page->index |= GMAP_SHADOW_FAKE_TABLE;
-       s_sgt = (unsigned long *) page_to_phys(page);
+       s_sgt = page_to_phys(page);
        /* Install shadow segment table */
        spin_lock(&sg->guest_table_lock);
        table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
                rc = -EAGAIN;           /* Race with shadow */
                goto out_free;
        }
-       crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+       crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
        /* mark as invalid as long as the parent table is not protected */
-       *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+       *table = s_sgt | _REGION_ENTRY_LENGTH |
                 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
        if (sg->edat_level >= 1)
                *table |= sgt & _REGION_ENTRY_PROTECT;
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 2);
-               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
-                             (unsigned long) s_sgt)
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
                        rc = -EAGAIN;           /* Race with unshadow */
                else
                        *table &= ~_REGION_ENTRY_INVALID;
@@@ -2039,8 -2045,9 +2044,9 @@@ int gmap_shadow_pgt(struct gmap *sg, un
                    int fake)
  {
        unsigned long raddr, origin;
-       unsigned long *s_pgt, *table;
+       unsigned long *table;
        struct page *page;
+       phys_addr_t s_pgt;
        int rc;
  
        BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
        page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
        if (fake)
                page->index |= GMAP_SHADOW_FAKE_TABLE;
-       s_pgt = (unsigned long *) page_to_phys(page);
+       s_pgt = page_to_phys(page);
        /* Install shadow page table */
        spin_lock(&sg->guest_table_lock);
        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
        spin_lock(&sg->guest_table_lock);
        if (!rc) {
                table = gmap_table_walk(sg, saddr, 1);
-               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
-                             (unsigned long) s_pgt)
+               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
                        rc = -EAGAIN;           /* Race with unshadow */
                else
                        *table &= ~_SEGMENT_ENTRY_INVALID;
diff --combined arch/s390/mm/init.c
index 1a25d456d8657664f52149f68e0bdbbd0241a028,d509656c67d77422604f5fdd46d18d0e94d0ea0c..30ab55f868f6d87dbbb80c3d81b24105ee00de4c
@@@ -31,7 -31,6 +31,7 @@@
  #include <linux/cma.h>
  #include <linux/gfp.h>
  #include <linux/dma-direct.h>
 +#include <linux/percpu.h>
  #include <asm/processor.h>
  #include <linux/uaccess.h>
  #include <asm/pgalloc.h>
@@@ -141,25 -140,25 +141,25 @@@ void mark_rodata_ro(void
        debug_checkwx();
  }
  
- int set_memory_encrypted(unsigned long addr, int numpages)
+ int set_memory_encrypted(unsigned long vaddr, int numpages)
  {
        int i;
  
        /* make specified pages unshared (swiotlb, dma_free) */
        for (i = 0; i < numpages; ++i) {
-               uv_remove_shared(addr);
-               addr += PAGE_SIZE;
+               uv_remove_shared(virt_to_phys((void *)vaddr));
+               vaddr += PAGE_SIZE;
        }
        return 0;
  }
  
- int set_memory_decrypted(unsigned long addr, int numpages)
+ int set_memory_decrypted(unsigned long vaddr, int numpages)
  {
        int i;
        /* make specified pages shared (swiotlb, dma_alloc) */
        for (i = 0; i < numpages; ++i) {
-               uv_set_shared(addr);
-               addr += PAGE_SIZE;
+               uv_set_shared(virt_to_phys((void *)vaddr));
+               vaddr += PAGE_SIZE;
        }
        return 0;
  }
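With this change the helpers take a kernel virtual address and do the virt_to_phys() conversion per page before issuing the ultravisor call. A hedged usage sketch, with an invented caller name, using only the signature shown above:

	/* Share an already-mapped kernel buffer with the ultravisor; pass the
	 * *virtual* start address, the helper converts each page internally. */
	static int uv_share_buffer_example(void *buf, int numpages)
	{
		return set_memory_decrypted((unsigned long)buf, numpages);
	}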
@@@ -208,6 -207,9 +208,6 @@@ void free_initmem(void
        __set_memory((unsigned long)_sinittext,
                     (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
                     SET_MEMORY_RW | SET_MEMORY_NX);
 -      free_reserved_area(sclp_early_sccb,
 -                         sclp_early_sccb + EXT_SCCB_READ_SCP,
 -                         POISON_FREE_INITMEM, "unused early sccb");
        free_initmem_default(POISON_FREE_INITMEM);
  }
  
@@@ -220,41 -222,6 +220,41 @@@ unsigned long memory_block_size_bytes(v
        return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm);
  }
  
 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 +EXPORT_SYMBOL(__per_cpu_offset);
 +
 +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 +{
 +      return LOCAL_DISTANCE;
 +}
 +
 +static int __init pcpu_cpu_to_node(int cpu)
 +{
 +      return 0;
 +}
 +
 +void __init setup_per_cpu_areas(void)
 +{
 +      unsigned long delta;
 +      unsigned int cpu;
 +      int rc;
 +
 +      /*
 +       * Always reserve area for module percpu variables.  That's
 +       * what the legacy allocator did.
 +       */
 +      rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 +                                  PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
 +                                  pcpu_cpu_distance,
 +                                  pcpu_cpu_to_node);
 +      if (rc < 0)
 +              panic("Failed to initialize percpu areas.");
 +
 +      delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 +      for_each_possible_cpu(cpu)
 +              __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
 +}
 +
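The offsets computed above are what the generic per-CPU accessors add to a per-CPU symbol's link-time address; conceptually (a simplification of the real SHIFT_PERCPU_PTR machinery, not the kernel's actual macro):

	#define EXAMPLE_PER_CPU_PTR(ptr, cpu) \
		((typeof(ptr))((unsigned long)(ptr) + __per_cpu_offset[cpu]))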
  #ifdef CONFIG_MEMORY_HOTPLUG
  
  #ifdef CONFIG_CMA
diff --combined arch/x86/events/intel/lbr.c
index 017baba56b01923bf974c4d756590828d55e72e7,4dbde69c423bacd3df30396ea8ef1b21db662ded..1f21f576ca77fee6040a4d555eeac87ec706d638
@@@ -515,21 -515,21 +515,21 @@@ static void __intel_pmu_lbr_save(void *
        cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
  }
  
 -void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
 -                               struct perf_event_context *next)
 +void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
 +                               struct perf_event_pmu_context *next_epc)
  {
        void *prev_ctx_data, *next_ctx_data;
  
 -      swap(prev->task_ctx_data, next->task_ctx_data);
 +      swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
  
        /*
 -       * Architecture specific synchronization makes sense in
 -       * case both prev->task_ctx_data and next->task_ctx_data
 +       * Architecture specific synchronization makes sense in case
 +       * both prev_epc->task_ctx_data and next_epc->task_ctx_data
         * pointers are allocated.
         */
  
 -      prev_ctx_data = next->task_ctx_data;
 -      next_ctx_data = prev->task_ctx_data;
 +      prev_ctx_data = next_epc->task_ctx_data;
 +      next_ctx_data = prev_epc->task_ctx_data;
  
        if (!prev_ctx_data || !next_ctx_data)
                return;
             task_context_opt(next_ctx_data)->lbr_callstack_users);
  }
  
 -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        void *task_ctx;
         * the task was scheduled out, restore the stack. Otherwise flush
         * the LBR stack.
         */
 -      task_ctx = ctx ? ctx->task_ctx_data : NULL;
 +      task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
        if (task_ctx) {
                if (sched_in)
                        __intel_pmu_lbr_restore(task_ctx);
@@@ -587,8 -587,8 +587,8 @@@ void intel_pmu_lbr_add(struct perf_even
  
        cpuc->br_sel = event->hw.branch_reg.reg;
  
 -      if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
 -              task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
 +      if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
 +              task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
  
        /*
         * Request pmu::sched_task() callback, which will fire inside the
         */
        if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
                cpuc->lbr_pebs_users++;
 -      perf_sched_cb_inc(event->ctx->pmu);
 +      perf_sched_cb_inc(event->pmu);
        if (!cpuc->lbr_users++ && !event->total_time_running)
                intel_pmu_lbr_reset();
  }
@@@ -664,8 -664,8 +664,8 @@@ void intel_pmu_lbr_del(struct perf_even
                return;
  
        if (branch_user_callstack(cpuc->br_sel) &&
 -          event->ctx->task_ctx_data)
 -              task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
 +          event->pmu_ctx->task_ctx_data)
 +              task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
  
        if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
                cpuc->lbr_select = 0;
        cpuc->lbr_users--;
        WARN_ON_ONCE(cpuc->lbr_users < 0);
        WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
 -      perf_sched_cb_dec(event->ctx->pmu);
 +      perf_sched_cb_dec(event->pmu);
  }
  
  static inline bool vlbr_exclude_host(void)
@@@ -1603,10 -1603,8 +1603,8 @@@ clear_arch_lbr
   * x86_perf_get_lbr - get the LBR records information
   *
   * @lbr: the caller's memory to store the LBR records information
-  *
-  * Returns: 0 indicates the LBR info has been successfully obtained
   */
- int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
  {
        int lbr_fmt = x86_pmu.intel_cap.lbr_format;
  
        lbr->from = x86_pmu.lbr_from;
        lbr->to = x86_pmu.lbr_to;
        lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0;
-       return 0;
  }
  EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
  
diff --combined arch/x86/include/asm/cpufeatures.h
index 2dd2691b5ee1d92dce6ad2f749a3c54dc3819b72,1419c4e04d45f38f35c378bfef056a1f1af37c26..61012476d66e0e131939f2146d248e1215b7b152
  #define X86_FEATURE_UNRET             (11*32+15) /* "" AMD BTB untrain return */
  #define X86_FEATURE_USE_IBPB_FW               (11*32+16) /* "" Use IBPB during runtime firmware calls */
  #define X86_FEATURE_RSB_VMEXIT_LITE   (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
 +#define X86_FEATURE_SGX_EDECCSSA      (11*32+18) /* "" SGX EDECCSSA user leaf function */
 +#define X86_FEATURE_CALL_DEPTH                (11*32+19) /* "" Call depth tracking for RSB stuffing */
 +#define X86_FEATURE_MSR_TSX_CTRL      (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
  
  /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
  #define X86_FEATURE_AVX_VNNI          (12*32+ 4) /* AVX VNNI instructions */
  #define X86_FEATURE_AVX512_BF16               (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+ #define X86_FEATURE_CMPCCXADD           (12*32+ 7) /* "" CMPccXADD instructions */
+ #define X86_FEATURE_AMX_FP16          (12*32+21) /* "" AMX fp16 Support */
+ #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
  
  /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
  #define X86_FEATURE_CLZERO            (13*32+ 0) /* CLZERO instruction */
diff --combined arch/x86/include/asm/hyperv-tlfs.h
index 6d9368ea3701ca11b441a7324ba6cceed2e66308,e3efaf6e6b6285d4e2da7abfb87340ce257b4345..08e822bd7aa601314980dfada7688407df539cbb
@@@ -61,6 -61,8 +61,8 @@@
  #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE          BIT(10)
  /* Support for debug MSRs available */
  #define HV_FEATURE_DEBUG_MSRS_AVAILABLE                       BIT(11)
+ /* Support for extended gva ranges for flush hypercalls available */
+ #define HV_FEATURE_EXT_GVA_RANGES_FLUSH                       BIT(14)
  /*
   * Support for returning hypercall output block via XMM
   * registers is available
@@@ -374,20 -376,11 +376,20 @@@ struct hv_nested_enlightenments_contro
  struct hv_vp_assist_page {
        __u32 apic_assist;
        __u32 reserved1;
 -      __u64 vtl_control[3];
 +      __u32 vtl_entry_reason;
 +      __u32 vtl_reserved;
 +      __u64 vtl_ret_x64rax;
 +      __u64 vtl_ret_x64rcx;
        struct hv_nested_enlightenments_control nested_control;
        __u8 enlighten_vmentry;
        __u8 reserved2[7];
        __u64 current_nested_vmcs;
 +      __u8 synthetic_time_unhalted_timer_expired;
 +      __u8 reserved3[7];
 +      __u8 virtualization_fault_information[40];
 +      __u8 reserved4[8];
 +      __u8 intercept_message[256];
 +      __u8 vtl_ret_actions[256];
  } __packed;
  
  struct hv_enlightened_vmcs {
  
  #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL                    0xFFFF
  
+ /*
+  * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by
+  * pairing it with architecturally impossible exit reasons.  Bit 28 is set only
+  * on SMI exits to an SMI transfer monitor (STM) and if and only if an MTF VM-Exit
+  * is pending.  I.e. it will never be set by hardware for non-SMI exits (there
+  * are only three), nor will it ever be set unless the VMM is an STM.
+  */
+ #define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH         0x10000031
+ /*
+  * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose
+  * SVM enlightenments to guests.
+  */
+ struct hv_vmcb_enlightenments {
+       struct __packed hv_enlightenments_control {
+               u32 nested_flush_hypercall:1;
+               u32 msr_bitmap:1;
+               u32 enlightened_npt_tlb: 1;
+               u32 reserved:29;
+       } __packed hv_enlightenments_control;
+       u32 hv_vp_id;
+       u64 hv_vm_id;
+       u64 partition_assist_page;
+       u64 reserved;
+ } __packed;
+ /*
+  * Hyper-V uses the software reserved clean bit in VMCB.
+  */
+ #define HV_VMCB_NESTED_ENLIGHTENMENTS         31
+ /* Synthetic VM-Exit */
+ #define HV_SVM_EXITCODE_ENL                   0xf0000000
+ #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH  (1)
  struct hv_partition_assist_pg {
        u32 tlb_lock_count;
  };
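The synthetic exit codes defined above are values hardware never produces, so a consumer can key off the raw numbers alone. A minimal sketch of such checks, assuming the SVM sub-code is delivered in exit_info_1 (helper names are invented for illustration):

	/* VMX: bit 28 paired with an architecturally impossible basic exit reason. */
	static bool hv_vmx_trap_after_flush(u32 exit_reason)
	{
		return exit_reason == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH;
	}

	/* SVM: synthetic exit code; sub-code assumed to arrive in exit_info_1. */
	static bool hv_svm_trap_after_flush(u64 exit_code, u64 exit_info_1)
	{
		return exit_code == HV_SVM_EXITCODE_ENL &&
		       exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
	}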
diff --combined arch/x86/kernel/kvm.c
index 4d053cb2c48a48be477d05a0797c451499b769ec,cf886f86038a0d11ffbfe5b5d5160c5a0df8ac1d..1cceac5984daa902f311a4001ba1a028ab6f91b3
@@@ -349,7 -349,7 +349,7 @@@ static notrace void kvm_guest_apic_eoi_
  static void kvm_guest_cpu_init(void)
  {
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
-               u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+               u64 pa;
  
                WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
  
@@@ -798,13 -798,19 +798,13 @@@ extern bool __raw_callee_save___kvm_vcp
   * Hand-optimized version for x86-64 that avoids saving and restoring
   * 8 64-bit registers to/from the stack.
   */
 -asm(
 -".pushsection .text;"
 -".global __raw_callee_save___kvm_vcpu_is_preempted;"
 -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
 -"__raw_callee_save___kvm_vcpu_is_preempted:"
 -ASM_ENDBR
 -"movq __per_cpu_offset(,%rdi,8), %rax;"
 -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
 -"setne        %al;"
 -ASM_RET
 -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
 -".popsection");
 +#define PV_VCPU_PREEMPTED_ASM                                              \
 + "movq   __per_cpu_offset(,%rdi,8), %rax\n\t"                              \
 + "cmpb   $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
 + "setne  %al\n\t"
  
 +DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
 +                  PV_VCPU_PREEMPTED_ASM, .text);
  #endif
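The DEFINE_PARAVIRT_ASM() form above emits the same callee-save thunk as the old open-coded asm; what that thunk computes, sketched in C against the steal_time per-CPU area this file defines (the function name is illustrative only):

	/* Test the target vCPU's steal_time.preempted byte; the asm thunk exists
	 * so that no argument registers are clobbered on the call. */
	static bool vcpu_preempted_sketch(long cpu)
	{
		struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

		return READ_ONCE(st->preempted) != 0;
	}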
  
  static void __init kvm_guest_init(void)
diff --combined arch/x86/kvm/cpuid.c
index c92c49a0b35b048f6874ee6115bfcc2ddba48563,0b5bf013fcb8e3f516d7bbda6351feb392f2bb61..b14653b61470c2720267006f7e7166ebac159613
@@@ -62,10 -62,16 +62,16 @@@ u32 xstate_required_size(u64 xstate_bv
   * This one is tied to SSB in the user API, and not
   * visible in /proc/cpuinfo.
   */
- #define KVM_X86_FEATURE_PSFD          (13*32+28) /* Predictive Store Forwarding Disable */
+ #define KVM_X86_FEATURE_AMD_PSFD      (13*32+28) /* Predictive Store Forwarding Disable */
  
  #define F feature_bit
- #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+ /* Scattered Flag - For features that are scattered by cpufeatures.h. */
+ #define SF(name)                                              \
+ ({                                                            \
+       BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES);   \
+       (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0);       \
+ })
  
  /*
   * Magic value used by KVM when querying userspace-provided CPUID entries and
@@@ -543,9 -549,9 +549,9 @@@ static __always_inline void __kvm_cpu_c
  }
  
  static __always_inline
- void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+ void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
  {
-       /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+       /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
        BUILD_BUG_ON(leaf < NCAPINTS);
  
        kvm_cpu_caps[leaf] = mask;
  
  static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
  {
-       /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+       /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
        BUILD_BUG_ON(leaf >= NCAPINTS);
  
        kvm_cpu_caps[leaf] &= mask;
@@@ -657,15 -663,20 +663,20 @@@ void kvm_set_cpu_caps(void
                kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
  
        kvm_cpu_cap_mask(CPUID_7_1_EAX,
-               F(AVX_VNNI) | F(AVX512_BF16)
+               F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) |
+               F(AVX_IFMA)
+       );
+       kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
+               F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
        );
  
        kvm_cpu_cap_mask(CPUID_D_1_EAX,
                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
        );
  
-       kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+       kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
 -              SF(SGX1) | SF(SGX2)
 +              SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
        );
  
        kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
                F(CLZERO) | F(XSAVEERPTR) |
                F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
                F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
-               __feature_bit(KVM_X86_FEATURE_PSFD)
+               __feature_bit(KVM_X86_FEATURE_AMD_PSFD)
        );
  
        /*
@@@ -913,9 -924,9 +924,9 @@@ static inline int __do_cpuid_func(struc
                                goto out;
  
                        cpuid_entry_override(entry, CPUID_7_1_EAX);
+                       cpuid_entry_override(entry, CPUID_7_1_EDX);
                        entry->ebx = 0;
                        entry->ecx = 0;
-                       entry->edx = 0;
                }
                break;
        case 0xa: { /* Architectural Performance Monitoring */
                 * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
                 * expected to derive it from supported XCR0.
                 */
 -              entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
 -                            SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
 -                            SGX_ATTR_KSS;
 +              entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
                entry->ebx &= 0;
                break;
        /* Intel PT */
                 * Other defined bits are for MSRs that KVM does not expose:
                 *   EAX      3      SPCL, SMM page configuration lock
                 *   EAX      13     PCMSR, Prefetch control MSR
+                *
+                * KVM doesn't support SMM_CTL.
+                *   EAX       9     SMM_CTL MSR is not supported
                 */
                entry->eax &= BIT(0) | BIT(2) | BIT(6);
+               entry->eax |= BIT(9);
                if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
                        entry->eax |= BIT(2);
                if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
diff --combined arch/x86/kvm/mmu/mmu.c
index b6f96d47e596d1018987755a2f96a8a9305fda0c,4736d7849c60f2029e914e56093276d0f5c16c26..835426254e768c1e7f50aab3f84867bd75906f56
@@@ -22,6 -22,7 +22,7 @@@
  #include "tdp_mmu.h"
  #include "x86.h"
  #include "kvm_cache_regs.h"
+ #include "smm.h"
  #include "kvm_emulate.h"
  #include "cpuid.h"
  #include "spte.h"
@@@ -802,15 -803,31 +803,31 @@@ static void account_shadowed(struct kv
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
  }
  
- void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
-       if (sp->lpage_disallowed)
+       /*
+        * If it's possible to replace the shadow page with an NX huge page,
+        * i.e. if the shadow page is the only thing currently preventing KVM
+        * from using a huge page, add the shadow page to the list of "to be
+        * zapped for NX recovery" pages.  Note, the shadow page can already be
+        * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+        * links a shadow page at multiple points.
+        */
+       if (!list_empty(&sp->possible_nx_huge_page_link))
                return;
  
        ++kvm->stat.nx_lpage_splits;
-       list_add_tail(&sp->lpage_disallowed_link,
-                     &kvm->arch.lpage_disallowed_mmu_pages);
-       sp->lpage_disallowed = true;
+       list_add_tail(&sp->possible_nx_huge_page_link,
+                     &kvm->arch.possible_nx_huge_pages);
+ }
+ static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+                                bool nx_huge_page_possible)
+ {
+       sp->nx_huge_page_disallowed = true;
+       if (nx_huge_page_possible)
+               track_possible_nx_huge_page(kvm, sp);
  }
  
  static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
        kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
  
- void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
+       if (list_empty(&sp->possible_nx_huge_page_link))
+               return;
        --kvm->stat.nx_lpage_splits;
-       sp->lpage_disallowed = false;
-       list_del(&sp->lpage_disallowed_link);
+       list_del_init(&sp->possible_nx_huge_page_link);
+ }
+ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+ {
+       sp->nx_huge_page_disallowed = false;
+       untrack_possible_nx_huge_page(kvm, sp);
  }
  
  static struct kvm_memory_slot *
@@@ -1645,7 -1671,7 +1671,7 @@@ static int is_empty_shadow_page(u64 *sp
        u64 *pos;
        u64 *end;
  
-       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+       for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
                if (is_shadow_present_pte(*pos)) {
                        printk(KERN_ERR "%s: %p %llx\n", __func__,
                               pos, *pos);
@@@ -1793,7 -1819,7 +1819,7 @@@ static int __mmu_unsync_walk(struct kvm
                        continue;
                }
  
-               child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
+               child = spte_to_child_sp(ent);
  
                if (child->unsync_children) {
                        if (mmu_pages_add(pvec, child, i))
@@@ -1894,7 -1920,7 +1920,7 @@@ static bool is_obsolete_sp(struct kvm *
        if (sp->role.invalid)
                return true;
  
-       /* TDP MMU pages due not use the MMU generation. */
+       /* TDP MMU pages do not use the MMU generation. */
        return !sp->tdp_mmu_page &&
               unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
  }
@@@ -2129,6 -2155,8 +2155,8 @@@ static struct kvm_mmu_page *kvm_mmu_all
  
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
  
+       INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
        /*
         * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
         * depends on valid pages being added to the head of the list.  See
@@@ -2350,7 -2378,7 +2378,7 @@@ static void validate_direct_spte(struc
                 * so we should update the spte at this point to get
                 * a new sp with the correct access.
                 */
-               child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
+               child = spte_to_child_sp(*sptep);
                if (child->role.access == direct_access)
                        return;
  
@@@ -2371,7 -2399,7 +2399,7 @@@ static int mmu_page_zap_pte(struct kvm 
                if (is_last_spte(pte, sp->role.level)) {
                        drop_spte(kvm, spte);
                } else {
-                       child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+                       child = spte_to_child_sp(pte);
                        drop_parent_pte(child, spte);
  
                        /*
@@@ -2443,7 -2471,6 +2471,7 @@@ static bool __kvm_mmu_prepare_zap_page(
  {
        bool list_unstable, zapped_root = false;
  
 +      lockdep_assert_held_write(&kvm->mmu_lock);
        trace_kvm_mmu_prepare_zap_page(sp);
        ++kvm->stat.mmu_shadow_zapped;
        *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
                zapped_root = !is_obsolete_sp(kvm, sp);
        }
  
-       if (sp->lpage_disallowed)
-               unaccount_huge_nx_page(kvm, sp);
+       if (sp->nx_huge_page_disallowed)
+               unaccount_nx_huge_page(kvm, sp);
  
        sp->role.invalid = 1;
  
@@@ -2811,7 -2838,7 +2839,7 @@@ static int mmu_set_spte(struct kvm_vcp
                        struct kvm_mmu_page *child;
                        u64 pte = *sptep;
  
-                       child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+                       child = spte_to_child_sp(pte);
                        drop_parent_pte(child, sptep);
                        flush = true;
                } else if (pfn != spte_to_pfn(*sptep)) {
@@@ -3085,7 -3112,8 +3113,8 @@@ void disallowed_hugepage_adjust(struct 
        if (cur_level > PG_LEVEL_4K &&
            cur_level == fault->goal_level &&
            is_shadow_present_pte(spte) &&
-           !is_large_pte(spte)) {
+           !is_large_pte(spte) &&
+           spte_to_child_sp(spte)->nx_huge_page_disallowed) {
                /*
                 * A small SPTE exists for this pfn, but FNAME(fetch)
                 * and __direct_map would like to create a large PTE
@@@ -3127,9 -3155,9 +3156,9 @@@ static int __direct_map(struct kvm_vcp
                        continue;
  
                link_shadow_page(vcpu, it.sptep, sp);
-               if (fault->is_tdp && fault->huge_page_disallowed &&
-                   fault->req_level >= it.level)
-                       account_huge_nx_page(vcpu->kvm, sp);
+               if (fault->huge_page_disallowed)
+                       account_nx_huge_page(vcpu->kvm, sp,
+                                            fault->req_level >= it.level);
        }
  
        if (WARN_ON_ONCE(it.level != fault->goal_level))
@@@ -3149,8 -3177,13 +3178,13 @@@ static void kvm_send_hwpoison_signal(un
        send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
  }
  
- static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
  {
+       if (is_sigpending_pfn(pfn)) {
+               kvm_handle_signal_exit(vcpu);
+               return -EINTR;
+       }
        /*
         * Do not cache the mmio info caused by writing the readonly gfn
         * into the spte otherwise read access on readonly gfn also can
@@@ -3172,7 -3205,7 +3206,7 @@@ static int handle_abnormal_pfn(struct k
  {
        /* The pfn is invalid, report the error! */
        if (unlikely(is_error_pfn(fault->pfn)))
-               return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
+               return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
  
        if (unlikely(!fault->slot)) {
                gva_t gva = fault->is_tdp ? 0 : fault->addr;
@@@ -3423,7 -3456,11 +3457,11 @@@ static void mmu_free_root_page(struct k
        if (!VALID_PAGE(*root_hpa))
                return;
  
-       sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
+       /*
+        * The "root" may be a special root, e.g. a PAE entry, treat it as a
+        * SPTE to ensure any non-PA bits are dropped.
+        */
+       sp = spte_to_child_sp(*root_hpa);
        if (WARN_ON(!sp))
                return;
  
@@@ -3908,8 -3945,7 +3946,7 @@@ void kvm_mmu_sync_roots(struct kvm_vcp
                hpa_t root = vcpu->arch.mmu->pae_root[i];
  
                if (IS_VALID_PAE_ROOT(root)) {
-                       root &= SPTE_BASE_ADDR_MASK;
-                       sp = to_shadow_page(root);
+                       sp = spte_to_child_sp(root);
                        mmu_sync_children(vcpu, sp, true);
                }
        }
@@@ -4170,7 -4206,7 +4207,7 @@@ static int kvm_faultin_pfn(struct kvm_v
        }
  
        async = false;
-       fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+       fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
                                          fault->write, &fault->map_writable,
                                          &fault->hva);
        if (!async)
                }
        }
  
-       fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+       /*
+        * Allow gup to bail on pending non-fatal signals when it's also allowed
+        * to wait for IO.  Note, gup always bails if it is unable to quickly
+        * get a page and a fatal signal, i.e. SIGKILL, is pending.
+        */
+       fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
                                          fault->write, &fault->map_writable,
                                          &fault->hva);
        return RET_PF_CONTINUE;
@@@ -4263,14 -4304,14 +4305,14 @@@ static int direct_page_fault(struct kvm
        if (is_page_fault_stale(vcpu, fault, mmu_seq))
                goto out_unlock;
  
 -      r = make_mmu_pages_available(vcpu);
 -      if (r)
 -              goto out_unlock;
 -
 -      if (is_tdp_mmu_fault)
 +      if (is_tdp_mmu_fault) {
                r = kvm_tdp_mmu_map(vcpu, fault);
 -      else
 +      } else {
 +              r = make_mmu_pages_available(vcpu);
 +              if (r)
 +                      goto out_unlock;
                r = __direct_map(vcpu, fault);
 +      }
  
  out_unlock:
        if (is_tdp_mmu_fault)
@@@ -5972,7 -6013,7 +6014,7 @@@ int kvm_mmu_init_vm(struct kvm *kvm
  
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
-       INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+       INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
        spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
  
        r = kvm_mmu_init_tdp_mmu(kvm);
@@@ -6057,7 -6098,7 +6099,7 @@@ void kvm_zap_gfn_range(struct kvm *kvm
  
        write_lock(&kvm->mmu_lock);
  
 -      kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
 +      kvm_mmu_invalidate_begin(kvm, 0, -1ul);
  
        flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
  
                kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
                                                   gfn_end - gfn_start);
  
 -      kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
 +      kvm_mmu_invalidate_end(kvm, 0, -1ul);
  
        write_unlock(&kvm->mmu_lock);
  }
@@@ -6657,7 -6698,7 +6699,7 @@@ static int set_nx_huge_pages(const cha
                        kvm_mmu_zap_all_fast(kvm);
                        mutex_unlock(&kvm->slots_lock);
  
-                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+                       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
                }
                mutex_unlock(&kvm_lock);
        }
@@@ -6789,7 -6830,7 +6831,7 @@@ static int set_nx_huge_pages_recovery_p
                mutex_lock(&kvm_lock);
  
                list_for_each_entry(kvm, &vm_list, vm_list)
-                       wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+                       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
  
                mutex_unlock(&kvm_lock);
        }
        return err;
  }
  
- static void kvm_recover_nx_lpages(struct kvm *kvm)
+ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
  {
        unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+       struct kvm_memory_slot *slot;
        int rcu_idx;
        struct kvm_mmu_page *sp;
        unsigned int ratio;
        ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
        to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
        for ( ; to_zap; --to_zap) {
-               if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+               if (list_empty(&kvm->arch.possible_nx_huge_pages))
                        break;
  
                /*
                 * We use a separate list instead of just using active_mmu_pages
-                * because the number of lpage_disallowed pages is expected to
-                * be relatively small compared to the total.
+                * because the number of shadow pages that can be replaced with an
+                * NX huge page is expected to be relatively small compared to
+                * the total number of shadow pages.  And because the TDP MMU
+                * doesn't use active_mmu_pages.
                 */
-               sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+               sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
                                      struct kvm_mmu_page,
-                                     lpage_disallowed_link);
-               WARN_ON_ONCE(!sp->lpage_disallowed);
-               if (is_tdp_mmu_page(sp)) {
+                                     possible_nx_huge_page_link);
+               WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+               WARN_ON_ONCE(!sp->role.direct);
+               /*
+                * Unaccount and do not attempt to recover any NX Huge Pages
+                * that are being dirty tracked, as they would just be faulted
+                * back in as 4KiB pages. The NX Huge Pages in this slot will be
+                * recovered, along with all the other huge pages in the slot,
+                * when dirty logging is disabled.
+                *
+                * Since gfn_to_memslot() is relatively expensive, it helps to
+                * skip it if the test cannot possibly return true.  On the
+                * other hand, if any memslot has logging enabled, chances are
+                * good that all of them do, in which case unaccount_nx_huge_page()
+                * is much cheaper than zapping the page.
+                *
+                * If a memslot update is in progress, reading an incorrect value
+                * of kvm->nr_memslots_dirty_logging is not a problem: if it is
+                * becoming zero, gfn_to_memslot() will be done unnecessarily; if
+                * it is becoming nonzero, the page will be zapped unnecessarily.
+                * Either way, this only affects efficiency in racy situations,
+                * and not correctness.
+                */
+               slot = NULL;
+               if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+                       slot = gfn_to_memslot(kvm, sp->gfn);
+                       WARN_ON_ONCE(!slot);
+               }
+               if (slot && kvm_slot_dirty_track_enabled(slot))
+                       unaccount_nx_huge_page(kvm, sp);
+               else if (is_tdp_mmu_page(sp))
                        flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
-               } else {
+               else
                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-                       WARN_ON_ONCE(sp->lpage_disallowed);
-               }
+               WARN_ON_ONCE(sp->nx_huge_page_disallowed);
  
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                        kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
        srcu_read_unlock(&kvm->srcu, rcu_idx);
  }
  
- static long get_nx_lpage_recovery_timeout(u64 start_time)
+ static long get_nx_huge_page_recovery_timeout(u64 start_time)
  {
        bool enabled;
        uint period;
                       : MAX_SCHEDULE_TIMEOUT;
  }
  
- static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+ static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
  {
        u64 start_time;
        long remaining_time;
  
        while (true) {
                start_time = get_jiffies_64();
-               remaining_time = get_nx_lpage_recovery_timeout(start_time);
+               remaining_time = get_nx_huge_page_recovery_timeout(start_time);
  
                set_current_state(TASK_INTERRUPTIBLE);
                while (!kthread_should_stop() && remaining_time > 0) {
                        schedule_timeout(remaining_time);
-                       remaining_time = get_nx_lpage_recovery_timeout(start_time);
+                       remaining_time = get_nx_huge_page_recovery_timeout(start_time);
                        set_current_state(TASK_INTERRUPTIBLE);
                }
  
                if (kthread_should_stop())
                        return 0;
  
-               kvm_recover_nx_lpages(kvm);
+               kvm_recover_nx_huge_pages(kvm);
        }
  }
  
@@@ -6897,17 -6970,17 +6971,17 @@@ int kvm_mmu_post_init_vm(struct kvm *kv
  {
        int err;
  
-       err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+       err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
                                          "kvm-nx-lpage-recovery",
-                                         &kvm->arch.nx_lpage_recovery_thread);
+                                         &kvm->arch.nx_huge_page_recovery_thread);
        if (!err)
-               kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+               kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
  
        return err;
  }
  
  void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
  {
-       if (kvm->arch.nx_lpage_recovery_thread)
-               kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+       if (kvm->arch.nx_huge_page_recovery_thread)
+               kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
  }
diff --combined arch/x86/kvm/reverse_cpuid.h
index 4e5b8444f161c3b01b30c2b58d9aa695f0000b4e,203fdad07bae6cffe3c5385d787ff2aa87394781..042d0aca3c92b238602d93d0158e043bcd2b4447
@@@ -7,24 -7,41 +7,42 @@@
  #include <asm/cpufeatures.h>
  
  /*
-  * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
-  * be directly used by KVM.  Note, these word values conflict with the kernel's
-  * "bug" caps, but KVM doesn't use those.
+  * Hardware-defined CPUID leafs that are either scattered by the kernel or are
+  * unknown to the kernel, but need to be directly used by KVM.  Note, these
+  * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
   */
  enum kvm_only_cpuid_leafs {
        CPUID_12_EAX     = NCAPINTS,
+       CPUID_7_1_EDX,
        NR_KVM_CPU_CAPS,
  
        NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
  };
  
+ /*
+  * Define a KVM-only feature flag.
+  *
+  * For features that are scattered by cpufeatures.h, __feature_translate() also
+  * needs to be updated to translate the kernel-defined feature into the
+  * KVM-defined feature.
+  *
+  * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+  * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+  * that X86_FEATURE_* can be used in KVM.  No __feature_translate() handling is
+  * needed in this case.
+  */
  #define KVM_X86_FEATURE(w, f)         ((w)*32 + (f))
  
  /* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
  #define KVM_X86_FEATURE_SGX1          KVM_X86_FEATURE(CPUID_12_EAX, 0)
  #define KVM_X86_FEATURE_SGX2          KVM_X86_FEATURE(CPUID_12_EAX, 1)
 +#define KVM_X86_FEATURE_SGX_EDECCSSA  KVM_X86_FEATURE(CPUID_12_EAX, 11)
  
+ /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+ #define X86_FEATURE_AVX_VNNI_INT8       KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+ #define X86_FEATURE_AVX_NE_CONVERT      KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+ #define X86_FEATURE_PREFETCHITI         KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
  struct cpuid_reg {
        u32 function;
        u32 index;
@@@ -49,6 -66,7 +67,7 @@@ static const struct cpuid_reg reverse_c
        [CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
        [CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
        [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+       [CPUID_7_1_EDX]       = {         7, 1, CPUID_EDX},
  };
  
  /*
@@@ -79,8 -97,6 +98,8 @@@ static __always_inline u32 __feature_tr
                return KVM_X86_FEATURE_SGX1;
        else if (x86_feature == X86_FEATURE_SGX2)
                return KVM_X86_FEATURE_SGX2;
 +      else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
 +              return KVM_X86_FEATURE_SGX_EDECCSSA;
  
        return x86_feature;
  }
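Per the comment at the top of this hunk, a feature that exists only in a KVM-defined word is declared directly as X86_FEATURE_* and needs no __feature_translate() entry; a hypothetical example in the same style (the bit below is made up and not a real CPUID feature):

	#define X86_FEATURE_EXAMPLE_ONLY	KVM_X86_FEATURE(CPUID_7_1_EDX, 20)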
diff --combined arch/x86/kvm/svm/svm.c
index ce362e88a5676cf1deac281fed528a010e594291,6ffadbd5774472a3da0abbe25dcd0119ed6580be..9a194aa1a75a498d69f64bfafdca887e6c2e16bf
@@@ -6,6 -6,7 +6,7 @@@
  #include "mmu.h"
  #include "kvm_cache_regs.h"
  #include "x86.h"
+ #include "smm.h"
  #include "cpuid.h"
  #include "pmu.h"
  
@@@ -2704,12 -2705,10 +2705,10 @@@ static int svm_get_msr_feature(struct k
        msr->data = 0;
  
        switch (msr->index) {
 -      case MSR_F10H_DECFG:
 -              if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
 -                      msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
 +      case MSR_AMD64_DE_CFG:
 +              if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
 +                      msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
                break;
-       case MSR_IA32_PERF_CAPABILITIES:
-               return 0;
        default:
                return KVM_MSR_RET_INVALID;
        }
@@@ -2807,7 -2806,7 +2806,7 @@@ static int svm_get_msr(struct kvm_vcpu 
                        msr_info->data = 0x1E;
                }
                break;
 -      case MSR_F10H_DECFG:
 +      case MSR_AMD64_DE_CFG:
                msr_info->data = svm->msr_decfg;
                break;
        default:
@@@ -3036,7 -3035,7 +3035,7 @@@ static int svm_set_msr(struct kvm_vcpu 
        case MSR_VM_IGNNE:
                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
                break;
 -      case MSR_F10H_DECFG: {
 +      case MSR_AMD64_DE_CFG: {
                struct kvm_msr_entry msr_entry;
  
                msr_entry.index = msr->index;
@@@ -3723,6 -3722,13 +3722,13 @@@ static void svm_flush_tlb_current(struc
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
+       /*
+        * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
+        * A TLB flush for the current ASID flushes both "host" and "guest" TLB
+        * entries, and thus is a superset of Hyper-V's fine grained flushing.
+        */
+       kvm_hv_vcpu_purge_flush_tlb(vcpu);
        /*
         * Flush only the current ASID even if the TLB flush was invoked via
         * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
@@@ -3889,8 -3895,14 +3895,14 @@@ static int svm_vcpu_pre_run(struct kvm_
  
  static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
  {
-       if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
-           to_svm(vcpu)->vmcb->control.exit_info_1)
+       struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+       /*
+        * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
+        * can't read guest memory (dereference memslots) to decode the WRMSR.
+        */
+       if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
+           nrips && control->next_rip)
                return handle_fastpath_set_msr_irqoff(vcpu);
  
        return EXIT_FASTPATH_NONE;
@@@ -4102,6 -4114,8 +4114,8 @@@ static bool svm_has_emulated_msr(struc
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                return false;
        case MSR_IA32_SMBASE:
+               if (!IS_ENABLED(CONFIG_KVM_SMM))
+                       return false;
                /* SEV-ES guests do not support SMM, so report false */
                if (kvm && sev_es_guest(kvm))
                        return false;
@@@ -4358,6 -4372,7 +4372,7 @@@ static void svm_setup_mce(struct kvm_vc
        vcpu->arch.mcg_cap &= 0x1ff;
  }
  
+ #ifdef CONFIG_KVM_SMM
  bool svm_smi_blocked(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -4385,7 -4400,7 +4400,7 @@@ static int svm_smi_allowed(struct kvm_v
        return 1;
  }
  
- static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_host_map map_save;
        if (!is_guest_mode(vcpu))
                return 0;
  
-       /* FED8h - SVM Guest */
-       put_smstate(u64, smstate, 0x7ed8, 1);
-       /* FEE0h - SVM Guest VMCB Physical Address */
-       put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+       /*
+        * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
+        * responsible for ensuring nested SVM and SMIs are mutually exclusive.
+        */
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+               return 1;
+       smram->smram64.svm_guest_flag = 1;
+       smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
  
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
         * that, see svm_prepare_switch_to_guest()) which must be
         * preserved.
         */
-       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
-                        &map_save) == -EINVAL)
+       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
                return 1;
  
        BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
        return 0;
  }
  
- static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_host_map map, map_save;
-       u64 saved_efer, vmcb12_gpa;
        struct vmcb *vmcb12;
        int ret;
  
+       const struct kvm_smram_state_64 *smram64 = &smram->smram64;
        if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
                return 0;
  
        /* Non-zero if SMI arrived while vCPU was in guest mode. */
-       if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+       if (!smram64->svm_guest_flag)
                return 0;
  
        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
                return 1;
  
-       saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
-       if (!(saved_efer & EFER_SVME))
+       if (!(smram64->efer & EFER_SVME))
                return 1;
  
-       vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
-       if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+       if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
                return 1;
  
        ret = 1;
-       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
                goto unmap_map;
  
        if (svm_allocate_nested(svm))
        vmcb12 = map.hva;
        nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
        nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
-       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+       ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
  
        if (ret)
                goto unmap_save;
@@@ -4507,6 -4526,7 +4526,7 @@@ static void svm_enable_smi_window(struc
                /* We must be in SMM; RSM will cause a vmexit anyway.  */
        }
  }
+ #endif
  
  static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
                                        void *insn, int insn_len)
@@@ -4782,10 -4802,12 +4802,12 @@@ static struct kvm_x86_ops svm_x86_ops _
        .pi_update_irte = avic_pi_update_irte,
        .setup_mce = svm_setup_mce,
  
+ #ifdef CONFIG_KVM_SMM
        .smi_allowed = svm_smi_allowed,
        .enter_smm = svm_enter_smm,
        .leave_smm = svm_leave_smm,
        .enable_smi_window = svm_enable_smi_window,
+ #endif
  
        .mem_enc_ioctl = sev_mem_enc_ioctl,
        .mem_enc_register_region = sev_mem_enc_register_region,
@@@ -4851,6 -4873,7 +4873,7 @@@ static __init void svm_set_cpu_caps(voi
  {
        kvm_set_cpu_caps();
  
+       kvm_caps.supported_perf_cap = 0;
        kvm_caps.supported_xss = 0;
  
        /* CPUID 0x80000001 and 0x8000000A (SVM features) */
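
For context on the svm.c hunks above: the new union kvm_smram passed to svm_enter_smm()/svm_leave_smm() replaces the raw put_smstate()/GET_SMSTATE() offsets with named fields. The authoritative definition lives in arch/x86/kvm/smm.h, which is not part of the hunks shown here; the following is only a simplified sketch, with the field offsets inferred from the removed 0x7ed0/0x7ed8/0x7ee0 accesses and everything else collapsed into reserved padding.

/* Simplified sketch only -- not the authoritative layout from arch/x86/kvm/smm.h. */
struct kvm_smram_state_64_sketch {
	u8  reserved1[0xd0];		/* 0x7e00-0x7ecf: segment/descriptor-table state, ... */
	u64 efer;			/* 0x7ed0: checked for EFER_SVME on RSM */
	u64 svm_guest_flag;		/* 0x7ed8: non-zero if the SMI arrived in guest mode */
	u64 svm_guest_vmcb_gpa;		/* 0x7ee0: vmcb12 GPA to re-enter on RSM */
	u8  reserved2[0x200 - 0xe8];	/* 0x7ee8-0x7fff: GPRs, CRs, RIP, smbase, ... */
} __packed;

union kvm_smram_sketch {
	struct kvm_smram_state_64_sketch smram64;
	u8 bytes[512];
};

With named fields, svm_enter_smm() writes smram->smram64.svm_guest_flag and .svm_guest_vmcb_gpa directly, and svm_leave_smm() validates smram64->efer before re-entering L2, instead of computing raw SMRAM offsets.
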
diff --combined arch/x86/kvm/x86.c
index 69227f77b201d7d9d89a77bdefc2d44f338e3ba8,fd6c01a39312820e1388fda4a3213b38c5fd1632..312aea1854ae6b8c8b4a64884ffaf9b9605c6e29
@@@ -30,6 -30,7 +30,7 @@@
  #include "hyperv.h"
  #include "lapic.h"
  #include "xen.h"
+ #include "smm.h"
  
  #include <linux/clocksource.h>
  #include <linux/interrupt.h>
@@@ -119,8 -120,6 +120,6 @@@ static u64 __read_mostly cr4_reserved_b
  
  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  static void process_nmi(struct kvm_vcpu *vcpu);
- static void process_smi(struct kvm_vcpu *vcpu);
- static void enter_smm(struct kvm_vcpu *vcpu);
  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
  static void store_regs(struct kvm_vcpu *vcpu);
  static int sync_regs(struct kvm_vcpu *vcpu);
@@@ -464,7 -463,6 +463,6 @@@ u64 kvm_get_apic_base(struct kvm_vcpu *
  {
        return vcpu->arch.apic_base;
  }
- EXPORT_SYMBOL_GPL(kvm_get_apic_base);
  
  enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
  {
@@@ -492,7 -490,6 +490,6 @@@ int kvm_set_apic_base(struct kvm_vcpu *
        kvm_recalculate_apic_map(vcpu->kvm);
        return 0;
  }
- EXPORT_SYMBOL_GPL(kvm_set_apic_base);
  
  /*
   * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
@@@ -783,7 -780,6 +780,6 @@@ void kvm_inject_page_fault(struct kvm_v
                kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
                                        fault->address);
  }
- EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
  
  void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault)
@@@ -812,7 -808,6 +808,6 @@@ void kvm_inject_nmi(struct kvm_vcpu *vc
        atomic_inc(&vcpu->arch.nmi_queued);
        kvm_make_request(KVM_REQ_NMI, vcpu);
  }
- EXPORT_SYMBOL_GPL(kvm_inject_nmi);
  
  void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
  {
@@@ -837,7 -832,6 +832,6 @@@ bool kvm_require_cpl(struct kvm_vcpu *v
        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
        return false;
  }
- EXPORT_SYMBOL_GPL(kvm_require_cpl);
  
  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
  {
@@@ -1563,7 -1557,7 +1557,7 @@@ static const u32 msr_based_features_all
        MSR_IA32_VMX_EPT_VPID_CAP,
        MSR_IA32_VMX_VMFUNC,
  
 -      MSR_F10H_DECFG,
 +      MSR_AMD64_DE_CFG,
        MSR_IA32_UCODE_REV,
        MSR_IA32_ARCH_CAPABILITIES,
        MSR_IA32_PERF_CAPABILITIES,
@@@ -1654,6 -1648,9 +1648,9 @@@ static int kvm_get_msr_feature(struct k
        case MSR_IA32_ARCH_CAPABILITIES:
                msr->data = kvm_get_arch_capabilities();
                break;
+       case MSR_IA32_PERF_CAPABILITIES:
+               msr->data = kvm_caps.supported_perf_cap;
+               break;
        case MSR_IA32_UCODE_REV:
                rdmsrl_safe(msr->index, &msr->data);
                break;
@@@ -2067,7 -2064,6 +2064,6 @@@ int kvm_emulate_as_nop(struct kvm_vcpu 
  {
        return kvm_skip_emulated_instruction(vcpu);
  }
- EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
  
  int kvm_emulate_invd(struct kvm_vcpu *vcpu)
  {
@@@ -2315,13 -2311,11 +2311,11 @@@ static void kvm_write_system_time(struc
        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
  
        /* we verify if the enable bit is set... */
-       if (system_time & 1) {
-               kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
-                                KVM_HOST_USES_PFN, system_time & ~1ULL,
+       if (system_time & 1)
+               kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
                                 sizeof(struct pvclock_vcpu_time_info));
-       } else {
-               kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
-       }
+       else
+               kvm_gpc_deactivate(&vcpu->arch.pv_time);
  
        return;
  }
@@@ -2513,7 -2507,6 +2507,6 @@@ u64 kvm_scale_tsc(u64 tsc, u64 ratio
  
        return _tsc;
  }
- EXPORT_SYMBOL_GPL(kvm_scale_tsc);
  
  static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
  {
@@@ -2972,6 -2965,22 +2965,22 @@@ static void kvm_update_masterclock(stru
        kvm_end_pvclock_update(kvm);
  }
  
+ /*
+  * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
+  * per-CPU value (which may be zero if a CPU is going offline).  Note, tsc_khz
+  * can change during boot even if the TSC is constant, as it's possible for KVM
+  * to be loaded before TSC calibration completes.  Ideally, KVM would get a
+  * notification when calibration completes, but practically speaking calibration
+  * will complete before userspace is alive enough to create VMs.
+  */
+ static unsigned long get_cpu_tsc_khz(void)
+ {
+       if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+               return tsc_khz;
+       else
+               return __this_cpu_read(cpu_tsc_khz);
+ }
  /* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
  static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
  {
        get_cpu();
  
        data->flags = 0;
-       if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+       if (ka->use_master_clock &&
+           (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
  #ifdef CONFIG_X86_64
                struct timespec64 ts;
  
                data->flags |= KVM_CLOCK_TSC_STABLE;
                hv_clock.tsc_timestamp = ka->master_cycle_now;
                hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+               kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
                                   &hv_clock.tsc_shift,
                                   &hv_clock.tsc_to_system_mul);
                data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
@@@ -3035,12 -3045,10 +3045,10 @@@ static void kvm_setup_guest_pvclock(str
        unsigned long flags;
  
        read_lock_irqsave(&gpc->lock, flags);
-       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
-                                          offset + sizeof(*guest_hv_clock))) {
+       while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
                read_unlock_irqrestore(&gpc->lock, flags);
  
-               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
-                                                offset + sizeof(*guest_hv_clock)))
+               if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
                        return;
  
                read_lock_irqsave(&gpc->lock, flags);
@@@ -3106,7 -3114,7 +3114,7 @@@ static int kvm_guest_time_update(struc
  
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+       tgt_tsc_khz = get_cpu_tsc_khz();
        if (unlikely(tgt_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@@ -3389,7 -3397,7 +3397,7 @@@ static int kvm_pv_enable_async_pf_int(s
  
  static void kvmclock_reset(struct kvm_vcpu *vcpu)
  {
-       kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+       kvm_gpc_deactivate(&vcpu->arch.pv_time);
        vcpu->arch.time = 0;
  }
  
@@@ -3397,6 -3405,9 +3405,9 @@@ static void kvm_vcpu_flush_tlb_all(stru
  {
        ++vcpu->stat.tlb_flush;
        static_call(kvm_x86_flush_tlb_all)(vcpu);
+       /* Flushing all ASIDs flushes the current ASID... */
+       kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
  }
  
  static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
        }
  
        static_call(kvm_x86_flush_tlb_guest)(vcpu);
+       /*
+        * Flushing all "guest" TLB is always a superset of Hyper-V's fine
+        * grained flushing.
+        */
+       kvm_hv_vcpu_purge_flush_tlb(vcpu);
  }
  
  
@@@ -3566,20 -3583,15 +3583,15 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                        return 1;
                vcpu->arch.arch_capabilities = data;
                break;
-       case MSR_IA32_PERF_CAPABILITIES: {
-               struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+       case MSR_IA32_PERF_CAPABILITIES:
                if (!msr_info->host_initiated)
                        return 1;
-               if (kvm_get_msr_feature(&msr_ent))
-                       return 1;
-               if (data & ~msr_ent.data)
+               if (data & ~kvm_caps.supported_perf_cap)
                        return 1;
  
                vcpu->arch.perf_capabilities = data;
                kvm_pmu_refresh(vcpu);
                return 0;
-       }
        case MSR_EFER:
                return set_efer(vcpu, msr_info);
        case MSR_K7_HWCR:
                break;
        }
        case MSR_IA32_SMBASE:
-               if (!msr_info->host_initiated)
+               if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
                        return 1;
                vcpu->arch.smbase = data;
                break;
@@@ -4067,7 -4079,7 +4079,7 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                msr_info->data = vcpu->arch.ia32_misc_enable_msr;
                break;
        case MSR_IA32_SMBASE:
-               if (!msr_info->host_initiated)
+               if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
                        return 1;
                msr_info->data = vcpu->arch.smbase;
                break;
@@@ -4425,7 -4437,8 +4437,8 @@@ int kvm_vm_ioctl_check_extension(struc
                    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
                    KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
                if (sched_info_on())
-                       r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
+                       r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
+                            KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
                break;
  #endif
        case KVM_CAP_SYNC_REGS:
                        r |= KVM_X86_DISABLE_EXITS_MWAIT;
                break;
        case KVM_CAP_X86_SMM:
+               if (!IS_ENABLED(CONFIG_KVM_SMM))
+                       break;
                /* SMBASE is usually relocated above 1M on modern chipsets,
                 * and SMM handlers might indeed rely on 4G segment limits,
                 * so do not report SMM to be available if real mode is
                        kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
                break;
        case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
-               r = kvm_x86_ops.enable_direct_tlbflush != NULL;
+               r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
                break;
        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
                r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
@@@ -4897,13 -4913,6 +4913,6 @@@ static int kvm_vcpu_ioctl_nmi(struct kv
        return 0;
  }
  
- static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
- {
-       kvm_make_request(KVM_REQ_SMI, vcpu);
-       return 0;
- }
  static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
                                           struct kvm_tpr_access_ctl *tac)
  {
@@@ -5039,8 -5048,10 +5048,10 @@@ static void kvm_vcpu_ioctl_x86_get_vcpu
  
        process_nmi(vcpu);
  
+ #ifdef CONFIG_KVM_SMM
        if (kvm_check_request(KVM_REQ_SMI, vcpu))
                process_smi(vcpu);
+ #endif
  
        /*
         * KVM's ABI only allows for one exception to be migrated.  Luckily,
            ex->pending && ex->has_payload)
                kvm_deliver_exception_payload(vcpu, ex);
  
+       memset(events, 0, sizeof(*events));
        /*
         * The API doesn't provide the instruction length for software
         * exceptions, so don't report them. As long as the guest RIP
         * isn't advanced, we should expect to encounter the exception
         * again.
         */
-       if (kvm_exception_is_soft(ex->vector)) {
-               events->exception.injected = 0;
-               events->exception.pending = 0;
-       } else {
+       if (!kvm_exception_is_soft(ex->vector)) {
                events->exception.injected = ex->injected;
                events->exception.pending = ex->pending;
                /*
        events->interrupt.injected =
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
        events->interrupt.nr = vcpu->arch.interrupt.nr;
-       events->interrupt.soft = 0;
        events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
  
        events->nmi.injected = vcpu->arch.nmi_injected;
        events->nmi.pending = vcpu->arch.nmi_pending != 0;
        events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
-       events->nmi.pad = 0;
  
-       events->sipi_vector = 0; /* never valid when reporting to user space */
+       /* events->sipi_vector is never valid when reporting to user space */
  
+ #ifdef CONFIG_KVM_SMM
        events->smi.smm = is_smm(vcpu);
        events->smi.pending = vcpu->arch.smi_pending;
        events->smi.smm_inside_nmi =
                !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+ #endif
        events->smi.latched_init = kvm_lapic_latched_init(vcpu);
  
        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
                events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
        }
-       memset(&events->reserved, 0, sizeof(events->reserved));
  }
  
- static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
  {
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ #ifdef CONFIG_KVM_SMM
                if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
                        kvm_leave_nested(vcpu);
                        kvm_smm_changed(vcpu, events->smi.smm);
                                vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
                }
  
+ #else
+               if (events->smi.smm || events->smi.pending ||
+                   events->smi.smm_inside_nmi)
+                       return -EINVAL;
+ #endif
                if (lapic_in_kernel(vcpu)) {
                        if (events->smi.latched_init)
                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
@@@ -5497,10 -5510,10 +5510,10 @@@ static int kvm_vcpu_ioctl_enable_cap(st
                }
                return r;
        case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
-               if (!kvm_x86_ops.enable_direct_tlbflush)
+               if (!kvm_x86_ops.enable_l2_tlb_flush)
                        return -ENOTTY;
  
-               return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+               return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
  
        case KVM_CAP_HYPERV_ENFORCE_CPUID:
                return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@@ -5580,7 -5593,7 +5593,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
                break;
        }
        case KVM_SMI: {
-               r = kvm_vcpu_ioctl_smi(vcpu);
+               r = kvm_inject_smi(vcpu);
                break;
        }
        case KVM_SET_CPUID: {
@@@ -6239,9 -6252,7 +6252,7 @@@ split_irqchip_unlock
                break;
        case KVM_CAP_X86_USER_SPACE_MSR:
                r = -EINVAL;
-               if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
-                                    KVM_MSR_EXIT_REASON_UNKNOWN |
-                                    KVM_MSR_EXIT_REASON_FILTER))
+               if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
                        break;
                kvm->arch.user_space_msr_mask = cap->args[0];
                r = 0;
@@@ -6418,7 -6429,7 +6429,7 @@@ static int kvm_add_msr_filter(struct kv
        if (!user_range->nmsrs)
                return 0;
  
-       if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+       if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
                return -EINVAL;
  
        if (!user_range->flags)
@@@ -6452,7 -6463,7 +6463,7 @@@ static int kvm_vm_ioctl_set_msr_filter(
        int r = 0;
        u32 i;
  
-       if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
+       if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
                return -EINVAL;
  
        for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
@@@ -7125,8 -7136,8 +7136,8 @@@ static int vcpu_mmio_read(struct kvm_vc
        return handled;
  }
  
- static void kvm_set_segment(struct kvm_vcpu *vcpu,
-                       struct kvm_segment *var, int seg)
+ void kvm_set_segment(struct kvm_vcpu *vcpu,
+                    struct kvm_segment *var, int seg)
  {
        static_call(kvm_x86_set_segment)(vcpu, var, seg);
  }
@@@ -7162,16 -7173,6 +7173,6 @@@ gpa_t kvm_mmu_gva_to_gpa_read(struct kv
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
  
-  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
-                               struct x86_exception *exception)
- {
-       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
-       u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
-       access |= PFERR_FETCH_MASK;
-       return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
- }
  gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
                               struct x86_exception *exception)
  {
@@@ -7284,15 -7285,6 +7285,6 @@@ static int emulator_read_std(struct x86
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
  }
  
- static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
-               unsigned long addr, void *val, unsigned int bytes)
- {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
-       return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
- }
  static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
                                      struct kvm_vcpu *vcpu, u64 access,
                                      struct x86_exception *exception)
@@@ -8084,26 -8076,6 +8076,6 @@@ static int emulator_get_msr(struct x86_
        return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
  }
  
- static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
-                           u32 msr_index, u64 data)
- {
-       return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
- }
- static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
- {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       return vcpu->arch.smbase;
- }
- static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
- {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       vcpu->arch.smbase = smbase;
- }
  static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
                              u32 pmc)
  {
@@@ -8178,18 -8150,13 +8150,13 @@@ static unsigned emulator_get_hflags(str
        return emul_to_vcpu(ctxt)->arch.hflags;
  }
  
- static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
+ #ifndef CONFIG_KVM_SMM
+ static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
  {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       kvm_smm_changed(vcpu, false);
- }
- static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
-                                 const char *smstate)
- {
-       return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
+       WARN_ON_ONCE(1);
+       return X86EMUL_UNHANDLEABLE;
  }
+ #endif
  
  static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
  {
@@@ -8215,7 -8182,6 +8182,6 @@@ static const struct x86_emulate_ops emu
        .write_gpr           = emulator_write_gpr,
        .read_std            = emulator_read_std,
        .write_std           = emulator_write_std,
-       .read_phys           = kvm_read_guest_phys_system,
        .fetch               = kvm_fetch_guest_virt,
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
        .cpl                 = emulator_get_cpl,
        .get_dr              = emulator_get_dr,
        .set_dr              = emulator_set_dr,
-       .get_smbase          = emulator_get_smbase,
-       .set_smbase          = emulator_set_smbase,
        .set_msr_with_filter = emulator_set_msr_with_filter,
        .get_msr_with_filter = emulator_get_msr_with_filter,
-       .set_msr             = emulator_set_msr,
        .get_msr             = emulator_get_msr,
        .check_pmc           = emulator_check_pmc,
        .read_pmc            = emulator_read_pmc,
        .guest_has_rdpid     = emulator_guest_has_rdpid,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
-       .exiting_smm         = emulator_exiting_smm,
        .leave_smm           = emulator_leave_smm,
        .triple_fault        = emulator_triple_fault,
        .set_xcr             = emulator_set_xcr,
@@@ -8327,8 -8289,6 +8289,6 @@@ static void init_emulate_ctxt(struct kv
                     cs_db                              ? X86EMUL_MODE_PROT32 :
                                                          X86EMUL_MODE_PROT16;
        BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
-       BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
-       BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
  
        ctxt->interruptibility = 0;
        ctxt->have_exception = false;
@@@ -8587,29 -8547,6 +8547,6 @@@ static bool retry_instruction(struct x8
  static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
  static int complete_emulated_pio(struct kvm_vcpu *vcpu);
  
- static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
- {
-       trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
-       if (entering_smm) {
-               vcpu->arch.hflags |= HF_SMM_MASK;
-       } else {
-               vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
-               /* Process a latched INIT or SMI, if any.  */
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
-               /*
-                * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
-                * on SMM exit we still need to reload them from
-                * guest memory
-                */
-               vcpu->arch.pdptrs_from_userspace = false;
-       }
-       kvm_mmu_reset_context(vcpu);
- }
  static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
                                unsigned long *db)
  {
@@@ -8841,7 -8778,9 +8778,9 @@@ int x86_emulate_instruction(struct kvm_
                                                  write_fault_to_spt,
                                                  emulation_type))
                                return 1;
-                       if (ctxt->have_exception) {
+                       if (ctxt->have_exception &&
+                           !(emulation_type & EMULTYPE_SKIP)) {
                                /*
                                 * #UD should result in just EMULATION_FAILED, and trap-like
                                 * exception should not be encountered during decode.
@@@ -9105,9 -9044,11 +9044,11 @@@ static void tsc_khz_changed(void *data
        struct cpufreq_freqs *freq = data;
        unsigned long khz = 0;
  
+       WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
        if (data)
                khz = freq->new;
-       else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+       else
                khz = cpufreq_quick_get(raw_smp_processor_id());
        if (!khz)
                khz = tsc_khz;
@@@ -9128,8 -9069,10 +9069,10 @@@ static void kvm_hyperv_tsc_notifier(voi
        hyperv_stop_tsc_emulation();
  
        /* TSC frequency always matches when on Hyper-V */
-       for_each_present_cpu(cpu)
-               per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+               for_each_present_cpu(cpu)
+                       per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+       }
        kvm_caps.max_guest_tsc_khz = tsc_khz;
  
        list_for_each_entry(kvm, &vm_list, vm_list) {
@@@ -9266,10 -9209,10 +9209,10 @@@ static void kvm_timer_init(void
                }
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
-       }
  
-       cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
-                         kvmclock_cpu_online, kvmclock_cpu_down_prep);
+               cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+                                 kvmclock_cpu_online, kvmclock_cpu_down_prep);
+       }
  }
  
  #ifdef CONFIG_X86_64
@@@ -9429,10 -9372,11 +9372,11 @@@ void kvm_arch_exit(void
  #endif
        kvm_lapic_exit();
  
-       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                            CPUFREQ_TRANSITION_NOTIFIER);
-       cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+               cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+       }
  #ifdef CONFIG_X86_64
        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
        irq_work_sync(&pvclock_irq_work);
@@@ -9999,6 -9943,7 +9943,7 @@@ static int kvm_check_and_inject_events(
         * in order to make progress and get back here for another iteration.
         * The kvm_x86_ops hooks communicate this by returning -EBUSY.
         */
+ #ifdef CONFIG_KVM_SMM
        if (vcpu->arch.smi_pending) {
                r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
                } else
                        static_call(kvm_x86_enable_smi_window)(vcpu);
        }
+ #endif
  
        if (vcpu->arch.nmi_pending) {
                r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
@@@ -10086,246 -10032,6 +10032,6 @@@ static void process_nmi(struct kvm_vcp
        kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
  
- static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
- {
-       u32 flags = 0;
-       flags |= seg->g       << 23;
-       flags |= seg->db      << 22;
-       flags |= seg->l       << 21;
-       flags |= seg->avl     << 20;
-       flags |= seg->present << 15;
-       flags |= seg->dpl     << 13;
-       flags |= seg->s       << 12;
-       flags |= seg->type    << 8;
-       return flags;
- }
- static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
- {
-       struct kvm_segment seg;
-       int offset;
-       kvm_get_segment(vcpu, &seg, n);
-       put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
-       if (n < 3)
-               offset = 0x7f84 + n * 12;
-       else
-               offset = 0x7f2c + (n - 3) * 12;
-       put_smstate(u32, buf, offset + 8, seg.base);
-       put_smstate(u32, buf, offset + 4, seg.limit);
-       put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
- }
- #ifdef CONFIG_X86_64
- static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
- {
-       struct kvm_segment seg;
-       int offset;
-       u16 flags;
-       kvm_get_segment(vcpu, &seg, n);
-       offset = 0x7e00 + n * 16;
-       flags = enter_smm_get_segment_flags(&seg) >> 8;
-       put_smstate(u16, buf, offset, seg.selector);
-       put_smstate(u16, buf, offset + 2, flags);
-       put_smstate(u32, buf, offset + 4, seg.limit);
-       put_smstate(u64, buf, offset + 8, seg.base);
- }
- #endif
- static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
- {
-       struct desc_ptr dt;
-       struct kvm_segment seg;
-       unsigned long val;
-       int i;
-       put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
-       put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
-       put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
-       put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
-       for (i = 0; i < 8; i++)
-               put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
-       kvm_get_dr(vcpu, 6, &val);
-       put_smstate(u32, buf, 0x7fcc, (u32)val);
-       kvm_get_dr(vcpu, 7, &val);
-       put_smstate(u32, buf, 0x7fc8, (u32)val);
-       kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
-       put_smstate(u32, buf, 0x7fc4, seg.selector);
-       put_smstate(u32, buf, 0x7f64, seg.base);
-       put_smstate(u32, buf, 0x7f60, seg.limit);
-       put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
-       kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
-       put_smstate(u32, buf, 0x7fc0, seg.selector);
-       put_smstate(u32, buf, 0x7f80, seg.base);
-       put_smstate(u32, buf, 0x7f7c, seg.limit);
-       put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
-       static_call(kvm_x86_get_gdt)(vcpu, &dt);
-       put_smstate(u32, buf, 0x7f74, dt.address);
-       put_smstate(u32, buf, 0x7f70, dt.size);
-       static_call(kvm_x86_get_idt)(vcpu, &dt);
-       put_smstate(u32, buf, 0x7f58, dt.address);
-       put_smstate(u32, buf, 0x7f54, dt.size);
-       for (i = 0; i < 6; i++)
-               enter_smm_save_seg_32(vcpu, buf, i);
-       put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
-       /* revision id */
-       put_smstate(u32, buf, 0x7efc, 0x00020000);
-       put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
- }
- #ifdef CONFIG_X86_64
- static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
- {
-       struct desc_ptr dt;
-       struct kvm_segment seg;
-       unsigned long val;
-       int i;
-       for (i = 0; i < 16; i++)
-               put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
-       put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
-       put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
-       kvm_get_dr(vcpu, 6, &val);
-       put_smstate(u64, buf, 0x7f68, val);
-       kvm_get_dr(vcpu, 7, &val);
-       put_smstate(u64, buf, 0x7f60, val);
-       put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
-       put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
-       put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
-       put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
-       /* revision id */
-       put_smstate(u32, buf, 0x7efc, 0x00020064);
-       put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
-       kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
-       put_smstate(u16, buf, 0x7e90, seg.selector);
-       put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
-       put_smstate(u32, buf, 0x7e94, seg.limit);
-       put_smstate(u64, buf, 0x7e98, seg.base);
-       static_call(kvm_x86_get_idt)(vcpu, &dt);
-       put_smstate(u32, buf, 0x7e84, dt.size);
-       put_smstate(u64, buf, 0x7e88, dt.address);
-       kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
-       put_smstate(u16, buf, 0x7e70, seg.selector);
-       put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
-       put_smstate(u32, buf, 0x7e74, seg.limit);
-       put_smstate(u64, buf, 0x7e78, seg.base);
-       static_call(kvm_x86_get_gdt)(vcpu, &dt);
-       put_smstate(u32, buf, 0x7e64, dt.size);
-       put_smstate(u64, buf, 0x7e68, dt.address);
-       for (i = 0; i < 6; i++)
-               enter_smm_save_seg_64(vcpu, buf, i);
- }
- #endif
- static void enter_smm(struct kvm_vcpu *vcpu)
- {
-       struct kvm_segment cs, ds;
-       struct desc_ptr dt;
-       unsigned long cr0;
-       char buf[512];
-       memset(buf, 0, 512);
- #ifdef CONFIG_X86_64
-       if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-               enter_smm_save_state_64(vcpu, buf);
-       else
- #endif
-               enter_smm_save_state_32(vcpu, buf);
-       /*
-        * Give enter_smm() a chance to make ISA-specific changes to the vCPU
-        * state (e.g. leave guest mode) after we've saved the state into the
-        * SMM state-save area.
-        */
-       static_call(kvm_x86_enter_smm)(vcpu, buf);
-       kvm_smm_changed(vcpu, true);
-       kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
-       if (static_call(kvm_x86_get_nmi_mask)(vcpu))
-               vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
-       else
-               static_call(kvm_x86_set_nmi_mask)(vcpu, true);
-       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
-       kvm_rip_write(vcpu, 0x8000);
-       cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
-       static_call(kvm_x86_set_cr0)(vcpu, cr0);
-       vcpu->arch.cr0 = cr0;
-       static_call(kvm_x86_set_cr4)(vcpu, 0);
-       /* Undocumented: IDT limit is set to zero on entry to SMM.  */
-       dt.address = dt.size = 0;
-       static_call(kvm_x86_set_idt)(vcpu, &dt);
-       kvm_set_dr(vcpu, 7, DR7_FIXED_1);
-       cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
-       cs.base = vcpu->arch.smbase;
-       ds.selector = 0;
-       ds.base = 0;
-       cs.limit    = ds.limit = 0xffffffff;
-       cs.type     = ds.type = 0x3;
-       cs.dpl      = ds.dpl = 0;
-       cs.db       = ds.db = 0;
-       cs.s        = ds.s = 1;
-       cs.l        = ds.l = 0;
-       cs.g        = ds.g = 1;
-       cs.avl      = ds.avl = 0;
-       cs.present  = ds.present = 1;
-       cs.unusable = ds.unusable = 0;
-       cs.padding  = ds.padding = 0;
-       kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
-       kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
-       kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
-       kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
-       kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
-       kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
- #ifdef CONFIG_X86_64
-       if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-               static_call(kvm_x86_set_efer)(vcpu, 0);
- #endif
-       kvm_update_cpuid_runtime(vcpu);
-       kvm_mmu_reset_context(vcpu);
- }
- static void process_smi(struct kvm_vcpu *vcpu)
- {
-       vcpu->arch.smi_pending = true;
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
  void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
                                       unsigned long *vcpu_bitmap)
  {
@@@ -10516,20 -10222,17 +10222,17 @@@ static int vcpu_enter_guest(struct kvm_
  
        bool req_immediate_exit = false;
  
-       /* Forbid vmenter if vcpu dirty ring is soft-full */
-       if (unlikely(vcpu->kvm->dirty_ring_size &&
-                    kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
-               vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
-               trace_kvm_dirty_ring_exit(vcpu);
-               r = 0;
-               goto out;
-       }
        if (kvm_request_pending(vcpu)) {
                if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
                        r = -EIO;
                        goto out;
                }
+               if (kvm_dirty_ring_check_request(vcpu)) {
+                       r = 0;
+                       goto out;
+               }
                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
                        if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
                        kvm_mmu_sync_roots(vcpu);
                if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
                        kvm_mmu_load_pgd(vcpu);
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+               /*
+                * Note, the order matters here, as flushing "all" TLB entries
+                * also flushes the "current" TLB entries, i.e. servicing the
+                * flush "all" will clear any request to flush "current".
+                */
+               if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                        kvm_vcpu_flush_tlb_all(vcpu);
  
-                       /* Flushing all ASIDs flushes the current ASID... */
-                       kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-               }
                kvm_service_local_tlb_flush_requests(vcpu);
  
+               /*
+                * Fall back to a "full" guest flush if Hyper-V's precise
+                * flushing fails.  Note, Hyper-V's flushing is per-vCPU, but
+                * the flushes are considered "remote" and not "local" because
+                * the requests can be initiated from other vCPUs.
+                */
+               if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
+                   kvm_hv_vcpu_flush_tlb(vcpu))
+                       kvm_vcpu_flush_tlb_guest(vcpu);
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
                        r = 0;
                }
                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
                        record_steal_time(vcpu);
+ #ifdef CONFIG_KVM_SMM
                if (kvm_check_request(KVM_REQ_SMI, vcpu))
                        process_smi(vcpu);
+ #endif
                if (kvm_check_request(KVM_REQ_NMI, vcpu))
                        process_nmi(vcpu);
                if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@@ -11834,7 -11552,7 +11552,7 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
  
-       kvm_gpc_init(&vcpu->arch.pv_time);
+       kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
  
        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
  
        kvm_async_pf_hash_reset(vcpu);
+       vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
        kvm_pmu_init(vcpu);
  
        vcpu->arch.pending_external_vector = -1;
@@@ -12334,7 -12054,6 +12054,6 @@@ bool kvm_vcpu_is_reset_bsp(struct kvm_v
  {
        return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
  }
- EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
  
  bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
  {
@@@ -12909,10 -12628,12 +12628,12 @@@ static inline bool kvm_vcpu_has_events(
             static_call(kvm_x86_nmi_allowed)(vcpu, false)))
                return true;
  
+ #ifdef CONFIG_KVM_SMM
        if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
            (vcpu->arch.smi_pending &&
             static_call(kvm_x86_smi_allowed)(vcpu, false)))
                return true;
+ #endif
  
        if (kvm_arch_interrupt_allowed(vcpu) &&
            (kvm_cpu_has_interrupt(vcpu) ||
@@@ -12953,7 -12674,9 +12674,9 @@@ bool kvm_arch_dy_runnable(struct kvm_vc
                return true;
  
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ #ifdef CONFIG_KVM_SMM
                kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ #endif
                 kvm_test_request(KVM_REQ_EVENT, vcpu))
                return true;
  
diff --combined drivers/s390/crypto/vfio_ap_ops.c
index 68eeb25fb6611c38cec022a0c2b73b9d310a1756,205a001058589aedda94f6acde9b3a86b8674cbf..9720aed2ac27978b1b5795379b9b33ea319c433b
@@@ -429,7 -429,7 +429,7 @@@ static struct ap_queue_status vfio_ap_i
  
        aqic_gisa.isc = nisc;
        aqic_gisa.ir = 1;
-       aqic_gisa.gisa = (uint64_t)gisa >> 4;
+       aqic_gisa.gisa = virt_to_phys(gisa) >> 4;
  
        status = ap_aqic(q->apqn, aqic_gisa, h_nib);
        switch (status.response_code) {
@@@ -1535,29 -1535,13 +1535,29 @@@ static int vfio_ap_mdev_set_kvm(struct 
        return 0;
  }
  
 +static void unmap_iova(struct ap_matrix_mdev *matrix_mdev, u64 iova, u64 length)
 +{
 +      struct ap_queue_table *qtable = &matrix_mdev->qtable;
 +      struct vfio_ap_queue *q;
 +      int loop_cursor;
 +
 +      hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) {
 +              if (q->saved_iova >= iova && q->saved_iova < iova + length)
 +                      vfio_ap_irq_disable(q);
 +      }
 +}
 +
  static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova,
                                   u64 length)
  {
        struct ap_matrix_mdev *matrix_mdev =
                container_of(vdev, struct ap_matrix_mdev, vdev);
  
 -      vfio_unpin_pages(&matrix_mdev->vdev, iova, 1);
 +      mutex_lock(&matrix_dev->mdevs_lock);
 +
 +      unmap_iova(matrix_mdev, iova, length);
 +
 +      mutex_unlock(&matrix_dev->mdevs_lock);
  }
  
  /**
@@@ -1805,9 -1789,6 +1805,9 @@@ static const struct vfio_device_ops vfi
        .close_device = vfio_ap_mdev_close_device,
        .ioctl = vfio_ap_mdev_ioctl,
        .dma_unmap = vfio_ap_mdev_dma_unmap,
 +      .bind_iommufd = vfio_iommufd_emulated_bind,
 +      .unbind_iommufd = vfio_iommufd_emulated_unbind,
 +      .attach_ioas = vfio_iommufd_emulated_attach_ioas,
  };
  
  static struct mdev_driver vfio_ap_matrix_driver = {
diff --combined include/asm-generic/hyperv-tlfs.h
index b17c6eeb9afa2e3ada6021ee6453913814383b51,020ca9bdbb79a6d873d74485cfb5caae44bd5a05..e29ccabf2e09de80381373a717d90c2675624d80
@@@ -102,15 -102,6 +102,15 @@@ struct ms_hyperv_tsc_page 
        volatile s64 tsc_offset;
  } __packed;
  
 +union hv_reference_tsc_msr {
 +      u64 as_uint64;
 +      struct {
 +              u64 enable:1;
 +              u64 reserved:11;
 +              u64 pfn:52;
 +      } __packed;
 +};
 +
  /*
   * The guest OS needs to register the guest ID with the hypervisor.
   * The guest ID is a 64 bit entity and the structure of this ID is
@@@ -408,6 -399,11 +408,11 @@@ struct hv_vpset 
        u64 bank_contents[];
  } __packed;
  
+ /* The maximum number of sparse vCPU banks which can be encoded by 'struct hv_vpset' */
+ #define HV_MAX_SPARSE_VCPU_BANKS (64)
+ /* The number of vCPUs in one sparse bank */
+ #define HV_VCPUS_PER_SPARSE_BANK (64)
  /* HvCallSendSyntheticClusterIpi hypercall */
  struct hv_send_ipi {
        u32 vector;
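
The hv_reference_tsc_msr union introduced above gives the Hyper-V reference-TSC MSR a typed layout: bit 0 enables the TSC page and bits 63:12 carry its guest page frame number. A purely illustrative example of composing the MSR value follows; the function name and the 0x12345000 address are made up for the example.

/* Illustrative only: enable a reference TSC page living at GPA 0x12345000. */
static u64 hv_ref_tsc_msr_example(void)
{
	union hv_reference_tsc_msr tsc_msr = { .as_uint64 = 0 };

	tsc_msr.enable = 1;
	tsc_msr.pfn    = 0x12345000ULL >> 12;	/* bits 63:12 hold the PFN */

	return tsc_msr.as_uint64;		/* == 0x0000000012345001 */
}
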
diff --combined include/linux/kvm_host.h
index 915142abdf76100ac62fd1de7ee66a995dfa662e,f16c4689322b29b4957f4fb19f3333dda3b9aa2c..4f26b244f6d09c4baf14c151af5c407ea6cce4d7
@@@ -50,8 -50,8 +50,8 @@@
  #endif
  
  /*
-  * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
-  * in kvm, other bits are visible for userspace which are defined in
+  * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally
+  * used in kvm, other bits are visible for userspace which are defined in
   * include/uapi/linux/kvm.h.
   */
  #define KVM_MEMSLOT_INVALID   (1UL << 16)
@@@ -96,6 -96,7 +96,7 @@@
  #define KVM_PFN_ERR_FAULT     (KVM_PFN_ERR_MASK)
  #define KVM_PFN_ERR_HWPOISON  (KVM_PFN_ERR_MASK + 1)
  #define KVM_PFN_ERR_RO_FAULT  (KVM_PFN_ERR_MASK + 2)
+ #define KVM_PFN_ERR_SIGPENDING        (KVM_PFN_ERR_MASK + 3)
  
  /*
   * error pfns indicate that the gfn is in slot but failed to
@@@ -106,6 -107,15 +107,15 @@@ static inline bool is_error_pfn(kvm_pfn
        return !!(pfn & KVM_PFN_ERR_MASK);
  }
  
+ /*
+  * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted
+  * by a pending signal.  Note, the signal may or may not be fatal.
+  */
+ static inline bool is_sigpending_pfn(kvm_pfn_t pfn)
+ {
+       return pfn == KVM_PFN_ERR_SIGPENDING;
+ }
  /*
   * error_noslot pfns indicate that the gfn can not be
   * translated to pfn - it is not in slot or failed to
@@@ -153,10 -163,11 +163,11 @@@ static inline bool is_error_page(struc
   * Architecture-independent vcpu->requests bit members
   * Bits 3-7 are reserved for more arch-independent bits.
   */
- #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
- #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
- #define KVM_REQ_UNBLOCK           2
- #define KVM_REQUEST_ARCH_BASE     8
+ #define KVM_REQ_TLB_FLUSH             (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+ #define KVM_REQ_VM_DEAD                       (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+ #define KVM_REQ_UNBLOCK                       2
+ #define KVM_REQ_DIRTY_RING_SOFT_FULL  3
+ #define KVM_REQUEST_ARCH_BASE         8
  
  /*
   * KVM_REQ_OUTSIDE_GUEST_MODE exists purely as a way to force the vCPU to
@@@ -416,7 -427,7 +427,7 @@@ static __always_inline void guest_conte
         */
        if (!context_tracking_guest_enter()) {
                instrumentation_begin();
 -              rcu_virt_note_context_switch(smp_processor_id());
 +              rcu_virt_note_context_switch();
                instrumentation_end();
        }
  }
@@@ -655,6 -666,8 +666,8 @@@ struct kvm_irq_routing_table 
  };
  #endif
  
+ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
  #ifndef KVM_INTERNAL_MEM_SLOTS
  #define KVM_INTERNAL_MEM_SLOTS 0
  #endif
@@@ -710,6 -723,11 +723,11 @@@ struct kvm 
        /* The current active memslot set for each address space */
        struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
        struct xarray vcpu_array;
+       /*
+        * Protected by slots_lock, but can be read outside if an
+        * incorrect answer is acceptable.
+        */
+       atomic_t nr_memslots_dirty_logging;
  
        /* Used to wait for completion of MMU notifiers.  */
        spinlock_t mn_invalidate_lock;
        struct srcu_struct srcu;
        struct srcu_struct irq_srcu;
        pid_t userspace_pid;
 +      bool override_halt_poll_ns;
        unsigned int max_halt_poll_ns;
        u32 dirty_ring_size;
+       bool dirty_ring_with_bitmap;
        bool vm_bugged;
        bool vm_dead;
  
@@@ -1141,8 -1159,8 +1160,8 @@@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *k
  kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
  kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn);
  kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-                              bool atomic, bool *async, bool write_fault,
-                              bool *writable, hva_t *hva);
+                              bool atomic, bool interruptible, bool *async,
+                              bool write_fault, bool *writable, hva_t *hva);
  
  void kvm_release_pfn_clean(kvm_pfn_t pfn);
  void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@@ -1244,18 -1262,7 +1263,7 @@@ void kvm_vcpu_mark_page_dirty(struct kv
   * kvm_gpc_init - initialize gfn_to_pfn_cache.
   *
   * @gpc:         struct gfn_to_pfn_cache object.
-  *
-  * This sets up a gfn_to_pfn_cache by initializing locks.  Note, the cache must
-  * be zero-allocated (or zeroed by the caller before init).
-  */
- void kvm_gpc_init(struct gfn_to_pfn_cache *gpc);
- /**
-  * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
-  *                    physical address.
-  *
   * @kvm:         pointer to kvm instance.
-  * @gpc:         struct gfn_to_pfn_cache object.
   * @vcpu:        vCPU to be used for marking pages dirty and to be woken on
   *               invalidation.
   * @usage:       indicates if the resulting host physical PFN is used while
   *               changes!---will also force @vcpu to exit the guest and
   *               refresh the cache); and/or if the PFN used directly
   *               by KVM (and thus needs a kernel virtual mapping).
+  *
+  * This sets up a gfn_to_pfn_cache by initializing locks and assigning the
+  * immutable attributes.  Note, the cache must be zero-allocated (or zeroed by
+  * the caller before init).
+  */
+ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+                 struct kvm_vcpu *vcpu, enum pfn_cache_usage usage);
+ /**
+  * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
+  *                    physical address.
+  *
+  * @gpc:         struct gfn_to_pfn_cache object.
   * @gpa:         guest physical address to map.
   * @len:         sanity check; the range being accessed must fit a single page.
   *
   * @return:      0 for success.
   *               -EINVAL for a mapping which would cross a page boundary.
-  *                 -EFAULT for an untranslatable guest physical address.
+  *               -EFAULT for an untranslatable guest physical address.
   *
-  * This primes a gfn_to_pfn_cache and links it into the @kvm's list for
-  * invalidations to be processed.  Callers are required to use
-  * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before
-  * accessing the target page.
+  * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for
+  * invalidations to be processed.  Callers are required to use kvm_gpc_check()
+  * to ensure that the cache is valid before accessing the target page.
   */
- int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
-                    struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
-                    gpa_t gpa, unsigned long len);
+ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len);
  
  /**
-  * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache.
+  * kvm_gpc_check - check validity of a gfn_to_pfn_cache.
   *
-  * @kvm:         pointer to kvm instance.
   * @gpc:         struct gfn_to_pfn_cache object.
-  * @gpa:         current guest physical address to map.
   * @len:         sanity check; the range being accessed must fit a single page.
   *
   * @return:      %true if the cache is still valid and the address matches.
   * Callers in IN_GUEST_MODE may do so without locking, although they should
   * still hold a read lock on kvm->srcu for the memslot checks.
   */
- bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
-                               gpa_t gpa, unsigned long len);
+ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len);
  
  /**
-  * kvm_gfn_to_pfn_cache_refresh - update a previously initialized cache.
+  * kvm_gpc_refresh - update a previously initialized cache.
   *
-  * @kvm:         pointer to kvm instance.
   * @gpc:         struct gfn_to_pfn_cache object.
-  * @gpa:         updated guest physical address to map.
   * @len:         sanity check; the range being accessed must fit a single page.
   *
   * @return:      0 for success.
   *               -EINVAL for a mapping which would cross a page boundary.
-  *                 -EFAULT for an untranslatable guest physical address.
+  *               -EFAULT for an untranslatable guest physical address.
   *
   * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful
-  * returm from this function does not mean the page can be immediately
+  * return from this function does not mean the page can be immediately
   * accessed because it may have raced with an invalidation. Callers must
   * still lock and check the cache status, as this function does not return
   * with the lock still held to permit access.
   */
- int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
-                                gpa_t gpa, unsigned long len);
- /**
-  * kvm_gfn_to_pfn_cache_unmap - temporarily unmap a gfn_to_pfn_cache.
-  *
-  * @kvm:         pointer to kvm instance.
-  * @gpc:         struct gfn_to_pfn_cache object.
-  *
-  * This unmaps the referenced page. The cache is left in the invalid state
-  * but at least the mapping from GPA to userspace HVA will remain cached
-  * and can be reused on a subsequent refresh.
-  */
- void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len);
  
  /**
   * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
   *
-  * @kvm:         pointer to kvm instance.
   * @gpc:         struct gfn_to_pfn_cache object.
   *
-  * This removes a cache from the @kvm's list to be processed on MMU notifier
+  * This removes a cache from the VM's list to be processed on MMU notifier
   * invocation.
   */
- void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);
  
  void kvm_sigset_activate(struct kvm_vcpu *vcpu);
  void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
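
Pulling the reworked gfn_to_pfn_cache kernel-doc above together: the kvm/vcpu/usage tuple now belongs to kvm_gpc_init(), the GPA and length are given once to kvm_gpc_activate(), and kvm_gpc_check()/kvm_gpc_refresh()/kvm_gpc_deactivate() take only the cache itself. A hedged usage sketch modeled on the converted callers earlier in this diff (kvm_write_system_time() and kvm_setup_guest_pvclock()); the function and variable names are illustrative only.

/* Sketch only; names are illustrative. */
static void gpc_usage_sketch(struct kvm *kvm, struct kvm_vcpu *vcpu,
			     gpa_t gpa, unsigned long len)
{
	struct gfn_to_pfn_cache gpc = {};	/* must be zeroed before init */
	unsigned long flags;

	kvm_gpc_init(&gpc, kvm, vcpu, KVM_HOST_USES_PFN);

	if (kvm_gpc_activate(&gpc, gpa, len))
		return;				/* -EINVAL or -EFAULT */

	read_lock_irqsave(&gpc.lock, flags);
	while (!kvm_gpc_check(&gpc, len)) {
		read_unlock_irqrestore(&gpc.lock, flags);

		if (kvm_gpc_refresh(&gpc, len))
			goto out;		/* e.g. the mapping went away */

		read_lock_irqsave(&gpc.lock, flags);
	}

	/* gpc.khva may be dereferenced here, under gpc.lock. */

	read_unlock_irqrestore(&gpc.lock, flags);
out:
	kvm_gpc_deactivate(&gpc);
}
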
diff --combined include/linux/mm.h
index 8178fe894e2e1da4a9f6f0bb8e7f2995f5730f18,3c84f4e48cd77fd8a415189ac4218e016f173abd..f3f196e4d66d6f42c74366731890a5d4102bf75b
@@@ -74,7 -74,6 +74,7 @@@ static inline void totalram_pages_add(l
  
  extern void * high_memory;
  extern int page_cluster;
 +extern const int page_cluster_max;
  
  #ifdef CONFIG_SYSCTL
  extern int sysctl_legacy_va_layout;
@@@ -550,7 -549,7 +550,7 @@@ struct vm_operations_struct 
        /*
         * Called by mprotect() to make driver-specific permission
         * checks before mprotect() is finalised.   The VMA must not
 -       * be modified.  Returns 0 if eprotect() can proceed.
 +       * be modified.  Returns 0 if mprotect() can proceed.
         */
        int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, unsigned long newflags);
@@@ -700,10 -699,8 +700,10 @@@ static inline unsigned long vma_iter_ad
   * paths in userfault.
   */
  bool vma_is_shmem(struct vm_area_struct *vma);
 +bool vma_is_anon_shmem(struct vm_area_struct *vma);
  #else
  static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
 +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
  #endif
  
  int vma_is_stack_for_current(struct vm_area_struct *vma);
@@@ -820,8 -817,8 +820,8 @@@ static inline int is_vmalloc_or_module_
  /*
   * How many times the entire folio is mapped as a single unit (eg by a
   * PMD or PUD entry).  This is probably not what you want, except for
 - * debugging purposes; look at folio_mapcount() or page_mapcount()
 - * instead.
 + * debugging purposes - it does not include PTE-mapped sub-pages; look
 + * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
   */
  static inline int folio_entire_mapcount(struct folio *folio)
  {
  
  /*
   * Mapcount of compound page as a whole, does not include mapped sub-pages.
 - *
 - * Must be called only for compound pages.
 + * Must be called only on head of compound page.
   */
 -static inline int compound_mapcount(struct page *page)
 +static inline int head_compound_mapcount(struct page *head)
  {
 -      return folio_entire_mapcount(page_folio(page));
 +      return atomic_read(compound_mapcount_ptr(head)) + 1;
 +}
 +
 +/*
 + * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages,
 + * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit
 + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
 + * leaves subpages_mapcount at 0, but avoid surprise if it participates later.
 + */
 +#define COMPOUND_MAPPED       0x800000
 +#define SUBPAGES_MAPPED       (COMPOUND_MAPPED - 1)
 +
 +/*
 + * Number of sub-pages mapped by PTE, does not include compound mapcount.
 + * Must be called only on head of compound page.
 + */
 +static inline int head_subpages_mapcount(struct page *head)
 +{
 +      return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED;
  }
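
To make the arithmetic in the comment above concrete: 16GB / 4kB = 0x400000 possible PTE mappings, so the 0x800000 bit sits strictly above any subpages count and SUBPAGES_MAPPED masks it back out. A stand-alone check (illustrative user-space C, not kernel code):

#include <assert.h>
#include <stdio.h>

#define COMPOUND_MAPPED	0x800000
#define SUBPAGES_MAPPED	(COMPOUND_MAPPED - 1)

int main(void)
{
	unsigned long sub_pages = (16UL << 30) / (4UL << 10);	/* 16GB / 4kB */

	assert(sub_pages == 0x400000);
	assert(sub_pages < COMPOUND_MAPPED);	/* the bit sits above the range */
	/* Setting the bit leaves the per-PTE part of the count intact. */
	assert(((sub_pages | COMPOUND_MAPPED) & SUBPAGES_MAPPED) == sub_pages);

	printf("0x%lx sub-pages, COMPOUND_MAPPED=%#x\n", sub_pages, COMPOUND_MAPPED);
	return 0;
}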
  
  /*
@@@ -866,9 -846,11 +866,9 @@@ static inline void page_mapcount_reset(
        atomic_set(&(page)->_mapcount, -1);
  }
  
 -int __page_mapcount(struct page *page);
 -
  /*
   * Mapcount of 0-order page; when compound sub-page, includes
 - * compound_mapcount().
 + * compound_mapcount of compound_head of page.
   *
   * Result is undefined for pages which cannot be mapped into userspace.
   * For example SLAB or special types of pages. See function page_has_type().
   */
  static inline int page_mapcount(struct page *page)
  {
 -      if (unlikely(PageCompound(page)))
 -              return __page_mapcount(page);
 -      return atomic_read(&page->_mapcount) + 1;
 +      int mapcount = atomic_read(&page->_mapcount) + 1;
 +
 +      if (likely(!PageCompound(page)))
 +              return mapcount;
 +      page = compound_head(page);
 +      return head_compound_mapcount(page) + mapcount;
  }
  
 -int folio_mapcount(struct folio *folio);
 +int total_compound_mapcount(struct page *head);
  
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 -static inline int total_mapcount(struct page *page)
 +/**
 + * folio_mapcount() - Calculate the number of mappings of this folio.
 + * @folio: The folio.
 + *
 + * A large folio tracks both how many times the entire folio is mapped,
 + * and how many times each individual page in the folio is mapped.
 + * This function calculates the total number of times the folio is
 + * mapped.
 + *
 + * Return: The number of times this folio is mapped.
 + */
 +static inline int folio_mapcount(struct folio *folio)
  {
 -      return folio_mapcount(page_folio(page));
 +      if (likely(!folio_test_large(folio)))
 +              return atomic_read(&folio->_mapcount) + 1;
 +      return total_compound_mapcount(&folio->page);
  }
  
 -#else
  static inline int total_mapcount(struct page *page)
  {
 -      return page_mapcount(page);
 +      if (likely(!PageCompound(page)))
 +              return atomic_read(&page->_mapcount) + 1;
 +      return total_compound_mapcount(compound_head(page));
 +}
 +
 +static inline bool folio_large_is_mapped(struct folio *folio)
 +{
 +      /*
 +       * Reading folio_mapcount_ptr() below could be omitted if hugetlb
 +       * participated in incrementing subpages_mapcount when compound mapped.
 +       */
 +      return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 ||
 +              atomic_read(folio_mapcount_ptr(folio)) >= 0;
 +}
 +
 +/**
 + * folio_mapped - Is this folio mapped into userspace?
 + * @folio: The folio.
 + *
 + * Return: True if any page in this folio is referenced by user page tables.
 + */
 +static inline bool folio_mapped(struct folio *folio)
 +{
 +      if (likely(!folio_test_large(folio)))
 +              return atomic_read(&folio->_mapcount) >= 0;
 +      return folio_large_is_mapped(folio);
 +}
 +
 +/*
 + * Return true if this page is mapped into pagetables.
 + * For compound page it returns true if any sub-page of compound page is mapped,
 + * even if this particular sub-page is not itself mapped by any PTE or PMD.
 + */
 +static inline bool page_mapped(struct page *page)
 +{
 +      if (likely(!PageCompound(page)))
 +              return atomic_read(&page->_mapcount) >= 0;
 +      return folio_large_is_mapped(page_folio(page));
  }
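
A hypothetical debug helper, only to illustrate how the views defined above relate for a sub-page of a PTE-mapped compound page: page_mapcount() counts this sub-page plus the entire-folio mappings, while total_mapcount() and folio_mapcount() count every mapping of the whole compound page.

static void report_mapcounts(struct page *page)
{
	struct folio *folio = page_folio(page);

	/* For an order-0 page all three counts are the same value. */
	pr_info("page_mapcount=%d total_mapcount=%d folio_mapcount=%d mapped=%d\n",
		page_mapcount(page), total_mapcount(page),
		folio_mapcount(folio), page_mapped(page));
}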
 -#endif
  
  static inline struct page *virt_to_head_page(const void *x)
  {
@@@ -997,13 -929,6 +997,13 @@@ static inline void set_compound_page_dt
        page[1].compound_dtor = compound_dtor;
  }
  
 +static inline void folio_set_compound_dtor(struct folio *folio,
 +              enum compound_dtor_id compound_dtor)
 +{
 +      VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
 +      folio->_folio_dtor = compound_dtor;
 +}
 +
  void destroy_large_folio(struct folio *folio);
  
  static inline int head_compound_pincount(struct page *head)
@@@ -1019,22 -944,6 +1019,22 @@@ static inline void set_compound_order(s
  #endif
  }
  
 +/*
 + * folio_set_compound_order is generally passed a non-zero order to
 + * initialize a large folio.  However, hugetlb code abuses this by
 + * passing in zero when 'dissolving' a large folio.
 + */
 +static inline void folio_set_compound_order(struct folio *folio,
 +              unsigned int order)
 +{
 +      VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
 +
 +      folio->_folio_order = order;
 +#ifdef CONFIG_64BIT
 +      folio->_folio_nr_pages = order ? 1U << order : 0;
 +#endif
 +}
 +
  /* Returns the number of pages in this potentially compound page. */
  static inline unsigned long compound_nr(struct page *page)
  {
@@@ -1220,7 -1129,7 +1220,7 @@@ static inline void get_page(struct pag
        folio_get(page_folio(page));
  }
  
 -bool __must_check try_grab_page(struct page *page, unsigned int flags);
 +int __must_check try_grab_page(struct page *page, unsigned int flags);
  
  static inline __must_check bool try_get_page(struct page *page)
  {
@@@ -1270,24 -1179,7 +1270,24 @@@ static inline void folio_put_refs(struc
                __folio_put(folio);
  }
  
 -void release_pages(struct page **pages, int nr);
 +/**
 + * release_pages - release an array of pages or folios
 + *
 + * This just releases a simple array of multiple pages, and
 + * accepts various different forms of said page array: either
 + * a regular old boring array of pages, an array of folios, or
 + * an array of encoded page pointers.
 + *
 + * The transparent union syntax for this kind of "any of these
 + * argument types" is all kinds of ugly, so look away.
 + */
 +typedef union {
 +      struct page **pages;
 +      struct folio **folios;
 +      struct encoded_page **encoded_pages;
 +} release_pages_arg __attribute__ ((__transparent_union__));
 +
 +void release_pages(release_pages_arg, int nr);
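
Because release_pages_arg is a transparent union, existing callers keep working and folio-based callers no longer need the cast that folios_put() used to carry. An illustrative (hypothetical) caller:

static void drop_two_batches(struct page **pages, struct folio **folios, int nr)
{
	release_pages(pages, nr);	/* binds to the .pages member  */
	release_pages(folios, nr);	/* binds to the .folios member */
}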
  
  /**
   * folios_put - Decrement the reference count on an array of folios.
   */
  static inline void folios_put(struct folio **folios, unsigned int nr)
  {
 -      release_pages((struct page **)folios, nr);
 +      release_pages(folios, nr);
  }
  
  static inline void put_page(struct page *page)
@@@ -1907,6 -1799,9 +1907,6 @@@ static inline pgoff_t page_index(struc
        return page->index;
  }
  
 -bool page_mapped(struct page *page);
 -bool folio_mapped(struct folio *folio);
 -
  /*
   * Return true only if the page has been allocated with
   * ALLOC_NO_WATERMARKS and the low watermark was not
@@@ -1957,25 -1852,6 +1957,25 @@@ static void __maybe_unused show_free_ar
        __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
  }
  
 +/*
 + * Parameter block passed down to zap_pte_range in exceptional cases.
 + */
 +struct zap_details {
 +      struct folio *single_folio;     /* Locked folio to be unmapped */
 +      bool even_cows;                 /* Zap COWed private pages too? */
 +      zap_flags_t zap_flags;          /* Extra flags for zapping */
 +};
 +
 +/*
 + * Whether to drop the pte markers, for example, the uffd-wp information for
 + * file-backed memory.  This should only be specified when we will completely
 + * drop the page in the mm, either by truncation or unmapping of the vma.  By
 + * default, the flag is not set.
 + */
 +#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
 +/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
 +#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
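
A hedged sketch of the kind of caller these definitions (together with zap_page_range_single(), declared a little further down) exist for: unmapping a single locked folio while also dropping any pte markers such as uffd-wp state. The helper name is illustrative only:

static void zap_one_folio(struct vm_area_struct *vma, unsigned long addr,
			  struct folio *folio)
{
	struct zap_details details = {
		.single_folio	= folio,
		.even_cows	= true,
		.zap_flags	= ZAP_FLAG_DROP_MARKER,
	};

	zap_page_range_single(vma, addr, folio_size(folio), &details);
}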
 +
  #ifdef CONFIG_MMU
  extern bool can_do_mlock(void);
  #else
@@@ -1993,8 -1869,6 +1993,8 @@@ void zap_vma_ptes(struct vm_area_struc
                  unsigned long size);
  void zap_page_range(struct vm_area_struct *vma, unsigned long address,
                    unsigned long size);
 +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 +                         unsigned long size, struct zap_details *details);
  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                struct vm_area_struct *start_vma, unsigned long start,
                unsigned long end);
@@@ -2130,22 -2004,6 +2130,22 @@@ extern unsigned long move_page_tables(s
  #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)
  
 +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
 +{
 +      /*
 +       * We want to check manually if we can change individual PTEs writable
 +       * if we can't do that automatically for all PTEs in a mapping. For
 +       * private mappings, that's always the case when we have write
 +       * permissions as we properly have to handle COW.
 +       */
 +      if (vma->vm_flags & VM_SHARED)
 +              return vma_wants_writenotify(vma, vma->vm_page_prot);
 +      return !!(vma->vm_flags & VM_WRITE);
 +
 +}
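
Combined with can_change_pte_writable(), declared just below, this enables the two-level check used when deciding whether a PTE can be mapped writable right away. A minimal sketch of the pattern (helper name illustrative):

static bool may_upgrade_pte(struct vm_area_struct *vma,
			    unsigned long addr, pte_t pte)
{
	/* Cheap per-VMA test first, then the per-PTE test. */
	return !pte_write(pte) &&
	       vma_wants_manual_pte_write_upgrade(vma) &&
	       can_change_pte_writable(vma, addr, pte);
}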
 +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 +                           pte_t pte);
  extern unsigned long change_protection(struct mmu_gather *tlb,
                              struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, pgprot_t newprot,
@@@ -2172,30 -2030,40 +2172,30 @@@ static inline bool get_user_page_fast_o
   */
  static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
  {
 -      long val = atomic_long_read(&mm->rss_stat.count[member]);
 -
 -#ifdef SPLIT_RSS_COUNTING
 -      /*
 -       * counter is updated in asynchronous manner and may go to minus.
 -       * But it's never be expected number for users.
 -       */
 -      if (val < 0)
 -              val = 0;
 -#endif
 -      return (unsigned long)val;
 +      return percpu_counter_read_positive(&mm->rss_stat[member]);
  }
  
 -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
 +void mm_trace_rss_stat(struct mm_struct *mm, int member);
  
  static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
  {
 -      long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
 +      percpu_counter_add(&mm->rss_stat[member], value);
  
 -      mm_trace_rss_stat(mm, member, count);
 +      mm_trace_rss_stat(mm, member);
  }
  
  static inline void inc_mm_counter(struct mm_struct *mm, int member)
  {
 -      long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
 +      percpu_counter_inc(&mm->rss_stat[member]);
  
 -      mm_trace_rss_stat(mm, member, count);
 +      mm_trace_rss_stat(mm, member);
  }
  
  static inline void dec_mm_counter(struct mm_struct *mm, int member)
  {
 -      long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
 +      percpu_counter_dec(&mm->rss_stat[member]);
  
 -      mm_trace_rss_stat(mm, member, count);
 +      mm_trace_rss_stat(mm, member);
  }
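
With rss_stat now an array of per-CPU counters, a reader simply sums the per-type values; this mirrors what get_mm_rss() does with the helpers above (sketch only):

static unsigned long anon_file_shmem_rss(struct mm_struct *mm)
{
	return get_mm_counter(mm, MM_FILEPAGES) +
	       get_mm_counter(mm, MM_ANONPAGES) +
	       get_mm_counter(mm, MM_SHMEMPAGES);
}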
  
  /* Optimized variant when page is already known not to be PageAnon */
@@@ -2285,6 -2153,8 +2285,6 @@@ static inline int pte_devmap(pte_t pte
  }
  #endif
  
 -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 -
  extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
  static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@@ -2533,7 -2403,7 +2533,7 @@@ static inline void pgtable_pte_page_dto
  
  #if USE_SPLIT_PMD_PTLOCKS
  
 -static struct page *pmd_to_page(pmd_t *pmd)
 +static inline struct page *pmd_pgtable_page(pmd_t *pmd)
  {
        unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
        return virt_to_page((void *)((unsigned long) pmd & mask));
  
  static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
  {
 -      return ptlock_ptr(pmd_to_page(pmd));
 +      return ptlock_ptr(pmd_pgtable_page(pmd));
  }
  
  static inline bool pmd_ptlock_init(struct page *page)
@@@ -2560,7 -2430,7 +2560,7 @@@ static inline void pmd_ptlock_free(stru
        ptlock_free(page);
  }
  
 -#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)
 +#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
  
  #else
  
@@@ -3080,6 -2950,7 +3080,6 @@@ struct page *follow_page(struct vm_area
                                 * and return without waiting upon it */
  #define FOLL_NOFAULT  0x80    /* do not fault in pages */
  #define FOLL_HWPOISON 0x100   /* check page is hwpoisoned */
 -#define FOLL_MIGRATION        0x400   /* wait for page to replace migration entry */
  #define FOLL_TRIED    0x800   /* a retry, previous pass started an IO */
  #define FOLL_REMOTE   0x2000  /* we are working on non-current tsk/mm */
  #define FOLL_ANON     0x8000  /* don't do file mappings */
  #define FOLL_SPLIT_PMD        0x20000 /* split huge pmd before returning */
  #define FOLL_PIN      0x40000 /* pages must be released via unpin_user_page */
  #define FOLL_FAST_ONLY        0x80000 /* gup_fast: prevent fall-back to slow gup */
 -#define FOLL_INTERRUPTIBLE  0x100000 /* allow interrupts from generic signals */
 +#define FOLL_PCI_P2PDMA       0x100000 /* allow returning PCI P2PDMA pages */
++#define FOLL_INTERRUPTIBLE  0x200000 /* allow interrupts from generic signals */
  
  /*
   * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
@@@ -3172,12 -3043,8 +3173,12 @@@ static inline int vm_fault_to_errno(vm_
   * Must be called with the (sub)page that's actually referenced via the
   * page table entry, which might not necessarily be the head page for a
   * PTE-mapped THP.
 + *
 + * If the vma is NULL, we're coming from the GUP-fast path and might have
 + * to fallback to the slow path just to lookup the vma.
   */
 -static inline bool gup_must_unshare(unsigned int flags, struct page *page)
 +static inline bool gup_must_unshare(struct vm_area_struct *vma,
 +                                  unsigned int flags, struct page *page)
  {
        /*
         * FOLL_WRITE is implicitly handled correctly as the page table entry
         * Note: PageAnon(page) is stable until the page is actually getting
         * freed.
         */
 -      if (!PageAnon(page))
 -              return false;
 +      if (!PageAnon(page)) {
 +              /*
 +               * We only care about R/O long-term pinning: R/O short-term
 +               * pinning does not have the semantics to observe successive
 +               * changes through the process page tables.
 +               */
 +              if (!(flags & FOLL_LONGTERM))
 +                      return false;
 +
 +              /* We really need the vma ... */
 +              if (!vma)
 +                      return true;
 +
 +              /*
 +               * ... because we only care about writable private ("COW")
 +               * mappings where we have to break COW early.
 +               */
 +              return is_cow_mapping(vma->vm_flags);
 +      }
  
        /* Paired with a memory barrier in page_try_share_anon_rmap(). */
        if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
@@@ -3384,8 -3234,6 +3385,8 @@@ void *sparse_buffer_alloc(unsigned lon
  struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                struct dev_pagemap *pgmap);
 +void pmd_init(void *addr);
 +void pud_init(void *addr);
  pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
  p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
  pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
@@@ -3397,14 -3245,8 +3398,14 @@@ struct vmem_altmap
  void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
  void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
 +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
 +                   unsigned long addr, unsigned long next);
 +int vmemmap_check_pmd(pmd_t *pmd, int node,
 +                    unsigned long addr, unsigned long next);
  int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
 +int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
 +                             int node, struct vmem_altmap *altmap);
  int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
  void vmemmap_populate_print_last(void);
@@@ -3427,6 -3269,7 +3428,6 @@@ enum mf_flags 
  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
                      unsigned long count, int mf_flags);
  extern int memory_failure(unsigned long pfn, int flags);
 -extern void memory_failure_queue(unsigned long pfn, int flags);
  extern void memory_failure_queue_kick(int cpu);
  extern int unpoison_memory(unsigned long pfn);
  extern int sysctl_memory_failure_early_kill;
@@@ -3435,42 -3278,12 +3436,42 @@@ extern void shake_page(struct page *p)
  extern atomic_long_t num_poisoned_pages __read_mostly;
  extern int soft_offline_page(unsigned long pfn, int flags);
  #ifdef CONFIG_MEMORY_FAILURE
 -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
 +extern void memory_failure_queue(unsigned long pfn, int flags);
 +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 +                                      bool *migratable_cleared);
 +void num_poisoned_pages_inc(unsigned long pfn);
 +void num_poisoned_pages_sub(unsigned long pfn, long i);
  #else
 -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 +static inline void memory_failure_queue(unsigned long pfn, int flags)
 +{
 +}
 +
 +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 +                                      bool *migratable_cleared)
  {
        return 0;
  }
 +
 +static inline void num_poisoned_pages_inc(unsigned long pfn)
 +{
 +}
 +
 +static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 +{
 +}
 +#endif
 +
 +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 +extern void memblk_nr_poison_inc(unsigned long pfn);
 +extern void memblk_nr_poison_sub(unsigned long pfn, long i);
 +#else
 +static inline void memblk_nr_poison_inc(unsigned long pfn)
 +{
 +}
 +
 +static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
 +{
 +}
  #endif
  
  #ifndef arch_memory_failure
@@@ -3655,4 -3468,12 +3656,4 @@@ madvise_set_anon_name(struct mm_struct 
  }
  #endif
  
 -/*
 - * Whether to drop the pte markers, for example, the uffd-wp information for
 - * file-backed memory.  This should only be specified when we will completely
 - * drop the page in the mm, either by truncation or unmapping of the vma.  By
 - * default, the flag is not set.
 - */
 -#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
 -
  #endif /* _LINUX_MM_H */
diff --combined include/linux/page-flags.h
index 9aec9fd8c50bd4dc2056618d7a2218e4b5e229ce,c50ce2812f1775bca841ab05b228e47c02caf3ae..69e93a0c1277191f96e9c065c2ae64ee5aab109c
@@@ -132,8 -132,9 +132,9 @@@ enum pageflags 
        PG_young,
        PG_idle,
  #endif
- #ifdef CONFIG_64BIT
+ #ifdef CONFIG_ARCH_USES_PG_ARCH_X
        PG_arch_2,
+       PG_arch_3,
  #endif
  #ifdef CONFIG_KASAN_HW_TAGS
        PG_skip_kasan_poison,
        /* SLOB */
        PG_slob_free = PG_private,
  
 -      /* Compound pages. Stored in first tail page's flags */
 -      PG_double_map = PG_workingset,
 -
  #ifdef CONFIG_MEMORY_FAILURE
        /*
         * Compound pages. Stored in first tail page's flags.
@@@ -638,7 -642,7 +639,7 @@@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemm
   * Different with flags above, this flag is used only for fsdax mode.  It
   * indicates that this page->mapping is now under reflink case.
   */
 -#define PAGE_MAPPING_DAX_COW  0x1
 +#define PAGE_MAPPING_DAX_SHARED       ((void *)0x1)
  
  static __always_inline bool folio_mapping_flags(struct folio *folio)
  {
@@@ -871,11 -875,29 +872,11 @@@ static inline int PageTransTail(struct 
  {
        return PageTail(page);
  }
 -
 -/*
 - * PageDoubleMap indicates that the compound page is mapped with PTEs as well
 - * as PMDs.
 - *
 - * This is required for optimization of rmap operations for THP: we can postpone
 - * per small page mapcount accounting (and its overhead from atomic operations)
 - * until the first PMD split.
 - *
 - * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
 - * by one. This reference will go away with last compound_mapcount.
 - *
 - * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
 - */
 -PAGEFLAG(DoubleMap, double_map, PF_SECOND)
 -      TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
  #else
  TESTPAGEFLAG_FALSE(TransHuge, transhuge)
  TESTPAGEFLAG_FALSE(TransCompound, transcompound)
  TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
  TESTPAGEFLAG_FALSE(TransTail, transtail)
 -PAGEFLAG_FALSE(DoubleMap, double_map)
 -      TESTSCFLAG_FALSE(DoubleMap, double_map)
  #endif
  
  #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
diff --combined mm/Kconfig
index 34d36958b8ac981cd4b8ea5c3cdc415bd67d2d9a,807bd7192f51915e597fc8bb9a9a34c3b6d64839..3425708f274cd32bf82f1d6cd88e545e2c792787
@@@ -219,43 -219,17 +219,43 @@@ config SLU
           and has enhanced diagnostics. SLUB is the default choice for
           a slab allocator.
  
 -config SLOB
 +config SLOB_DEPRECATED
        depends on EXPERT
 -      bool "SLOB (Simple Allocator)"
 +      bool "SLOB (Simple Allocator - DEPRECATED)"
        depends on !PREEMPT_RT
        help
 +         Deprecated and scheduled for removal in a few cycles. SLUB
 +         recommended as replacement. CONFIG_SLUB_TINY can be considered
 +         on systems with 16MB or less RAM.
 +
 +         If you need SLOB to stay, please contact [email protected] and
 +         people listed in the SLAB ALLOCATOR section of MAINTAINERS file,
 +         with your use case.
 +
           SLOB replaces the stock allocator with a drastically simpler
           allocator. SLOB is generally more space efficient but
           does not perform as well on large systems.
  
  endchoice
  
 +config SLOB
 +      bool
 +      default y
 +      depends on SLOB_DEPRECATED
 +
 +config SLUB_TINY
 +      bool "Configure SLUB for minimal memory footprint"
 +      depends on SLUB && EXPERT
 +      select SLAB_MERGE_DEFAULT
 +      help
 +         Configures the SLUB allocator in a way to achieve minimal memory
 +         footprint, sacrificing scalability, debugging and other features.
 +         This is intended only for the smallest system that had used the
 +         SLOB allocator and is not recommended for systems with more than
 +         16MB RAM.
 +
 +         If unsure, say N.
 +
  config SLAB_MERGE_DEFAULT
        bool "Allow slab caches to be merged"
        default y
  
  config SLAB_FREELIST_RANDOM
        bool "Randomize slab freelist"
 -      depends on SLAB || SLUB
 +      depends on SLAB || (SLUB && !SLUB_TINY)
        help
          Randomizes the freelist order used on creating new pages. This
          security feature reduces the predictability of the kernel slab
  
  config SLAB_FREELIST_HARDENED
        bool "Harden slab freelist metadata"
 -      depends on SLAB || SLUB
 +      depends on SLAB || (SLUB && !SLUB_TINY)
        help
          Many kernel heap attacks try to target slab cache metadata and
          other infrastructure. This options makes minor performance
  config SLUB_STATS
        default n
        bool "Enable SLUB performance statistics"
 -      depends on SLUB && SYSFS
 +      depends on SLUB && SYSFS && !SLUB_TINY
        help
          SLUB statistics are useful to debug SLUB's allocation behavior in
          order to find ways to optimize the allocator. This should never be
  
  config SLUB_CPU_PARTIAL
        default y
 -      depends on SLUB && SMP
 +      depends on SLUB && SMP && !SLUB_TINY
        bool "SLUB per cpu partial cache"
        help
          Per cpu partial caches accelerate objects allocation and freeing
@@@ -801,7 -775,7 +801,7 @@@ endchoic
  
  config THP_SWAP
        def_bool y
 -      depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
 +      depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
        help
          Swap transparent huge pages in one piece, without splitting.
          XXX: For now, swap cluster backing transparent huge page
@@@ -1031,6 -1005,14 +1031,14 @@@ config ARCH_USES_HIGH_VMA_FLAG
  config ARCH_HAS_PKEYS
        bool
  
+ config ARCH_USES_PG_ARCH_X
+       bool
+       help
+         Enable the definition of PG_arch_x page flags with x > 1. Only
+         suitable for 64-bit architectures with CONFIG_FLATMEM or
+         CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be
+         enough room for additional bits in page->flags.
  config VM_EVENT_COUNTERS
        default y
        bool "Enable VM event counters for /proc/vmstat" if EXPERT
@@@ -1100,13 -1082,7 +1108,13 @@@ config IO_MAPPIN
        bool
  
  config SECRETMEM
 -      def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
 +      default y
 +      bool "Enable memfd_secret() system call" if EXPERT
 +      depends on ARCH_HAS_SET_DIRECT_MAP
 +      help
 +        Enable the memfd_secret() system call with the ability to create
 +        memory areas visible only in the context of the owning process and
 +        not mapped to other processes and other kernel page tables.
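
For reference, the system call enabled here is used roughly as follows from user space (illustrative sketch; it assumes the installed headers define SYS_memfd_secret):

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int fd = syscall(SYS_memfd_secret, 0);	/* no libc wrapper assumed */
	if (fd < 0) {
		perror("memfd_secret");
		return 1;
	}
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		return 1;
	}
	/* Visible only to this process; removed from the kernel direct map. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;
	return 0;
}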
  
  config ANON_VMA_NAME
        bool "Anonymous VMA name support"
@@@ -1139,10 -1115,17 +1147,10 @@@ config HAVE_ARCH_USERFAULTFD_MINO
        help
          Arch has userfaultfd minor fault support
  
 -config PTE_MARKER
 -      bool
 -
 -      help
 -        Allows to create marker PTEs for file-backed memory.
 -
  config PTE_MARKER_UFFD_WP
        bool "Userfaultfd write protection support for shmem/hugetlbfs"
        default y
        depends on HAVE_ARCH_USERFAULTFD_WP
 -      select PTE_MARKER
  
        help
          Allows to create marker PTEs for userfaultfd write protection
diff --combined mm/gup.c
index f212d571b563d88b1d7172fee5d69d5e4ea1e33c,90e372352e824dbabdd66bfb5970bc483ec319e8..2b45d7817a90c48fee0be12d249098b190425e19
+++ b/mm/gup.c
@@@ -123,9 -123,6 +123,9 @@@ retry
   */
  struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
  {
 +      if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
 +              return NULL;
 +
        if (flags & FOLL_GET)
                return try_get_folio(page, refs);
        else if (flags & FOLL_PIN) {
@@@ -205,22 -202,17 +205,22 @@@ static void gup_put_folio(struct folio 
   * time. Cases: please see the try_grab_folio() documentation, with
   * "refs=1".
   *
 - * Return: true for success, or if no action was required (if neither FOLL_PIN
 - * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 - * FOLL_PIN was set, but the page could not be grabbed.
 + * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 + * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 + *
 + *   -ENOMEM          FOLL_GET or FOLL_PIN was set, but the page could not
 + *                    be grabbed.
   */
 -bool __must_check try_grab_page(struct page *page, unsigned int flags)
 +int __must_check try_grab_page(struct page *page, unsigned int flags)
  {
        struct folio *folio = page_folio(page);
  
        WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
        if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
 -              return false;
 +              return -ENOMEM;
 +
 +      if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
 +              return -EREMOTEIO;
  
        if (flags & FOLL_GET)
                folio_ref_inc(folio);
                node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
        }
  
 -      return true;
 +      return 0;
  }
  
  /**
@@@ -545,13 -537,42 +545,13 @@@ static struct page *follow_page_pte(str
        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
                         (FOLL_PIN | FOLL_GET)))
                return ERR_PTR(-EINVAL);
 -
 -      /*
 -       * Considering PTE level hugetlb, like continuous-PTE hugetlb on
 -       * ARM64 architecture.
 -       */
 -      if (is_vm_hugetlb_page(vma)) {
 -              page = follow_huge_pmd_pte(vma, address, flags);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
 -
 -retry:
        if (unlikely(pmd_bad(*pmd)))
                return no_page_table(vma, flags);
  
        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        pte = *ptep;
 -      if (!pte_present(pte)) {
 -              swp_entry_t entry;
 -              /*
 -               * KSM's break_ksm() relies upon recognizing a ksm page
 -               * even while it is being migrated, so for that case we
 -               * need migration_entry_wait().
 -               */
 -              if (likely(!(flags & FOLL_MIGRATION)))
 -                      goto no_page;
 -              if (pte_none(pte))
 -                      goto no_page;
 -              entry = pte_to_swp_entry(pte);
 -              if (!is_migration_entry(entry))
 -                      goto no_page;
 -              pte_unmap_unlock(ptep, ptl);
 -              migration_entry_wait(mm, pmd, address);
 -              goto retry;
 -      }
 +      if (!pte_present(pte))
 +              goto no_page;
        if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
                goto no_page;
  
                }
        }
  
 -      if (!pte_write(pte) && gup_must_unshare(flags, page)) {
 +      if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
                page = ERR_PTR(-EMLINK);
                goto out;
        }
                       !PageAnonExclusive(page), page);
  
        /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
 -      if (unlikely(!try_grab_page(page, flags))) {
 -              page = ERR_PTR(-ENOMEM);
 +      ret = try_grab_page(page, flags);
 +      if (unlikely(ret)) {
 +              page = ERR_PTR(ret);
                goto out;
        }
 +
        /*
         * We need to make the page accessible if and only if we are going
         * to access its content (the FOLL_PIN case).  Please see
@@@ -661,8 -680,42 +661,8 @@@ static struct page *follow_pmd_mask(str
        pmdval = READ_ONCE(*pmd);
        if (pmd_none(pmdval))
                return no_page_table(vma, flags);
 -      if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
 -              page = follow_huge_pmd_pte(vma, address, flags);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
 -      if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
 -              page = follow_huge_pd(vma, address,
 -                                    __hugepd(pmd_val(pmdval)), flags,
 -                                    PMD_SHIFT);
 -              if (page)
 -                      return page;
 +      if (!pmd_present(pmdval))
                return no_page_table(vma, flags);
 -      }
 -retry:
 -      if (!pmd_present(pmdval)) {
 -              /*
 -               * Should never reach here, if thp migration is not supported;
 -               * Otherwise, it must be a thp migration entry.
 -               */
 -              VM_BUG_ON(!thp_migration_supported() ||
 -                                !is_pmd_migration_entry(pmdval));
 -
 -              if (likely(!(flags & FOLL_MIGRATION)))
 -                      return no_page_table(vma, flags);
 -
 -              pmd_migration_entry_wait(mm, pmd);
 -              pmdval = READ_ONCE(*pmd);
 -              /*
 -               * MADV_DONTNEED may convert the pmd to null because
 -               * mmap_lock is held in read mode
 -               */
 -              if (pmd_none(pmdval))
 -                      return no_page_table(vma, flags);
 -              goto retry;
 -      }
        if (pmd_devmap(pmdval)) {
                ptl = pmd_lock(mm, pmd);
                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
        if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
                return no_page_table(vma, flags);
  
 -retry_locked:
        ptl = pmd_lock(mm, pmd);
 -      if (unlikely(pmd_none(*pmd))) {
 -              spin_unlock(ptl);
 -              return no_page_table(vma, flags);
 -      }
        if (unlikely(!pmd_present(*pmd))) {
                spin_unlock(ptl);
 -              if (likely(!(flags & FOLL_MIGRATION)))
 -                      return no_page_table(vma, flags);
 -              pmd_migration_entry_wait(mm, pmd);
 -              goto retry_locked;
 +              return no_page_table(vma, flags);
        }
        if (unlikely(!pmd_trans_huge(*pmd))) {
                spin_unlock(ptl);
@@@ -722,6 -783,20 +722,6 @@@ static struct page *follow_pud_mask(str
        pud = pud_offset(p4dp, address);
        if (pud_none(*pud))
                return no_page_table(vma, flags);
 -      if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
 -              page = follow_huge_pud(mm, address, pud, flags);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
 -      if (is_hugepd(__hugepd(pud_val(*pud)))) {
 -              page = follow_huge_pd(vma, address,
 -                                    __hugepd(pud_val(*pud)), flags,
 -                                    PUD_SHIFT);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
        if (pud_devmap(*pud)) {
                ptl = pud_lock(mm, pud);
                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@@ -741,6 -816,7 +741,6 @@@ static struct page *follow_p4d_mask(str
                                    struct follow_page_context *ctx)
  {
        p4d_t *p4d;
 -      struct page *page;
  
        p4d = p4d_offset(pgdp, address);
        if (p4d_none(*p4d))
        if (unlikely(p4d_bad(*p4d)))
                return no_page_table(vma, flags);
  
 -      if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 -              page = follow_huge_pd(vma, address,
 -                                    __hugepd(p4d_val(*p4d)), flags,
 -                                    P4D_SHIFT);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
        return follow_pud_mask(vma, address, p4d, flags, ctx);
  }
  
@@@ -786,18 -870,10 +786,18 @@@ static struct page *follow_page_mask(st
  
        ctx->page_mask = 0;
  
 -      /* make this handle hugepd */
 -      page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 -      if (!IS_ERR(page)) {
 -              WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
 +      /*
 +       * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
 +       * special hugetlb page table walking code.  This eliminates the
 +       * need to check for hugetlb entries in the general walking code.
 +       *
 +       * hugetlb_follow_page_mask is only for follow_page() handling here.
 +       * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
 +       */
 +      if (is_vm_hugetlb_page(vma)) {
 +              page = hugetlb_follow_page_mask(vma, address, flags);
 +              if (!page)
 +                      page = no_page_table(vma, flags);
                return page;
        }
  
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                return no_page_table(vma, flags);
  
 -      if (pgd_huge(*pgd)) {
 -              page = follow_huge_pgd(mm, address, pgd, flags);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
 -      if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 -              page = follow_huge_pd(vma, address,
 -                                    __hugepd(pgd_val(*pgd)), flags,
 -                                    PGDIR_SHIFT);
 -              if (page)
 -                      return page;
 -              return no_page_table(vma, flags);
 -      }
 -
        return follow_p4d_mask(vma, address, pgd, flags, ctx);
  }
  
@@@ -869,9 -960,10 +869,9 @@@ static int get_gate_page(struct mm_stru
                        goto unmap;
                *page = pte_page(*pte);
        }
 -      if (unlikely(!try_grab_page(*page, gup_flags))) {
 -              ret = -ENOMEM;
 +      ret = try_grab_page(*page, gup_flags);
 +      if (unlikely(ret))
                goto unmap;
 -      }
  out:
        ret = 0;
  unmap:
@@@ -897,8 -989,17 +897,17 @@@ static int faultin_page(struct vm_area_
                fault_flags |= FAULT_FLAG_WRITE;
        if (*flags & FOLL_REMOTE)
                fault_flags |= FAULT_FLAG_REMOTE;
-       if (locked)
+       if (locked) {
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+               /*
+                * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
+                * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
+                * That's because some callers may not be prepared to
+                * handle early exits caused by non-fatal signals.
+                */
+               if (*flags & FOLL_INTERRUPTIBLE)
+                       fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+       }
        if (*flags & FOLL_NOWAIT)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
        if (*flags & FOLL_TRIED) {
@@@ -966,9 -1067,6 +975,9 @@@ static int check_vma_flags(struct vm_ar
        if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
                return -EOPNOTSUPP;
  
 +      if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))
 +              return -EOPNOTSUPP;
 +
        if (vma_is_secretmem(vma))
                return -EFAULT;
  
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
                                return -EFAULT;
 +                      /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
 +                      if (is_vm_hugetlb_page(vma))
 +                              return -EFAULT;
                        /*
                         * We used to let the write,force case do COW in a
                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
@@@ -1305,6 -1400,22 +1314,22 @@@ retry
  }
  EXPORT_SYMBOL_GPL(fixup_user_fault);
  
+ /*
+  * GUP always responds to fatal signals.  When FOLL_INTERRUPTIBLE is
+  * specified, it'll also respond to generic signals.  The caller of GUP
+  * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
+  */
+ static bool gup_signal_pending(unsigned int flags)
+ {
+       if (fatal_signal_pending(current))
+               return true;
+       if (!(flags & FOLL_INTERRUPTIBLE))
+               return false;
+       return signal_pending(current);
+ }
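
A hedged sketch of a caller that opts in: with FOLL_INTERRUPTIBLE set, a pending non-fatal signal makes GUP back out early (typically with -EINTR or a short count), so the caller must be prepared to retry once the signal has been handled. Names below are illustrative:

static long pin_one_page_interruptible(unsigned long addr, struct page **page)
{
	long ret;

	ret = get_user_pages_unlocked(addr, 1, page,
				      FOLL_WRITE | FOLL_INTERRUPTIBLE);
	if (ret == -EINTR)
		pr_debug("GUP backed out on a pending signal, retry later\n");
	return ret;
}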
  /*
   * Please note that this function, unlike __get_user_pages will not
   * return 0 for nr_pages > 0 without FOLL_NOWAIT
@@@ -1386,11 -1497,11 +1411,11 @@@ retry
                 * Repeat on the address that fired VM_FAULT_RETRY
                 * with both FAULT_FLAG_ALLOW_RETRY and
                 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
-                * by fatal signals, so we need to check it before we
+                * by fatal signals or even common signals, depending on
+                * the caller's request. So we need to check it before we
                 * start trying again otherwise it can loop forever.
                 */
-               if (fatal_signal_pending(current)) {
+               if (gup_signal_pending(flags)) {
                        if (!pages_done)
                                pages_done = -EINTR;
                        break;
@@@ -2019,19 -2130,14 +2044,19 @@@ static long __gup_longterm_locked(struc
                                  unsigned long nr_pages,
                                  struct page **pages,
                                  struct vm_area_struct **vmas,
 +                                int *locked,
                                  unsigned int gup_flags)
  {
 +      bool must_unlock = false;
        unsigned int flags;
        long rc, nr_pinned_pages;
  
 +      if (locked && WARN_ON_ONCE(!*locked))
 +              return -EINVAL;
 +
        if (!(gup_flags & FOLL_LONGTERM))
                return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
 -                                             NULL, gup_flags);
 +                                             locked, gup_flags);
  
        /*
         * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
                return -EINVAL;
        flags = memalloc_pin_save();
        do {
 +              if (locked && !*locked) {
 +                      mmap_read_lock(mm);
 +                      must_unlock = true;
 +                      *locked = 1;
 +              }
                nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
 -                                                        pages, vmas, NULL,
 +                                                        pages, vmas, locked,
                                                          gup_flags);
                if (nr_pinned_pages <= 0) {
                        rc = nr_pinned_pages;
        } while (rc == -EAGAIN);
        memalloc_pin_restore(flags);
  
 +      if (locked && *locked && must_unlock) {
 +              mmap_read_unlock(mm);
 +              *locked = 0;
 +      }
        return rc ? rc : nr_pinned_pages;
  }
  
@@@ -2088,6 -2185,35 +2113,6 @@@ static bool is_valid_gup_flags(unsigne
  }
  
  #ifdef CONFIG_MMU
 -static long __get_user_pages_remote(struct mm_struct *mm,
 -                                  unsigned long start, unsigned long nr_pages,
 -                                  unsigned int gup_flags, struct page **pages,
 -                                  struct vm_area_struct **vmas, int *locked)
 -{
 -      /*
 -       * Parts of FOLL_LONGTERM behavior are incompatible with
 -       * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
 -       * vmas. However, this only comes up if locked is set, and there are
 -       * callers that do request FOLL_LONGTERM, but do not set locked. So,
 -       * allow what we can.
 -       */
 -      if (gup_flags & FOLL_LONGTERM) {
 -              if (WARN_ON_ONCE(locked))
 -                      return -EINVAL;
 -              /*
 -               * This will check the vmas (even if our vmas arg is NULL)
 -               * and return -ENOTSUPP if DAX isn't allowed in this case:
 -               */
 -              return __gup_longterm_locked(mm, start, nr_pages, pages,
 -                                           vmas, gup_flags | FOLL_TOUCH |
 -                                           FOLL_REMOTE);
 -      }
 -
 -      return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
 -                                     locked,
 -                                     gup_flags | FOLL_TOUCH | FOLL_REMOTE);
 -}
 -
  /**
   * get_user_pages_remote() - pin user pages in memory
   * @mm:               mm_struct of target mm
@@@ -2156,8 -2282,8 +2181,8 @@@ long get_user_pages_remote(struct mm_st
        if (!is_valid_gup_flags(gup_flags))
                return -EINVAL;
  
 -      return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
 -                                     pages, vmas, locked);
 +      return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
 +                                   gup_flags | FOLL_TOUCH | FOLL_REMOTE);
  }
  EXPORT_SYMBOL(get_user_pages_remote);
  
@@@ -2169,6 -2295,14 +2194,6 @@@ long get_user_pages_remote(struct mm_st
  {
        return 0;
  }
 -
 -static long __get_user_pages_remote(struct mm_struct *mm,
 -                                  unsigned long start, unsigned long nr_pages,
 -                                  unsigned int gup_flags, struct page **pages,
 -                                  struct vm_area_struct **vmas, int *locked)
 -{
 -      return 0;
 -}
  #endif /* !CONFIG_MMU */
  
  /**
@@@ -2195,7 -2329,7 +2220,7 @@@ long get_user_pages(unsigned long start
                return -EINVAL;
  
        return __gup_longterm_locked(current->mm, start, nr_pages,
 -                                   pages, vmas, gup_flags | FOLL_TOUCH);
 +                                   pages, vmas, NULL, gup_flags | FOLL_TOUCH);
  }
  EXPORT_SYMBOL(get_user_pages);
  
@@@ -2221,9 -2355,18 +2246,9 @@@ long get_user_pages_unlocked(unsigned l
        int locked = 1;
        long ret;
  
 -      /*
 -       * FIXME: Current FOLL_LONGTERM behavior is incompatible with
 -       * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
 -       * vmas.  As there are no users of this flag in this call we simply
 -       * disallow this option for now.
 -       */
 -      if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
 -              return -EINVAL;
 -
        mmap_read_lock(mm);
 -      ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
 -                                    &locked, gup_flags | FOLL_TOUCH);
 +      ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked,
 +                                  gup_flags | FOLL_TOUCH);
        if (locked)
                mmap_read_unlock(mm);
        return ret;
@@@ -2350,7 -2493,7 +2375,7 @@@ static int gup_pte_range(pmd_t pmd, pmd
                        goto pte_unmap;
                }
  
 -              if (!pte_write(pte) && gup_must_unshare(flags, page)) {
 +              if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }
@@@ -2416,15 -2559,9 +2441,15 @@@ static int __gup_device_huge(unsigned l
                        undo_dev_pagemap(nr, nr_start, flags, pages);
                        break;
                }
 +
 +              if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
 +                      undo_dev_pagemap(nr, nr_start, flags, pages);
 +                      break;
 +              }
 +
                SetPageReferenced(page);
                pages[*nr] = page;
 -              if (unlikely(!try_grab_page(page, flags))) {
 +              if (unlikely(try_grab_page(page, flags))) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
                        break;
                }
@@@ -2542,7 -2679,7 +2567,7 @@@ static int gup_hugepte(pte_t *ptep, uns
                return 0;
        }
  
 -      if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
 +      if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
                gup_put_folio(folio, refs, flags);
                return 0;
        }
@@@ -2608,7 -2745,7 +2633,7 @@@ static int gup_huge_pmd(pmd_t orig, pmd
                return 0;
        }
  
 -      if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
 +      if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
                gup_put_folio(folio, refs, flags);
                return 0;
        }
@@@ -2648,7 -2785,7 +2673,7 @@@ static int gup_huge_pud(pud_t orig, pud
                return 0;
        }
  
 -      if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
 +      if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
                gup_put_folio(folio, refs, flags);
                return 0;
        }
@@@ -2740,7 -2877,7 +2765,7 @@@ static int gup_pud_range(p4d_t *p4dp, p
                next = pud_addr_end(addr, end);
                if (unlikely(!pud_present(pud)))
                        return 0;
 -              if (unlikely(pud_huge(pud))) {
 +              if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
                        if (!gup_huge_pud(pud, pudp, addr, next, flags,
                                          pages, nr))
                                return 0;
@@@ -2823,6 -2960,29 +2848,6 @@@ static bool gup_fast_permitted(unsigne
  }
  #endif
  
 -static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
 -                                 unsigned int gup_flags, struct page **pages)
 -{
 -      int ret;
 -
 -      /*
 -       * FIXME: FOLL_LONGTERM does not work with
 -       * get_user_pages_unlocked() (see comments in that function)
 -       */
 -      if (gup_flags & FOLL_LONGTERM) {
 -              mmap_read_lock(current->mm);
 -              ret = __gup_longterm_locked(current->mm,
 -                                          start, nr_pages,
 -                                          pages, NULL, gup_flags);
 -              mmap_read_unlock(current->mm);
 -      } else {
 -              ret = get_user_pages_unlocked(start, nr_pages,
 -                                            pages, gup_flags);
 -      }
 -
 -      return ret;
 -}
 -
  static unsigned long lockless_pages_from_mm(unsigned long start,
                                            unsigned long end,
                                            unsigned int gup_flags,
@@@ -2883,8 -3043,7 +2908,8 @@@ static int internal_get_user_pages_fast
  
        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
                                       FOLL_FORCE | FOLL_PIN | FOLL_GET |
 -                                     FOLL_FAST_ONLY | FOLL_NOFAULT)))
 +                                     FOLL_FAST_ONLY | FOLL_NOFAULT |
 +                                     FOLL_PCI_P2PDMA)))
                return -EINVAL;
  
        if (gup_flags & FOLL_PIN)
        /* Slow path: try to get the remaining pages with get_user_pages */
        start += nr_pinned << PAGE_SHIFT;
        pages += nr_pinned;
 -      ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
 -                                    pages);
 +      ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages,
 +                                    gup_flags);
        if (ret < 0) {
                /*
                 * The caller has to unpin the pages we already pinned so
@@@ -3107,9 -3266,9 +3132,9 @@@ long pin_user_pages_remote(struct mm_st
        if (WARN_ON_ONCE(!pages))
                return -EINVAL;
  
 -      gup_flags |= FOLL_PIN;
 -      return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
 -                                     pages, vmas, locked);
 +      return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
 +                                   gup_flags | FOLL_PIN | FOLL_TOUCH |
 +                                           FOLL_REMOTE);
  }
  EXPORT_SYMBOL(pin_user_pages_remote);
  
@@@ -3143,7 -3302,7 +3168,7 @@@ long pin_user_pages(unsigned long start
  
        gup_flags |= FOLL_PIN;
        return __gup_longterm_locked(current->mm, start, nr_pages,
 -                                   pages, vmas, gup_flags);
 +                                   pages, vmas, NULL, gup_flags);
  }
  EXPORT_SYMBOL(pin_user_pages);
  
diff --combined mm/huge_memory.c
index 2546199ab3c0856362d926e2afb56ad70e8876b0,dfe72ea23c5f39f8e8bc7c4504e8d943531eede0..abe6cfd92ffa0ecbb50a096742f8ca2073d45736
@@@ -1035,7 -1035,6 +1035,7 @@@ struct page *follow_devmap_pmd(struct v
        unsigned long pfn = pmd_pfn(*pmd);
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
 +      int ret;
  
        assert_spin_locked(pmd_lockptr(mm, pmd));
  
        if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
 -      if (!try_grab_page(page, flags))
 -              page = ERR_PTR(-ENOMEM);
 +      ret = try_grab_page(page, flags);
 +      if (ret)
 +              page = ERR_PTR(ret);
  
        return page;
  }
@@@ -1195,7 -1193,6 +1195,7 @@@ struct page *follow_devmap_pud(struct v
        unsigned long pfn = pud_pfn(*pud);
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
 +      int ret;
  
        assert_spin_locked(pud_lockptr(mm, pud));
  
        if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
 -      if (!try_grab_page(page, flags))
 -              page = ERR_PTR(-ENOMEM);
 +
 +      ret = try_grab_page(page, flags);
 +      if (ret)
 +              page = ERR_PTR(ret);
  
        return page;
  }
@@@ -1318,6 -1313,9 +1318,6 @@@ vm_fault_t do_huge_pmd_wp_page(struct v
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
  
 -      VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
 -      VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
 -
        if (is_huge_zero_pmd(orig_pmd))
                goto fallback;
  
@@@ -1381,7 -1379,7 +1381,7 @@@ reuse
                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                spin_unlock(vmf->ptl);
 -              return VM_FAULT_WRITE;
 +              return 0;
        }
  
  unlock_fallback:
@@@ -1392,36 -1390,6 +1392,36 @@@ fallback
        return VM_FAULT_FALLBACK;
  }
  
 +static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
 +                                         unsigned long addr, pmd_t pmd)
 +{
 +      struct page *page;
 +
 +      if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
 +              return false;
 +
 +      /* Don't touch entries that are not even readable (NUMA hinting). */
 +      if (pmd_protnone(pmd))
 +              return false;
 +
 +      /* Do we need write faults for softdirty tracking? */
 +      if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
 +              return false;
 +
 +      /* Do we need write faults for uffd-wp tracking? */
 +      if (userfaultfd_huge_pmd_wp(vma, pmd))
 +              return false;
 +
 +      if (!(vma->vm_flags & VM_SHARED)) {
 +              /* See can_change_pte_writable(). */
 +              page = vm_normal_page_pmd(vma, addr, pmd);
 +              return page && PageAnon(page) && PageAnonExclusive(page);
 +      }
 +
 +      /* See can_change_pte_writable(). */
 +      return pmd_dirty(pmd);
 +}
 +
  /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
  static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
                                        struct vm_area_struct *vma,
@@@ -1467,7 -1435,6 +1467,7 @@@ struct page *follow_trans_huge_pmd(stru
  {
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
 +      int ret;
  
        assert_spin_locked(pmd_lockptr(mm, pmd));
  
        if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
                return NULL;
  
 -      if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
 +      if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
                return ERR_PTR(-EMLINK);
  
        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
                        !PageAnonExclusive(page), page);
  
 -      if (!try_grab_page(page, flags))
 -              return ERR_PTR(-ENOMEM);
 +      ret = try_grab_page(page, flags);
 +      if (ret)
 +              return ERR_PTR(ret);
  
        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
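A note on the try_grab_page() change visible in these hunks: the grab helper now returns 0 or a negative errno instead of a boolean, so the follow_*() functions hand back ERR_PTR(ret) with the precise error rather than a blanket -ENOMEM. A hedged caller-side sketch (the helper is invented and not part of the patch); the caller must hold mmap_lock:

#include <linux/err.h>
#include <linux/mm.h>

/* Illustrative only: look a page up, report errors, drop the reference. */
static int lookup_and_release(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page = follow_page(vma, addr, FOLL_GET);

	if (!page)
		return -ENOENT;		/* nothing mapped at addr */
	if (IS_ERR(page))
		return PTR_ERR(page);	/* e.g. -ENOMEM or -EMLINK propagated from try_grab_page() */

	put_page(page);			/* drop the FOLL_GET reference */
	return 0;
}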
@@@ -1515,7 -1481,8 +1515,7 @@@ vm_fault_t do_huge_pmd_numa_page(struc
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int page_nid = NUMA_NO_NODE;
        int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
 -      bool migrated = false;
 -      bool was_writable = pmd_savedwrite(oldpmd);
 +      bool migrated = false, writable = false;
        int flags = 0;
  
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        }
  
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
 +
 +      /*
 +       * Detect now whether the PMD could be writable; this information
 +       * is only valid while holding the PT lock.
 +       */
 +      writable = pmd_write(pmd);
 +      if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
 +          can_change_pmd_writable(vma, vmf->address, pmd))
 +              writable = true;
 +
        page = vm_normal_page_pmd(vma, haddr, pmd);
        if (!page)
                goto out_map;
  
        /* See similar comment in do_numa_page for explanation */
 -      if (!was_writable)
 +      if (!writable)
                flags |= TNF_NO_GROUP;
  
        page_nid = page_to_nid(page);
        }
  
        spin_unlock(vmf->ptl);
 +      writable = false;
  
        migrated = migrate_misplaced_page(page, vma, target_nid);
        if (migrated) {
@@@ -1586,7 -1542,7 +1586,7 @@@ out_map
        /* Restore the PMD */
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
        pmd = pmd_mkyoung(pmd);
 -      if (was_writable)
 +      if (writable)
                pmd = pmd_mkwrite(pmd);
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
@@@ -1827,10 -1783,11 +1827,10 @@@ int change_huge_pmd(struct mmu_gather *
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
 -      bool preserve_write;
 -      int ret;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 +      int ret = 1;
  
        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  
        if (!ptl)
                return 0;
  
 -      preserve_write = prot_numa && pmd_write(*pmd);
 -      ret = 1;
 -
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (is_swap_pmd(*pmd)) {
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
  
        entry = pmd_modify(oldpmd, newprot);
 -      if (preserve_write)
 -              entry = pmd_mk_savedwrite(entry);
        if (uffd_wp) {
                entry = pmd_wrprotect(entry);
                entry = pmd_mkuffd_wp(entry);
                 */
                entry = pmd_clear_uffd_wp(entry);
        }
 +
 +      /* See change_pte_range(). */
 +      if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
 +          can_change_pmd_writable(vma, addr, entry))
 +              entry = pmd_mkwrite(entry);
 +
        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);
  
        if (huge_pmd_needs_flush(oldpmd, entry))
                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
 -
 -      BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
  unlock:
        spin_unlock(ptl);
        return ret;
@@@ -2183,6 -2141,7 +2183,6 @@@ static void __split_huge_pmd_locked(str
                uffd_wp = pmd_uffd_wp(old_pmd);
  
                VM_BUG_ON_PAGE(!page_count(page), page);
 -              page_ref_add(page, HPAGE_PMD_NR - 1);
  
                /*
                 * Without "freeze", we'll simply split the PMD, propagating the
                anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
                if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
                        freeze = false;
 +              if (!freeze)
 +                      page_ref_add(page, HPAGE_PMD_NR - 1);
        }
  
        /*
                        entry = maybe_mkwrite(entry, vma);
                        if (anon_exclusive)
                                SetPageAnonExclusive(page + i);
 -                      if (!write)
 -                              entry = pte_wrprotect(entry);
                        if (!young)
                                entry = pte_mkold(entry);
                        /* NOTE: this may set soft-dirty too on some archs */
                        if (dirty)
                                entry = pte_mkdirty(entry);
 +                      /*
 +                       * NOTE: this needs to happen after pte_mkdirty,
 +                       * because some archs (sparc64, loongarch) could
 +                       * set hw write bit when mkdirty.
 +                       */
 +                      if (!write)
 +                              entry = pte_wrprotect(entry);
                        if (soft_dirty)
                                entry = pte_mksoft_dirty(entry);
                        if (uffd_wp)
                                entry = pte_mkuffd_wp(entry);
 +                      page_add_anon_rmap(page + i, vma, addr, false);
                }
                pte = pte_offset_map(&_pmd, addr);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, addr, pte, entry);
 -              if (!pmd_migration)
 -                      atomic_inc(&page[i]._mapcount);
                pte_unmap(pte);
        }
  
 -      if (!pmd_migration) {
 -              /*
 -               * Set PG_double_map before dropping compound_mapcount to avoid
 -               * false-negative page_mapped().
 -               */
 -              if (compound_mapcount(page) > 1 &&
 -                  !TestSetPageDoubleMap(page)) {
 -                      for (i = 0; i < HPAGE_PMD_NR; i++)
 -                              atomic_inc(&page[i]._mapcount);
 -              }
 -
 -              lock_page_memcg(page);
 -              if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
 -                      /* Last compound_mapcount is gone. */
 -                      __mod_lruvec_page_state(page, NR_ANON_THPS,
 -                                              -HPAGE_PMD_NR);
 -                      if (TestClearPageDoubleMap(page)) {
 -                              /* No need in mapcount reference anymore */
 -                              for (i = 0; i < HPAGE_PMD_NR; i++)
 -                                      atomic_dec(&page[i]._mapcount);
 -                      }
 -              }
 -              unlock_page_memcg(page);
 -
 -              /* Above is effectively page_remove_rmap(page, vma, true) */
 -              munlock_vma_page(page, vma, true);
 -      }
 +      if (!pmd_migration)
 +              page_remove_rmap(page, vma, true);
 +      if (freeze)
 +              put_page(page);
  
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
 -
 -      if (freeze) {
 -              for (i = 0; i < HPAGE_PMD_NR; i++) {
 -                      page_remove_rmap(page + i, vma, false);
 -                      put_page(page + i);
 -              }
 -      }
  }
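The reordering in the split path above is easy to miss, so here is a hedged restatement as a standalone helper (name invented): the point of the new comment is that pte_mkdirty() may also set the hardware write bit on some architectures (sparc64, loongarch), so pte_wrprotect() has to come after it.

#include <linux/mm.h>

/* Illustrative only: build a split-out PTE in the order the hunk requires. */
static pte_t make_split_pte(struct page *subpage, struct vm_area_struct *vma,
			    bool write, bool young, bool dirty)
{
	pte_t entry = mk_pte(subpage, vma->vm_page_prot);

	entry = maybe_mkwrite(entry, vma);
	if (!young)
		entry = pte_mkold(entry);
	if (dirty)
		entry = pte_mkdirty(entry);	/* may also set the hw write bit */
	if (!write)
		entry = pte_wrprotect(entry);	/* so wrprotect must come after */

	return entry;
}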
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@@ -2461,13 -2444,14 +2461,14 @@@ static void __split_huge_page_tail(stru
                         (1L << PG_workingset) |
                         (1L << PG_locked) |
                         (1L << PG_unevictable) |
- #ifdef CONFIG_64BIT
+ #ifdef CONFIG_ARCH_USES_PG_ARCH_X
                         (1L << PG_arch_2) |
+                        (1L << PG_arch_3) |
  #endif
                         (1L << PG_dirty) |
                         LRU_GEN_MASK | LRU_REFS_MASK));
  
 -      /* ->mapping in first tail page is compound_mapcount */
 +      /* ->mapping in first and second tail page is replaced by other uses */
        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                        page_tail);
        page_tail->mapping = head->mapping;
         * page->private should not be set in tail pages with the exception
         * of swap cache pages that store the swp_entry_t in tail pages.
         * Fix up and warn once if private is unexpectedly set.
 +       *
 +       * What of 32-bit systems, on which head[1].compound_pincount overlays
 +       * head[1].private?  No problem: THP_SWAP is not enabled on 32-bit, and
 +       * compound_pincount must be 0 for folio_ref_freeze() to have succeeded.
         */
        if (!folio_test_swapcache(page_folio(head))) {
                VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
@@@ -2733,7 -2713,7 +2734,7 @@@ int split_huge_page_to_list(struct pag
         * split PMDs
         */
        if (!can_split_folio(folio, &extra_pins)) {
 -              ret = -EBUSY;
 +              ret = -EAGAIN;
                goto out_unlock;
        }
  
@@@ -2783,7 -2763,7 +2784,7 @@@ fail
                        xas_unlock(&xas);
                local_irq_enable();
                remap_page(folio, folio_nr_pages(folio));
 -              ret = -EBUSY;
 +              ret = -EAGAIN;
        }
  
  out_unlock:
@@@ -3087,28 -3067,28 +3088,28 @@@ static int split_huge_pages_in_file(con
        mapping = candidate->f_mapping;
  
        for (index = off_start; index < off_end; index += nr_pages) {
 -              struct page *fpage = pagecache_get_page(mapping, index,
 -                                              FGP_ENTRY | FGP_HEAD, 0);
 +              struct folio *folio = __filemap_get_folio(mapping, index,
 +                                              FGP_ENTRY, 0);
  
                nr_pages = 1;
 -              if (xa_is_value(fpage) || !fpage)
 +              if (xa_is_value(folio) || !folio)
                        continue;
  
 -              if (!is_transparent_hugepage(fpage))
 +              if (!folio_test_large(folio))
                        goto next;
  
                total++;
 -              nr_pages = thp_nr_pages(fpage);
 +              nr_pages = folio_nr_pages(folio);
  
 -              if (!trylock_page(fpage))
 +              if (!folio_trylock(folio))
                        goto next;
  
 -              if (!split_huge_page(fpage))
 +              if (!split_folio(folio))
                        split++;
  
 -              unlock_page(fpage);
 +              folio_unlock(folio);
  next:
 -              put_page(fpage);
 +              folio_put(folio);
                cond_resched();
        }
  
diff --combined mm/hugetlb.c
index f5f445c39dbcb6852bf346623c6bb97d123f42c2,b5ed54f760bb21ebeae9dd714b8baf7f8efdda7e..77f36e3681e390dc7f1382cd4dc9bca84b09a611
@@@ -54,13 -54,13 +54,13 @@@ struct hstate hstates[HUGE_MAX_HSTATE]
  #ifdef CONFIG_CMA
  static struct cma *hugetlb_cma[MAX_NUMNODES];
  static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
 -static bool hugetlb_cma_page(struct page *page, unsigned int order)
 +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
  {
 -      return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
 +      return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
                                1 << order);
  }
  #else
 -static bool hugetlb_cma_page(struct page *page, unsigned int order)
 +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
  {
        return false;
  }
@@@ -1127,17 -1127,17 +1127,17 @@@ static bool vma_has_reserves(struct vm_
        return false;
  }
  
 -static void enqueue_huge_page(struct hstate *h, struct page *page)
 +static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
  {
 -      int nid = page_to_nid(page);
 +      int nid = folio_nid(folio);
  
        lockdep_assert_held(&hugetlb_lock);
 -      VM_BUG_ON_PAGE(page_count(page), page);
 +      VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
  
 -      list_move(&page->lru, &h->hugepage_freelists[nid]);
 +      list_move(&folio->lru, &h->hugepage_freelists[nid]);
        h->free_huge_pages++;
        h->free_huge_pages_node[nid]++;
 -      SetHPageFreed(page);
 +      folio_set_hugetlb_freed(folio);
  }
  
  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
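Much of the mm/hugetlb.c diff below is a mechanical page-to-folio conversion. As a hedged cheat sheet (the helper is invented for illustration), these are the equivalences the conversion relies on when "head" is the head page of a hugetlb page:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Illustrative only: page vs. folio accessors used by this conversion. */
static void folio_conversion_cheatsheet(struct page *head)
{
	struct folio *folio = page_folio(head);

	WARN_ON(&folio->page != compound_head(head));
	WARN_ON(folio_nid(folio) != page_to_nid(head));
	WARN_ON(folio_ref_count(folio) != page_count(head));
	WARN_ON(folio_nr_pages(folio) != compound_nr(head));
}

One motivation for the conversion is that the folio accessors avoid the repeated compound_head() lookups hidden inside the page-based helpers.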
@@@ -1325,76 -1325,76 +1325,76 @@@ static int hstate_next_node_to_free(str
                nr_nodes--)
  
  /* used to demote non-gigantic_huge pages as well */
 -static void __destroy_compound_gigantic_page(struct page *page,
 +static void __destroy_compound_gigantic_folio(struct folio *folio,
                                        unsigned int order, bool demote)
  {
        int i;
        int nr_pages = 1 << order;
        struct page *p;
  
 -      atomic_set(compound_mapcount_ptr(page), 0);
 -      atomic_set(compound_pincount_ptr(page), 0);
 +      atomic_set(folio_mapcount_ptr(folio), 0);
 +      atomic_set(folio_subpages_mapcount_ptr(folio), 0);
 +      atomic_set(folio_pincount_ptr(folio), 0);
  
        for (i = 1; i < nr_pages; i++) {
 -              p = nth_page(page, i);
 +              p = folio_page(folio, i);
                p->mapping = NULL;
                clear_compound_head(p);
                if (!demote)
                        set_page_refcounted(p);
        }
  
 -      set_compound_order(page, 0);
 -#ifdef CONFIG_64BIT
 -      page[1].compound_nr = 0;
 -#endif
 -      __ClearPageHead(page);
 +      folio_set_compound_order(folio, 0);
 +      __folio_clear_head(folio);
  }
  
 -static void destroy_compound_hugetlb_page_for_demote(struct page *page,
 +static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
                                        unsigned int order)
  {
 -      __destroy_compound_gigantic_page(page, order, true);
 +      __destroy_compound_gigantic_folio(folio, order, true);
  }
  
  #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
 -static void destroy_compound_gigantic_page(struct page *page,
 +static void destroy_compound_gigantic_folio(struct folio *folio,
                                        unsigned int order)
  {
 -      __destroy_compound_gigantic_page(page, order, false);
 +      __destroy_compound_gigantic_folio(folio, order, false);
  }
  
 -static void free_gigantic_page(struct page *page, unsigned int order)
 +static void free_gigantic_folio(struct folio *folio, unsigned int order)
  {
        /*
         * If the page isn't allocated using the cma allocator,
         * cma_release() returns false.
         */
  #ifdef CONFIG_CMA
 -      if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
 +      int nid = folio_nid(folio);
 +
 +      if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
                return;
  #endif
  
 -      free_contig_range(page_to_pfn(page), 1 << order);
 +      free_contig_range(folio_pfn(folio), 1 << order);
  }
  
  #ifdef CONFIG_CONTIG_ALLOC
 -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
  {
 +      struct page *page;
        unsigned long nr_pages = pages_per_huge_page(h);
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
  
  #ifdef CONFIG_CMA
        {
 -              struct page *page;
                int node;
  
                if (hugetlb_cma[nid]) {
                        page = cma_alloc(hugetlb_cma[nid], nr_pages,
                                        huge_page_order(h), true);
                        if (page)
 -                              return page;
 +                              return page_folio(page);
                }
  
                if (!(gfp_mask & __GFP_THISNODE)) {
                                page = cma_alloc(hugetlb_cma[node], nr_pages,
                                                huge_page_order(h), true);
                                if (page)
 -                                      return page;
 +                                      return page_folio(page);
                        }
                }
        }
  #endif
  
 -      return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 +      page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 +      return page ? page_folio(page) : NULL;
  }
  
  #else /* !CONFIG_CONTIG_ALLOC */
 -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
  {
        return NULL;
  #endif /* CONFIG_CONTIG_ALLOC */
  
  #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
 -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
  {
        return NULL;
  }
 -static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 -static inline void destroy_compound_gigantic_page(struct page *page,
 +static inline void free_gigantic_folio(struct folio *folio,
 +                                              unsigned int order) { }
 +static inline void destroy_compound_gigantic_folio(struct folio *folio,
                                                unsigned int order) { }
  #endif
  
  /*
 - * Remove hugetlb page from lists, and update dtor so that page appears
 + * Remove hugetlb folio from lists, and update dtor so that the folio appears
   * as just a compound page.
   *
 - * A reference is held on the page, except in the case of demote.
 + * A reference is held on the folio, except in the case of demote.
   *
   * Must be called with hugetlb lock held.
   */
 -static void __remove_hugetlb_page(struct hstate *h, struct page *page,
 +static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
                                                        bool adjust_surplus,
                                                        bool demote)
  {
 -      int nid = page_to_nid(page);
 +      int nid = folio_nid(folio);
  
 -      VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
 -      VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
 +      VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
 +      VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
  
        lockdep_assert_held(&hugetlb_lock);
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
  
 -      list_del(&page->lru);
 +      list_del(&folio->lru);
  
 -      if (HPageFreed(page)) {
 +      if (folio_test_hugetlb_freed(folio)) {
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
        }
         *
         * For gigantic pages set the destructor to the null dtor.  This
         * destructor will never be called.  Before freeing the gigantic
 -       * page destroy_compound_gigantic_page will turn the compound page
 -       * into a simple group of pages.  After this the destructor does not
 +       * page destroy_compound_gigantic_folio will turn the folio into a
 +       * simple group of pages.  After this the destructor does not
         * apply.
         *
         * This handles the case where more than one ref is held when and
 -       * after update_and_free_page is called.
 +       * after update_and_free_hugetlb_folio is called.
         *
         * In the case of demote we do not ref count the page as it will soon
         * be turned into a page of smaller size.
         */
        if (!demote)
 -              set_page_refcounted(page);
 +              folio_ref_unfreeze(folio, 1);
        if (hstate_is_gigantic(h))
 -              set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
 +              folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
        else
 -              set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
 +              folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
  
        h->nr_huge_pages--;
        h->nr_huge_pages_node[nid]--;
  }
  
 -static void remove_hugetlb_page(struct hstate *h, struct page *page,
 +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
                                                        bool adjust_surplus)
  {
 -      __remove_hugetlb_page(h, page, adjust_surplus, false);
 +      __remove_hugetlb_folio(h, folio, adjust_surplus, false);
  }
  
 -static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
 +static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
                                                        bool adjust_surplus)
  {
 -      __remove_hugetlb_page(h, page, adjust_surplus, true);
 +      __remove_hugetlb_folio(h, folio, adjust_surplus, true);
  }
  
 -static void add_hugetlb_page(struct hstate *h, struct page *page,
 +static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
                             bool adjust_surplus)
  {
        int zeroed;
 -      int nid = page_to_nid(page);
 +      int nid = folio_nid(folio);
  
 -      VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
 +      VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
  
        lockdep_assert_held(&hugetlb_lock);
  
 -      INIT_LIST_HEAD(&page->lru);
 +      INIT_LIST_HEAD(&folio->lru);
        h->nr_huge_pages++;
        h->nr_huge_pages_node[nid]++;
  
                h->surplus_huge_pages_node[nid]++;
        }
  
 -      set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 -      set_page_private(page, 0);
 +      folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
 +      folio_change_private(folio, NULL);
        /*
 -       * We have to set HPageVmemmapOptimized again as above
 -       * set_page_private(page, 0) cleared it.
 +       * We have to set hugetlb_vmemmap_optimized again as above
 +       * folio_change_private(folio, NULL) cleared it.
         */
 -      SetHPageVmemmapOptimized(page);
 +      folio_set_hugetlb_vmemmap_optimized(folio);
  
        /*
 -       * This page is about to be managed by the hugetlb allocator and
 +       * This folio is about to be managed by the hugetlb allocator and
         * should have no users.  Drop our reference, and check for others
         * just in case.
         */
 -      zeroed = put_page_testzero(page);
 -      if (!zeroed)
 +      zeroed = folio_put_testzero(folio);
 +      if (unlikely(!zeroed))
                /*
                 * It is VERY unlikely someone else has taken a ref on
                 * the page.  In this case, we simply return as the
                 */
                return;
  
 -      arch_clear_hugepage_flags(page);
 -      enqueue_huge_page(h, page);
 +      arch_clear_hugepage_flags(&folio->page);
 +      enqueue_hugetlb_folio(h, folio);
  }
  
  static void __update_and_free_page(struct hstate *h, struct page *page)
  {
        int i;
 +      struct folio *folio = page_folio(page);
        struct page *subpage;
  
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
         * If we don't know which subpages are hwpoisoned, we can't free
         * the hugepage, so it's leaked intentionally.
         */
 -      if (HPageRawHwpUnreliable(page))
 +      if (folio_test_hugetlb_raw_hwp_unreliable(folio))
                return;
  
        if (hugetlb_vmemmap_restore(h, page)) {
                 * page and put the page back on the hugetlb free list and treat
                 * as a surplus page.
                 */
 -              add_hugetlb_page(h, page, true);
 +              add_hugetlb_folio(h, folio, true);
                spin_unlock_irq(&hugetlb_lock);
                return;
        }
         * Move PageHWPoison flag from head page to the raw error pages,
         * which makes any healthy subpages reusable.
         */
 -      if (unlikely(PageHWPoison(page)))
 -              hugetlb_clear_page_hwpoison(page);
 +      if (unlikely(folio_test_hwpoison(folio)))
 +              hugetlb_clear_page_hwpoison(&folio->page);
  
        for (i = 0; i < pages_per_huge_page(h); i++) {
 -              subpage = nth_page(page, i);
 +              subpage = folio_page(folio, i);
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_private |
  
        /*
         * Non-gigantic pages demoted from CMA allocated gigantic pages
 -       * need to be given back to CMA in free_gigantic_page.
 +       * need to be given back to CMA in free_gigantic_folio.
         */
        if (hstate_is_gigantic(h) ||
 -          hugetlb_cma_page(page, huge_page_order(h))) {
 -              destroy_compound_gigantic_page(page, huge_page_order(h));
 -              free_gigantic_page(page, huge_page_order(h));
 +          hugetlb_cma_folio(folio, huge_page_order(h))) {
 +              destroy_compound_gigantic_folio(folio, huge_page_order(h));
 +              free_gigantic_folio(folio, huge_page_order(h));
        } else {
                __free_pages(page, huge_page_order(h));
        }
  }
  
  /*
 - * As update_and_free_page() can be called under any context, so we cannot
 + * Since update_and_free_hugetlb_folio() can be called from any context, we cannot
   * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
   * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
   * the vmemmap pages.
@@@ -1642,9 -1639,8 +1642,9 @@@ static void free_hpage_workfn(struct wo
                /*
                 * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
                 * is going to trigger because a previous call to
 -               * remove_hugetlb_page() will set_compound_page_dtor(page,
 -               * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
 +               * remove_hugetlb_folio() will call folio_set_compound_dtor
 +               * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
 +               * directly.
                 */
                h = size_to_hstate(page_size(page));
  
@@@ -1661,11 -1657,11 +1661,11 @@@ static inline void flush_free_hpage_wor
                flush_work(&free_hpage_work);
  }
  
 -static void update_and_free_page(struct hstate *h, struct page *page,
 +static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
                                 bool atomic)
  {
 -      if (!HPageVmemmapOptimized(page) || !atomic) {
 -              __update_and_free_page(h, page);
 +      if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
 +              __update_and_free_page(h, &folio->page);
                return;
        }
  
         * empty. Otherwise, schedule_work() had been called but the workfn
         * hasn't retrieved the list yet.
         */
 -      if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
 +      if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
                schedule_work(&free_hpage_work);
  }
  
  static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
  {
        struct page *page, *t_page;
 +      struct folio *folio;
  
        list_for_each_entry_safe(page, t_page, list, lru) {
 -              update_and_free_page(h, page, false);
 +              folio = page_folio(page);
 +              update_and_free_hugetlb_folio(h, folio, false);
                cond_resched();
        }
  }
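The deferral described in the comment above (and implemented by free_hpage_workfn() together with hpage_freelist) is an instance of a generic pattern: queue objects on a lock-free llist from any context and drain them from a workqueue where sleeping allocations are allowed. A hedged, self-contained sketch with invented names, not the hugetlb code itself:

#include <linux/llist.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Illustrative only: defer freeing from atomic context to a workqueue. */
struct deferred_item {
	struct llist_node node;
};

static LLIST_HEAD(deferred_list);

static void deferred_workfn(struct work_struct *work)
{
	struct llist_node *head = llist_del_all(&deferred_list);
	struct deferred_item *item, *next;

	llist_for_each_entry_safe(item, next, head, node)
		kfree(item);	/* stand-in for the real (sleeping) free path */
}
static DECLARE_WORK(deferred_work, deferred_workfn);

static void defer_free(struct deferred_item *item)
{
	/* Only the add that makes the list non-empty needs to kick the work. */
	if (llist_add(&item->node, &deferred_list))
		schedule_work(&deferred_work);
}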
@@@ -1709,22 -1703,21 +1709,22 @@@ void free_huge_page(struct page *page
         * Can't pass hstate in here because it is called from the
         * compound page destructor.
         */
 -      struct hstate *h = page_hstate(page);
 -      int nid = page_to_nid(page);
 -      struct hugepage_subpool *spool = hugetlb_page_subpool(page);
 +      struct folio *folio = page_folio(page);
 +      struct hstate *h = folio_hstate(folio);
 +      int nid = folio_nid(folio);
 +      struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
        bool restore_reserve;
        unsigned long flags;
  
 -      VM_BUG_ON_PAGE(page_count(page), page);
 -      VM_BUG_ON_PAGE(page_mapcount(page), page);
 +      VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
 +      VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
  
 -      hugetlb_set_page_subpool(page, NULL);
 -      if (PageAnon(page))
 -              __ClearPageAnonExclusive(page);
 -      page->mapping = NULL;
 -      restore_reserve = HPageRestoreReserve(page);
 -      ClearHPageRestoreReserve(page);
 +      hugetlb_set_folio_subpool(folio, NULL);
 +      if (folio_test_anon(folio))
 +              __ClearPageAnonExclusive(&folio->page);
 +      folio->mapping = NULL;
 +      restore_reserve = folio_test_hugetlb_restore_reserve(folio);
 +      folio_clear_hugetlb_restore_reserve(folio);
  
        /*
         * If HPageRestoreReserve was set on page, page allocation consumed a
        }
  
        spin_lock_irqsave(&hugetlb_lock, flags);
 -      ClearHPageMigratable(page);
 -      hugetlb_cgroup_uncharge_page(hstate_index(h),
 -                                   pages_per_huge_page(h), page);
 -      hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
 -                                        pages_per_huge_page(h), page);
 +      folio_clear_hugetlb_migratable(folio);
 +      hugetlb_cgroup_uncharge_folio(hstate_index(h),
 +                                   pages_per_huge_page(h), folio);
 +      hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 +                                        pages_per_huge_page(h), folio);
        if (restore_reserve)
                h->resv_huge_pages++;
  
 -      if (HPageTemporary(page)) {
 -              remove_hugetlb_page(h, page, false);
 +      if (folio_test_hugetlb_temporary(folio)) {
 +              remove_hugetlb_folio(h, folio, false);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
 -              update_and_free_page(h, page, true);
 +              update_and_free_hugetlb_folio(h, folio, true);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
 -              remove_hugetlb_page(h, page, true);
 +              remove_hugetlb_folio(h, folio, true);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
 -              update_and_free_page(h, page, true);
 +              update_and_free_hugetlb_folio(h, folio, true);
        } else {
                arch_clear_hugepage_flags(page);
 -              enqueue_huge_page(h, page);
 +              enqueue_hugetlb_folio(h, folio);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
        }
  }
@@@ -1780,37 -1773,36 +1780,37 @@@ static void __prep_account_new_huge_pag
        h->nr_huge_pages_node[nid]++;
  }
  
 -static void __prep_new_huge_page(struct hstate *h, struct page *page)
 +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
  {
 -      hugetlb_vmemmap_optimize(h, page);
 -      INIT_LIST_HEAD(&page->lru);
 -      set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 -      hugetlb_set_page_subpool(page, NULL);
 -      set_hugetlb_cgroup(page, NULL);
 -      set_hugetlb_cgroup_rsvd(page, NULL);
 +      hugetlb_vmemmap_optimize(h, &folio->page);
 +      INIT_LIST_HEAD(&folio->lru);
 +      folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
 +      hugetlb_set_folio_subpool(folio, NULL);
 +      set_hugetlb_cgroup(folio, NULL);
 +      set_hugetlb_cgroup_rsvd(folio, NULL);
  }
  
 -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 +static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
  {
 -      __prep_new_huge_page(h, page);
 +      __prep_new_hugetlb_folio(h, folio);
        spin_lock_irq(&hugetlb_lock);
        __prep_account_new_huge_page(h, nid);
        spin_unlock_irq(&hugetlb_lock);
  }
  
 -static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
 -                                                              bool demote)
 +static bool __prep_compound_gigantic_folio(struct folio *folio,
 +                                      unsigned int order, bool demote)
  {
        int i, j;
        int nr_pages = 1 << order;
        struct page *p;
  
 -      /* we rely on prep_new_huge_page to set the destructor */
 -      set_compound_order(page, order);
 -      __SetPageHead(page);
 +      __folio_clear_reserved(folio);
 +      __folio_set_head(folio);
 +      /* we rely on prep_new_hugetlb_folio to set the destructor */
 +      folio_set_compound_order(folio, order);
        for (i = 0; i < nr_pages; i++) {
 -              p = nth_page(page, i);
 +              p = folio_page(folio, i);
  
                /*
                 * For gigantic hugepages allocated through bootmem at
                 * on the head page when they need know if put_page() is needed
                 * after get_user_pages().
                 */
 -              __ClearPageReserved(p);
 +              if (i != 0)     /* head page cleared above */
 +                      __ClearPageReserved(p);
                /*
                 * Subtle and very unlikely
                 *
                        VM_BUG_ON_PAGE(page_count(p), p);
                }
                if (i != 0)
 -                      set_compound_head(p, page);
 +                      set_compound_head(p, &folio->page);
        }
 -      atomic_set(compound_mapcount_ptr(page), -1);
 -      atomic_set(compound_pincount_ptr(page), 0);
 +      atomic_set(folio_mapcount_ptr(folio), -1);
 +      atomic_set(folio_subpages_mapcount_ptr(folio), 0);
 +      atomic_set(folio_pincount_ptr(folio), 0);
        return true;
  
  out_error:
        /* undo page modifications made above */
        for (j = 0; j < i; j++) {
 -              p = nth_page(page, j);
 +              p = folio_page(folio, j);
                if (j != 0)
                        clear_compound_head(p);
                set_page_refcounted(p);
        }
        /* need to clear PG_reserved on remaining tail pages  */
        for (; j < nr_pages; j++) {
 -              p = nth_page(page, j);
 +              p = folio_page(folio, j);
                __ClearPageReserved(p);
        }
 -      set_compound_order(page, 0);
 -#ifdef CONFIG_64BIT
 -      page[1].compound_nr = 0;
 -#endif
 -      __ClearPageHead(page);
 +      folio_set_compound_order(folio, 0);
 +      __folio_clear_head(folio);
        return false;
  }
  
 -static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
 +static bool prep_compound_gigantic_folio(struct folio *folio,
 +                                                      unsigned int order)
  {
 -      return __prep_compound_gigantic_page(page, order, false);
 +      return __prep_compound_gigantic_folio(folio, order, false);
  }
  
 -static bool prep_compound_gigantic_page_for_demote(struct page *page,
 +static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
                                                        unsigned int order)
  {
 -      return __prep_compound_gigantic_page(page, order, true);
 +      return __prep_compound_gigantic_folio(folio, order, true);
  }
  
  /*
@@@ -1951,7 -1943,7 +1951,7 @@@ pgoff_t hugetlb_basepage_index(struct p
        return (index << compound_order(page_head)) + compound_idx;
  }
  
 -static struct page *alloc_buddy_huge_page(struct hstate *h,
 +static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
                nodemask_t *node_alloc_noretry)
  {
@@@ -1989,6 -1981,11 +1989,6 @@@ retry
                page = NULL;
        }
  
 -      if (page)
 -              __count_vm_event(HTLB_BUDDY_PGALLOC);
 -      else
 -              __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 -
        /*
        * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page, this
         * indicates an overall state change.  Clear bit so that we resume
        if (node_alloc_noretry && !page && alloc_try_hard)
                node_set(nid, *node_alloc_noretry);
  
 -      return page;
 +      if (!page) {
 +              __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 +              return NULL;
 +      }
 +
 +      __count_vm_event(HTLB_BUDDY_PGALLOC);
 +      return page_folio(page);
  }
  
  /*
   * Note that returned page is 'frozen':  ref count of head page and all tail
   * pages is zero.
   */
 -static struct page *alloc_fresh_huge_page(struct hstate *h,
 +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
                nodemask_t *node_alloc_noretry)
  {
 -      struct page *page;
 +      struct folio *folio;
        bool retry = false;
  
  retry:
        if (hstate_is_gigantic(h))
 -              page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
 +              folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
        else
 -              page = alloc_buddy_huge_page(h, gfp_mask,
 +              folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
                                nid, nmask, node_alloc_noretry);
 -      if (!page)
 +      if (!folio)
                return NULL;
 -
        if (hstate_is_gigantic(h)) {
 -              if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
 +              if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
                        /*
                         * Rare failure to convert pages to compound page.
                         * Free pages and try again - ONCE!
                         */
 -                      free_gigantic_page(page, huge_page_order(h));
 +                      free_gigantic_folio(folio, huge_page_order(h));
                        if (!retry) {
                                retry = true;
                                goto retry;
                        return NULL;
                }
        }
 -      prep_new_huge_page(h, page, page_to_nid(page));
 +      prep_new_hugetlb_folio(h, folio, folio_nid(folio));
  
 -      return page;
 +      return folio;
  }
  
  /*
  static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                                nodemask_t *node_alloc_noretry)
  {
 -      struct page *page;
 +      struct folio *folio;
        int nr_nodes, node;
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
  
        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 -              page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
 -                                              node_alloc_noretry);
 -              if (page)
 -                      break;
 +              folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
 +                                      nodes_allowed, node_alloc_noretry);
 +              if (folio) {
 +                      free_huge_page(&folio->page); /* free it into the hugepage allocator */
 +                      return 1;
 +              }
        }
  
 -      if (!page)
 -              return 0;
 -
 -      free_huge_page(page); /* free it into the hugepage allocator */
 -
 -      return 1;
 +      return 0;
  }
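As noted above, freshly allocated hugetlb pages now come back "frozen", with a reference count of zero on the head and all tail pages. A hedged illustration (invented helper) of what the folio_ref_unfreeze() calls in this file establish:

#include <linux/mm.h>
#include <linux/page_ref.h>

/* Illustrative only: a frozen folio holds no references until unfrozen. */
static void unfreeze_fresh_folio(struct folio *folio)
{
	WARN_ON(folio_ref_count(folio) != 0);	/* frozen: nobody may use it yet */
	folio_ref_unfreeze(folio, 1);		/* publish with exactly one reference */
	WARN_ON(folio_ref_count(folio) != 1);
}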
  
  /*
@@@ -2091,7 -2086,6 +2091,7 @@@ static struct page *remove_pool_huge_pa
  {
        int nr_nodes, node;
        struct page *page = NULL;
 +      struct folio *folio;
  
        lockdep_assert_held(&hugetlb_lock);
        for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                    !list_empty(&h->hugepage_freelists[node])) {
                        page = list_entry(h->hugepage_freelists[node].next,
                                          struct page, lru);
 -                      remove_hugetlb_page(h, page, acct_surplus);
 +                      folio = page_folio(page);
 +                      remove_hugetlb_folio(h, folio, acct_surplus);
                        break;
                }
        }
  int dissolve_free_huge_page(struct page *page)
  {
        int rc = -EBUSY;
 +      struct folio *folio = page_folio(page);
  
  retry:
        /* Not to disrupt normal path by vainly holding hugetlb_lock */
 -      if (!PageHuge(page))
 +      if (!folio_test_hugetlb(folio))
                return 0;
  
        spin_lock_irq(&hugetlb_lock);
 -      if (!PageHuge(page)) {
 +      if (!folio_test_hugetlb(folio)) {
                rc = 0;
                goto out;
        }
  
 -      if (!page_count(page)) {
 -              struct page *head = compound_head(page);
 -              struct hstate *h = page_hstate(head);
 +      if (!folio_ref_count(folio)) {
 +              struct hstate *h = folio_hstate(folio);
                if (!available_huge_pages(h))
                        goto out;
  
                 * We should make sure that the page is already on the free list
                 * when it is dissolved.
                 */
 -              if (unlikely(!HPageFreed(head))) {
 +              if (unlikely(!folio_test_hugetlb_freed(folio))) {
                        spin_unlock_irq(&hugetlb_lock);
                        cond_resched();
  
                        goto retry;
                }
  
 -              remove_hugetlb_page(h, head, false);
 +              remove_hugetlb_folio(h, folio, false);
                h->max_huge_pages--;
                spin_unlock_irq(&hugetlb_lock);
  
                /*
 -               * Normally update_and_free_page will allocate required vmemmmap
 -               * before freeing the page.  update_and_free_page will fail to
 +               * Normally update_and_free_hugetlb_folio will allocate required vmemmap
 +               * before freeing the page.  update_and_free_hugetlb_folio will fail to
                 * free the page if it cannot allocate required vmemmap.  We
                 * need to adjust max_huge_pages if the page is not freed.
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 */
 -              rc = hugetlb_vmemmap_restore(h, head);
 +              rc = hugetlb_vmemmap_restore(h, &folio->page);
                if (!rc) {
 -                      update_and_free_page(h, head, false);
 +                      update_and_free_hugetlb_folio(h, folio, false);
                } else {
                        spin_lock_irq(&hugetlb_lock);
 -                      add_hugetlb_page(h, head, false);
 +                      add_hugetlb_folio(h, folio, false);
                        h->max_huge_pages++;
                        spin_unlock_irq(&hugetlb_lock);
                }
@@@ -2234,7 -2227,7 +2234,7 @@@ int dissolve_free_huge_pages(unsigned l
  static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                                                int nid, nodemask_t *nmask)
  {
 -      struct page *page = NULL;
 +      struct folio *folio = NULL;
  
        if (hstate_is_gigantic(h))
                return NULL;
                goto out_unlock;
        spin_unlock_irq(&hugetlb_lock);
  
 -      page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
 -      if (!page)
 +      folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
 +      if (!folio)
                return NULL;
  
        spin_lock_irq(&hugetlb_lock);
         * codeflow
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 -              SetHPageTemporary(page);
 +              folio_set_hugetlb_temporary(folio);
                spin_unlock_irq(&hugetlb_lock);
 -              free_huge_page(page);
 +              free_huge_page(&folio->page);
                return NULL;
        }
  
        h->surplus_huge_pages++;
 -      h->surplus_huge_pages_node[page_to_nid(page)]++;
 +      h->surplus_huge_pages_node[folio_nid(folio)]++;
  
  out_unlock:
        spin_unlock_irq(&hugetlb_lock);
  
 -      return page;
 +      return &folio->page;
  }
  
  static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
                                     int nid, nodemask_t *nmask)
  {
 -      struct page *page;
 +      struct folio *folio;
  
        if (hstate_is_gigantic(h))
                return NULL;
  
 -      page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
 -      if (!page)
 +      folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
 +      if (!folio)
                return NULL;
  
        /* fresh huge pages are frozen */
 -      set_page_refcounted(page);
 -
 +      folio_ref_unfreeze(folio, 1);
        /*
         * We do not account these pages as surplus because they are only
         * temporary and will be released properly on the last reference
         */
 -      SetHPageTemporary(page);
 +      folio_set_hugetlb_temporary(folio);
  
 -      return page;
 +      return &folio->page;
  }
  
  /*
@@@ -2434,7 -2428,7 +2434,7 @@@ retry
                if ((--needed) < 0)
                        break;
                /* Add the page to the hugetlb allocator */
 -              enqueue_huge_page(h, page);
 +              enqueue_hugetlb_folio(h, page_folio(page));
        }
  free:
        spin_unlock_irq(&hugetlb_lock);
@@@ -2741,52 -2735,51 +2741,52 @@@ void restore_reserve_on_error(struct hs
  }
  
  /*
 - * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
 + * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
 + * the old one
   * @h: struct hstate old page belongs to
 - * @old_page: Old page to dissolve
 + * @old_folio: Old folio to dissolve
   * @list: List to isolate the page in case we need to
   * Returns 0 on success, otherwise negated error.
   */
 -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 -                                      struct list_head *list)
 +static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
 +                      struct folio *old_folio, struct list_head *list)
  {
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 -      int nid = page_to_nid(old_page);
 -      struct page *new_page;
 +      int nid = folio_nid(old_folio);
 +      struct folio *new_folio;
        int ret = 0;
  
        /*
 -       * Before dissolving the page, we need to allocate a new one for the
 -       * pool to remain stable.  Here, we allocate the page and 'prep' it
 +       * Before dissolving the folio, we need to allocate a new one for the
 +       * pool to remain stable.  Here, we allocate the folio and 'prep' it
         * by doing everything but actually updating counters and adding to
         * the pool.  This simplifies things and lets us do most of the processing
         * under the lock.
         */
 -      new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
 -      if (!new_page)
 +      new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
 +      if (!new_folio)
                return -ENOMEM;
 -      __prep_new_huge_page(h, new_page);
 +      __prep_new_hugetlb_folio(h, new_folio);
  
  retry:
        spin_lock_irq(&hugetlb_lock);
 -      if (!PageHuge(old_page)) {
 +      if (!folio_test_hugetlb(old_folio)) {
                /*
 -               * Freed from under us. Drop new_page too.
 +               * Freed from under us. Drop new_folio too.
                 */
                goto free_new;
 -      } else if (page_count(old_page)) {
 +      } else if (folio_ref_count(old_folio)) {
                /*
 -               * Someone has grabbed the page, try to isolate it here.
 +               * Someone has grabbed the folio, try to isolate it here.
                 * Fail with -EBUSY if not possible.
                 */
                spin_unlock_irq(&hugetlb_lock);
 -              ret = isolate_hugetlb(old_page, list);
 +              ret = isolate_hugetlb(&old_folio->page, list);
                spin_lock_irq(&hugetlb_lock);
                goto free_new;
 -      } else if (!HPageFreed(old_page)) {
 +      } else if (!folio_test_hugetlb_freed(old_folio)) {
                /*
 -               * Page's refcount is 0 but it has not been enqueued in the
 +               * Folio's refcount is 0 but it has not been enqueued in the
                 * freelist yet. Race window is small, so we can succeed here if
                 * we retry.
                 */
                goto retry;
        } else {
                /*
 -               * Ok, old_page is still a genuine free hugepage. Remove it from
 +               * Ok, old_folio is still a genuine free hugepage. Remove it from
                 * the freelist and decrease the counters. These will be
                 * incremented again when calling __prep_account_new_huge_page()
 -               * and enqueue_huge_page() for new_page. The counters will remain
 -               * stable since this happens under the lock.
 +               * and enqueue_hugetlb_folio() for new_folio. The counters will
 +               * remain stable since this happens under the lock.
                 */
 -              remove_hugetlb_page(h, old_page, false);
 +              remove_hugetlb_folio(h, old_folio, false);
  
                /*
 -               * Ref count on new page is already zero as it was dropped
 +               * Ref count on new_folio is already zero as it was dropped
                 * earlier.  It can be directly added to the pool free list.
                 */
                __prep_account_new_huge_page(h, nid);
 -              enqueue_huge_page(h, new_page);
 +              enqueue_hugetlb_folio(h, new_folio);
  
                /*
 -               * Pages have been replaced, we can safely free the old one.
 +               * Folio has been replaced, we can safely free the old one.
                 */
                spin_unlock_irq(&hugetlb_lock);
 -              update_and_free_page(h, old_page, false);
 +              update_and_free_hugetlb_folio(h, old_folio, false);
        }
  
        return ret;
  
  free_new:
        spin_unlock_irq(&hugetlb_lock);
 -      /* Page has a zero ref count, but needs a ref to be freed */
 -      set_page_refcounted(new_page);
 -      update_and_free_page(h, new_page, false);
 +      /* Folio has a zero ref count, but needs a ref to be freed */
 +      folio_ref_unfreeze(new_folio, 1);
 +      update_and_free_hugetlb_folio(h, new_folio, false);
  
        return ret;
  }
  int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
  {
        struct hstate *h;
 -      struct page *head;
 +      struct folio *folio = page_folio(page);
        int ret = -EBUSY;
  
        /*
         * Return success when racing as if we dissolved the page ourselves.
         */
        spin_lock_irq(&hugetlb_lock);
 -      if (PageHuge(page)) {
 -              head = compound_head(page);
 -              h = page_hstate(head);
 +      if (folio_test_hugetlb(folio)) {
 +              h = folio_hstate(folio);
        } else {
                spin_unlock_irq(&hugetlb_lock);
                return 0;
        if (hstate_is_gigantic(h))
                return -ENOMEM;
  
 -      if (page_count(head) && !isolate_hugetlb(head, list))
 +      if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
                ret = 0;
 -      else if (!page_count(head))
 -              ret = alloc_and_dissolve_huge_page(h, head, list);
 +      else if (!folio_ref_count(folio))
 +              ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
  
        return ret;
  }
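Both dissolve_free_huge_page() and alloc_and_dissolve_hugetlb_folio() follow the same shape: a cheap unlocked check, then take hugetlb_lock and re-check before acting, retrying or bailing out if the folio changed underneath. A hedged, generic sketch of that shape (all names invented, not the hugetlb code itself):

#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative only: check, lock, re-check, then act under the lock. */
static int guarded_operation(spinlock_t *lock, bool (*still_valid)(void),
			     void (*do_op)(void))
{
	if (!still_valid())		/* cheap unlocked fast path */
		return 0;

	spin_lock_irq(lock);
	if (!still_valid()) {		/* raced: state changed under us */
		spin_unlock_irq(lock);
		return 0;
	}
	do_op();			/* state is stable while the lock is held */
	spin_unlock_irq(lock);
	return 1;
}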
@@@ -2870,7 -2864,6 +2870,7 @@@ struct page *alloc_huge_page(struct vm_
        struct hugepage_subpool *spool = subpool_vma(vma);
        struct hstate *h = hstate_vma(vma);
        struct page *page;
 +      struct folio *folio;
        long map_chg, map_commit;
        long gbl_chg;
        int ret, idx;
                set_page_refcounted(page);
                /* Fall through */
        }
 +      folio = page_folio(page);
        hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
        /* If allocation is not consuming a reservation, also store the
         * hugetlb_cgroup pointer on the page.
                rsv_adjust = hugepage_subpool_put_pages(spool, 1);
                hugetlb_acct_memory(h, -rsv_adjust);
                if (deferred_reserve)
 -                      hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
 -                                      pages_per_huge_page(h), page);
 +                      hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 +                                      pages_per_huge_page(h), folio);
        }
        return page;
  
@@@ -3044,18 -3036,17 +3044,18 @@@ static void __init gather_bootmem_preal
  
        list_for_each_entry(m, &huge_boot_pages, list) {
                struct page *page = virt_to_page(m);
 +              struct folio *folio = page_folio(page);
                struct hstate *h = m->hstate;
  
                VM_BUG_ON(!hstate_is_gigantic(h));
 -              WARN_ON(page_count(page) != 1);
 -              if (prep_compound_gigantic_page(page, huge_page_order(h))) {
 -                      WARN_ON(PageReserved(page));
 -                      prep_new_huge_page(h, page, page_to_nid(page));
 +              WARN_ON(folio_ref_count(folio) != 1);
 +              if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
 +                      WARN_ON(folio_test_reserved(folio));
 +                      prep_new_hugetlb_folio(h, folio, folio_nid(folio));
                        free_huge_page(page); /* add to the hugepage allocator */
                } else {
                        /* VERY unlikely inflated ref count on a tail page */
 -                      free_gigantic_page(page, huge_page_order(h));
 +                      free_gigantic_folio(folio, huge_page_order(h));
                }
  
                /*
@@@ -3077,14 -3068,14 +3077,14 @@@ static void __init hugetlb_hstate_alloc
                        if (!alloc_bootmem_huge_page(h, nid))
                                break;
                } else {
 -                      struct page *page;
 +                      struct folio *folio;
                        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
  
 -                      page = alloc_fresh_huge_page(h, gfp_mask, nid,
 +                      folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
                                        &node_states[N_MEMORY], NULL);
 -                      if (!page)
 +                      if (!folio)
                                break;
 -                      free_huge_page(page); /* free it into the hugepage allocator */
 +                      free_huge_page(&folio->page); /* free it into the hugepage allocator */
                }
                cond_resched();
        }
@@@ -3229,7 -3220,7 +3229,7 @@@ static void try_to_free_low(struct hsta
                                goto out;
                        if (PageHighMem(page))
                                continue;
 -                      remove_hugetlb_page(h, page, false);
 +                      remove_hugetlb_folio(h, page_folio(page), false);
                        list_add(&page->lru, &page_list);
                }
        }
@@@ -3434,13 -3425,12 +3434,13 @@@ static int demote_free_huge_page(struc
  {
        int i, nid = page_to_nid(page);
        struct hstate *target_hstate;
 +      struct folio *folio = page_folio(page);
        struct page *subpage;
        int rc = 0;
  
        target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
  
 -      remove_hugetlb_page_for_demote(h, page, false);
 +      remove_hugetlb_folio_for_demote(h, folio, false);
        spin_unlock_irq(&hugetlb_lock);
  
        rc = hugetlb_vmemmap_restore(h, page);
                /* Allocation of vmemmap failed, we cannot demote page */
                spin_lock_irq(&hugetlb_lock);
                set_page_refcounted(page);
 -              add_hugetlb_page(h, page, false);
 +              add_hugetlb_folio(h, page_folio(page), false);
                return rc;
        }
  
        /*
 -       * Use destroy_compound_hugetlb_page_for_demote for all huge page
 +       * Use destroy_compound_hugetlb_folio_for_demote for all huge page
         * sizes as it will not ref count pages.
         */
 -      destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
 +      destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
  
        /*
         * Taking target hstate mutex synchronizes with set_max_huge_pages.
        for (i = 0; i < pages_per_huge_page(h);
                                i += pages_per_huge_page(target_hstate)) {
                subpage = nth_page(page, i);
 +              folio = page_folio(subpage);
                if (hstate_is_gigantic(target_hstate))
 -                      prep_compound_gigantic_page_for_demote(subpage,
 +                      prep_compound_gigantic_folio_for_demote(folio,
                                                        target_hstate->order);
                else
                        prep_compound_page(subpage, target_hstate->order);
                set_page_private(subpage, 0);
 -              prep_new_huge_page(target_hstate, subpage, nid);
 +              prep_new_hugetlb_folio(target_hstate, folio, nid);
                free_huge_page(subpage);
        }
        mutex_unlock(&target_hstate->resize_lock);
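
The demotion loop above walks the source huge page in strides of pages_per_huge_page(target_hstate), preparing one smaller hugetlb folio per stride. A quick userspace check of that arithmetic, assuming x86-64 values (4 KiB base pages, a 1 GiB page demoted to 2 MiB); the orders below are illustrative assumptions, not taken from the code:

#include <stdio.h>

int main(void)
{
	const unsigned long base_page = 4096;	/* assumed 4 KiB base page */
	const unsigned int src_order = 18;	/* assumed 1 GiB huge page */
	const unsigned int dst_order = 9;	/* assumed 2 MiB huge page */
	unsigned long src_pages = 1UL << src_order;
	unsigned long dst_pages = 1UL << dst_order;
	unsigned long steps = 0;

	/* Mirrors: for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target)) */
	for (unsigned long i = 0; i < src_pages; i += dst_pages)
		steps++;	/* one prep_new_hugetlb_folio() + free per stride */

	printf("%lu MiB page demotes into %lu pages of %lu KiB\n",
	       src_pages * base_page >> 20, steps, dst_pages * base_page >> 10);
	return 0;
}
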
@@@ -4786,6 -4775,7 +4786,6 @@@ hugetlb_install_page(struct vm_area_str
        hugepage_add_new_anon_rmap(new_page, vma, addr);
        set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
        hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
 -      ClearHPageRestoreReserve(new_page);
        SetHPageMigratable(new_page);
  }
  
@@@ -5074,6 -5064,7 +5074,6 @@@ static void __unmap_hugepage_range(stru
        struct page *page;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
 -      struct mmu_notifier_range range;
        unsigned long last_addr_mask;
        bool force_flush = false;
  
        tlb_change_page_size(tlb, sz);
        tlb_start_vma(tlb, vma);
  
 -      /*
 -       * If sharing possible, alert mmu notifiers of worst case.
 -       */
 -      mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
 -                              end);
 -      adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 -      mmu_notifier_invalidate_range_start(&range);
        last_addr_mask = hugetlb_mask_last_page(h);
        address = start;
        for (; address < end; address += sz) {
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
 -#ifdef CONFIG_PTE_MARKER_UFFD_WP
                        /*
                         * If the pte was wr-protected by uffd-wp in any of the
                         * swap forms, meanwhile the caller does not want to
                                set_huge_pte_at(mm, address, ptep,
                                                make_pte_marker(PTE_MARKER_UFFD_WP));
                        else
 -#endif
                                huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
 -#ifdef CONFIG_PTE_MARKER_UFFD_WP
                /* Leave a uffd-wp pte marker if needed */
                if (huge_pte_uffd_wp(pte) &&
                    !(zap_flags & ZAP_FLAG_DROP_MARKER))
                        set_huge_pte_at(mm, address, ptep,
                                        make_pte_marker(PTE_MARKER_UFFD_WP));
 -#endif
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
  
                if (ref_page)
                        break;
        }
 -      mmu_notifier_invalidate_range_end(&range);
        tlb_end_vma(tlb, vma);
  
        /*
@@@ -5199,43 -5202,29 +5199,43 @@@ void __unmap_hugepage_range_final(struc
        hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
  
 +      /* mmu notification performed in caller */
        __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
  
 -      /*
 -       * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
 -       * the vma_lock is freed, this makes the vma ineligible for pmd
 -       * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
 -       * This is important as page tables for this unmapped range will
 -       * be asynchrously deleted.  If the page tables are shared, there
 -       * will be issues when accessed by someone else.
 -       */
 -      __hugetlb_vma_unlock_write_free(vma);
 -
 -      i_mmap_unlock_write(vma->vm_file->f_mapping);
 +      if (zap_flags & ZAP_FLAG_UNMAP) {       /* final unmap */
 +              /*
 +               * Unlock and free the vma lock before releasing i_mmap_rwsem.
 +               * When the vma_lock is freed, this makes the vma ineligible
 +               * for pmd sharing.  And, i_mmap_rwsem is required to set up
 +               * pmd sharing.  This is important as page tables for this
 +               * unmapped range will be asynchronously deleted.  If the page
 +               * tables are shared, there will be issues when accessed by
 +               * someone else.
 +               */
 +              __hugetlb_vma_unlock_write_free(vma);
 +              i_mmap_unlock_write(vma->vm_file->f_mapping);
 +      } else {
 +              i_mmap_unlock_write(vma->vm_file->f_mapping);
 +              hugetlb_vma_unlock_write(vma);
 +      }
  }
  
  void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page,
                          zap_flags_t zap_flags)
  {
 +      struct mmu_notifier_range range;
        struct mmu_gather tlb;
  
 +      mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 +                              start, end);
 +      adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 +      mmu_notifier_invalidate_range_start(&range);
        tlb_gather_mmu(&tlb, vma->vm_mm);
 +
        __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
 +
 +      mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
  }
  
@@@ -5314,6 -5303,9 +5314,6 @@@ static vm_fault_t hugetlb_wp(struct mm_
        unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
  
 -      VM_BUG_ON(unshare && (flags & FOLL_WRITE));
 -      VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 -
        /*
         * hugetlb does not support FOLL_FORCE-style write faults that keep the
         * PTE mapped R/O such as maybe_mkwrite() would do.
  
        /* Let's take out MAP_SHARED mappings first. */
        if (vma->vm_flags & VM_MAYSHARE) {
 -              if (unlikely(unshare))
 -                      return 0;
                set_huge_ptep_writable(vma, haddr, ptep);
                return 0;
        }
@@@ -5444,6 -5438,8 +5444,6 @@@ retry_avoidcopy
        spin_lock(ptl);
        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 -              ClearHPageRestoreReserve(new_page);
 -
                /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
@@@ -5738,9 -5734,10 +5738,9 @@@ static vm_fault_t hugetlb_no_page(struc
        if (!pte_same(huge_ptep_get(ptep), old_pte))
                goto backout;
  
 -      if (anon_rmap) {
 -              ClearHPageRestoreReserve(page);
 +      if (anon_rmap)
                hugepage_add_new_anon_rmap(page, vma, haddr);
 -      else
 +      else
                page_dup_file_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
@@@ -6114,10 -6111,6 +6114,10 @@@ int hugetlb_mcopy_atomic_pte(struct mm_
  
        ptl = huge_pte_lock(h, dst_mm, dst_pte);
  
 +      ret = -EIO;
 +      if (PageHWPoison(page))
 +              goto out_release_unlock;
 +
        /*
         * We allow to overwrite a pte marker: consider when both MISSING|WP
         * registered, we firstly wr-protect a none pte which has no page cache
        if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
  
 -      if (page_in_pagecache) {
 +      if (page_in_pagecache)
                page_dup_file_rmap(page, true);
 -      } else {
 -              ClearHPageRestoreReserve(page);
 +      else
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 -      }
  
        /*
         * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@@ -6195,8 -6190,7 +6195,8 @@@ static void record_subpages_vmas(struc
        }
  }
  
 -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 +static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
 +                                             unsigned int flags, pte_t *pte,
                                               bool *unshare)
  {
        pte_t pteval = huge_ptep_get(pte);
                return false;
        if (flags & FOLL_WRITE)
                return true;
 -      if (gup_must_unshare(flags, pte_page(pteval))) {
 +      if (gup_must_unshare(vma, flags, pte_page(pteval))) {
                *unshare = true;
                return true;
        }
        return false;
  }
  
 +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 +                              unsigned long address, unsigned int flags)
 +{
 +      struct hstate *h = hstate_vma(vma);
 +      struct mm_struct *mm = vma->vm_mm;
 +      unsigned long haddr = address & huge_page_mask(h);
 +      struct page *page = NULL;
 +      spinlock_t *ptl;
 +      pte_t *pte, entry;
 +
 +      /*
 +       * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
 +       * follow_hugetlb_page().
 +       */
 +      if (WARN_ON_ONCE(flags & FOLL_PIN))
 +              return NULL;
 +
 +retry:
 +      pte = huge_pte_offset(mm, haddr, huge_page_size(h));
 +      if (!pte)
 +              return NULL;
 +
 +      ptl = huge_pte_lock(h, mm, pte);
 +      entry = huge_ptep_get(pte);
 +      if (pte_present(entry)) {
 +              page = pte_page(entry) +
 +                              ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
 +              /*
 +               * Note that page may be a sub-page, and with vmemmap
 +               * optimizations the page struct may be read only.
 +               * try_grab_page() will increase the ref count on the
 +               * head page, so this will be OK.
 +               *
 +               * try_grab_page() should always be able to get the page here,
 +               * because we hold the ptl lock and have verified pte_present().
 +               */
 +              if (try_grab_page(page, flags)) {
 +                      page = NULL;
 +                      goto out;
 +              }
 +      } else {
 +              if (is_hugetlb_entry_migration(entry)) {
 +                      spin_unlock(ptl);
 +                      __migration_entry_wait_huge(pte, ptl);
 +                      goto retry;
 +              }
 +              /*
 +               * hwpoisoned entry is treated as no_page_table in
 +               * follow_page_mask().
 +               */
 +      }
 +out:
 +      spin_unlock(ptl);
 +      return page;
 +}
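
The new hugetlb_follow_page_mask() above locates the sub-page inside the huge page with (address & ~huge_page_mask(h)) >> PAGE_SHIFT before taking a reference on the head page. A small userspace check of that offset arithmetic, assuming a 2 MiB huge page over 4 KiB base pages; the address is an arbitrary example:

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;			/* assumed 4 KiB base page */
	const unsigned long huge_mask = ~((1UL << 21) - 1);	/* assumed 2 MiB huge page */
	unsigned long address = 0x7f0000203000UL;		/* arbitrary example address */

	unsigned long haddr = address & huge_mask;		/* start of the huge page */
	unsigned long subpage = (address & ~huge_mask) >> page_shift;

	printf("haddr=%#lx sub-page index=%lu\n", haddr, subpage);	/* index 3 here */
	return 0;
}
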
 +
  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
                 * directly from any kind of swap entries.
                 */
                if (absent ||
 -                  __follow_hugetlb_must_fault(flags, pte, &unshare)) {
 +                  __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
                        vm_fault_t ret;
                        unsigned int fault_flags = 0;
  
                                fault_flags |= FAULT_FLAG_WRITE;
                        else if (unshare)
                                fault_flags |= FAULT_FLAG_UNSHARE;
-                       if (locked)
+                       if (locked) {
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_KILLABLE;
+                               if (flags & FOLL_INTERRUPTIBLE)
+                                       fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+                       }
                        if (flags & FOLL_NOWAIT)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_RETRY_NOWAIT;
                         * tables. If the huge page is present, then the tail
                         * pages must also be present. The ptl prevents the
                         * head page and tail pages from being rearranged in
 -                       * any way. So this page must be available at this
 -                       * point, unless the page refcount overflowed:
 +                       * any way. As this is hugetlb, the pages will never
 +                       * be p2pdma or otherwise not long-term pinnable. So this page
 +                       * must be available at this point, unless the page
 +                       * refcount overflowed:
                         */
                        if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
                                                         flags))) {
@@@ -7265,6 -7204,122 +7268,6 @@@ __weak unsigned long hugetlb_mask_last_
   * These functions are overwritable if your architecture needs its own
   * behavior.
   */
 -struct page * __weak
 -follow_huge_addr(struct mm_struct *mm, unsigned long address,
 -                            int write)
 -{
 -      return ERR_PTR(-EINVAL);
 -}
 -
 -struct page * __weak
 -follow_huge_pd(struct vm_area_struct *vma,
 -             unsigned long address, hugepd_t hpd, int flags, int pdshift)
 -{
 -      WARN(1, "hugepd follow called with no support for hugepage directory format\n");
 -      return NULL;
 -}
 -
 -struct page * __weak
 -follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
 -{
 -      struct hstate *h = hstate_vma(vma);
 -      struct mm_struct *mm = vma->vm_mm;
 -      struct page *page = NULL;
 -      spinlock_t *ptl;
 -      pte_t *ptep, pte;
 -
 -      /*
 -       * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
 -       * follow_hugetlb_page().
 -       */
 -      if (WARN_ON_ONCE(flags & FOLL_PIN))
 -              return NULL;
 -
 -retry:
 -      ptep = huge_pte_offset(mm, address, huge_page_size(h));
 -      if (!ptep)
 -              return NULL;
 -
 -      ptl = huge_pte_lock(h, mm, ptep);
 -      pte = huge_ptep_get(ptep);
 -      if (pte_present(pte)) {
 -              page = pte_page(pte) +
 -                      ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
 -              /*
 -               * try_grab_page() should always succeed here, because: a) we
 -               * hold the pmd (ptl) lock, and b) we've just checked that the
 -               * huge pmd (head) page is present in the page tables. The ptl
 -               * prevents the head page and tail pages from being rearranged
 -               * in any way. So this page must be available at this point,
 -               * unless the page refcount overflowed:
 -               */
 -              if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
 -                      page = NULL;
 -                      goto out;
 -              }
 -      } else {
 -              if (is_hugetlb_entry_migration(pte)) {
 -                      spin_unlock(ptl);
 -                      __migration_entry_wait_huge(ptep, ptl);
 -                      goto retry;
 -              }
 -              /*
 -               * hwpoisoned entry is treated as no_page_table in
 -               * follow_page_mask().
 -               */
 -      }
 -out:
 -      spin_unlock(ptl);
 -      return page;
 -}
 -
 -struct page * __weak
 -follow_huge_pud(struct mm_struct *mm, unsigned long address,
 -              pud_t *pud, int flags)
 -{
 -      struct page *page = NULL;
 -      spinlock_t *ptl;
 -      pte_t pte;
 -
 -      if (WARN_ON_ONCE(flags & FOLL_PIN))
 -              return NULL;
 -
 -retry:
 -      ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
 -      if (!pud_huge(*pud))
 -              goto out;
 -      pte = huge_ptep_get((pte_t *)pud);
 -      if (pte_present(pte)) {
 -              page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
 -              if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
 -                      page = NULL;
 -                      goto out;
 -              }
 -      } else {
 -              if (is_hugetlb_entry_migration(pte)) {
 -                      spin_unlock(ptl);
 -                      __migration_entry_wait(mm, (pte_t *)pud, ptl);
 -                      goto retry;
 -              }
 -              /*
 -               * hwpoisoned entry is treated as no_page_table in
 -               * follow_page_mask().
 -               */
 -      }
 -out:
 -      spin_unlock(ptl);
 -      return page;
 -}
 -
 -struct page * __weak
 -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
 -{
 -      if (flags & (FOLL_GET | FOLL_PIN))
 -              return NULL;
 -
 -      return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 -}
 -
  int isolate_hugetlb(struct page *page, struct list_head *list)
  {
        int ret = 0;
@@@ -7283,7 -7338,7 +7286,7 @@@ unlock
        return ret;
  }
  
 -int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
 +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
  {
        int ret = 0;
  
                *hugetlb = true;
                if (HPageFreed(page))
                        ret = 0;
 -              else if (HPageMigratable(page))
 +              else if (HPageMigratable(page) || unpoison)
                        ret = get_page_unless_zero(page);
                else
                        ret = -EBUSY;
        return ret;
  }
  
 -int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 +int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 +                              bool *migratable_cleared)
  {
        int ret;
  
        spin_lock_irq(&hugetlb_lock);
 -      ret = __get_huge_page_for_hwpoison(pfn, flags);
 +      ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
        spin_unlock_irq(&hugetlb_lock);
        return ret;
  }
@@@ -7322,15 -7376,15 +7325,15 @@@ void putback_active_hugepage(struct pag
        put_page(page);
  }
  
 -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
  {
 -      struct hstate *h = page_hstate(oldpage);
 +      struct hstate *h = folio_hstate(old_folio);
  
 -      hugetlb_cgroup_migrate(oldpage, newpage);
 -      set_page_owner_migrate_reason(newpage, reason);
 +      hugetlb_cgroup_migrate(old_folio, new_folio);
 +      set_page_owner_migrate_reason(&new_folio->page, reason);
  
        /*
 -       * transfer temporary state of the new huge page. This is
 +       * transfer temporary state of the new hugetlb folio. This is
         * reverse to other transitions because the newpage is going to
         * be final while the old one will be freed so it takes over
         * the temporary status.
         * here as well otherwise the global surplus count will not match
         * the per-node's.
         */
 -      if (HPageTemporary(newpage)) {
 -              int old_nid = page_to_nid(oldpage);
 -              int new_nid = page_to_nid(newpage);
 +      if (folio_test_hugetlb_temporary(new_folio)) {
 +              int old_nid = folio_nid(old_folio);
 +              int new_nid = folio_nid(new_folio);
 +
 +              folio_set_hugetlb_temporary(old_folio);
 +              folio_clear_hugetlb_temporary(new_folio);
  
 -              SetHPageTemporary(oldpage);
 -              ClearHPageTemporary(newpage);
  
                /*
                 * There is no need to transfer the per-node surplus state
diff --combined virt/kvm/kvm_main.c
index fab4d379057853d5c44e2865799dc74efaf6c74e,954ab969f55eb438bac73620b84f1c72f2d2d4d3..13e88297f999631d1322db5bbc04b51159744578
@@@ -1198,6 -1198,8 +1198,6 @@@ static struct kvm *kvm_create_vm(unsign
                        goto out_err_no_arch_destroy_vm;
        }
  
 -      kvm->max_halt_poll_ns = halt_poll_ns;
 -
        r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_no_arch_destroy_vm;
@@@ -1615,7 -1617,7 +1615,7 @@@ static int kvm_prepare_memory_region(st
                        new->dirty_bitmap = NULL;
                else if (old && old->dirty_bitmap)
                        new->dirty_bitmap = old->dirty_bitmap;
-               else if (!kvm->dirty_ring_size) {
+               else if (kvm_use_dirty_bitmap(kvm)) {
                        r = kvm_alloc_dirty_bitmap(new);
                        if (r)
                                return r;
@@@ -1639,6 -1641,8 +1639,8 @@@ static void kvm_commit_memory_region(st
                                     const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
  {
+       int old_flags = old ? old->flags : 0;
+       int new_flags = new ? new->flags : 0;
        /*
         * Update the total number of memslot pages before calling the arch
         * hook so that architectures can consume the result directly.
        else if (change == KVM_MR_CREATE)
                kvm->nr_memslot_pages += new->npages;
  
+       if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
+               int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
+               atomic_set(&kvm->nr_memslots_dirty_logging,
+                          atomic_read(&kvm->nr_memslots_dirty_logging) + change);
+       }
        kvm_arch_commit_memory_region(kvm, old, new, change);
  
        switch (change) {
@@@ -2058,8 -2068,8 +2066,8 @@@ int kvm_get_dirty_log(struct kvm *kvm, 
        unsigned long n;
        unsigned long any = 0;
  
-       /* Dirty ring tracking is exclusive to dirty log tracking */
-       if (kvm->dirty_ring_size)
+       /* Dirty ring tracking may be exclusive to dirty log tracking */
+       if (!kvm_use_dirty_bitmap(kvm))
                return -ENXIO;
  
        *memslot = NULL;
@@@ -2123,8 -2133,8 +2131,8 @@@ static int kvm_get_dirty_log_protect(st
        unsigned long *dirty_bitmap_buffer;
        bool flush;
  
-       /* Dirty ring tracking is exclusive to dirty log tracking */
-       if (kvm->dirty_ring_size)
+       /* Dirty ring tracking may be exclusive to dirty log tracking */
+       if (!kvm_use_dirty_bitmap(kvm))
                return -ENXIO;
  
        as_id = log->slot >> 16;
@@@ -2235,8 -2245,8 +2243,8 @@@ static int kvm_clear_dirty_log_protect(
        unsigned long *dirty_bitmap_buffer;
        bool flush;
  
-       /* Dirty ring tracking is exclusive to dirty log tracking */
-       if (kvm->dirty_ring_size)
+       /* Dirty ring tracking may be exclusive to dirty log tracking */
+       if (!kvm_use_dirty_bitmap(kvm))
                return -ENXIO;
  
        as_id = log->slot >> 16;
@@@ -2512,7 -2522,7 +2520,7 @@@ static bool hva_to_pfn_fast(unsigned lo
   * 1 indicates success, -errno is returned if error is detected.
   */
  static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
-                          bool *writable, kvm_pfn_t *pfn)
+                          bool interruptible, bool *writable, kvm_pfn_t *pfn)
  {
        unsigned int flags = FOLL_HWPOISON;
        struct page *page;
                flags |= FOLL_WRITE;
        if (async)
                flags |= FOLL_NOWAIT;
+       if (interruptible)
+               flags |= FOLL_INTERRUPTIBLE;
  
        npages = get_user_pages_unlocked(addr, 1, &page, flags);
        if (npages != 1)
@@@ -2636,6 -2648,7 +2646,7 @@@ out
   * Pin guest page in memory and return its pfn.
   * @addr: host virtual address which maps memory to the guest
   * @atomic: whether this function can sleep
+  * @interruptible: whether the process can be interrupted by non-fatal signals
   * @async: whether this function needs to wait for IO to complete if the
   *         host page is not yet in memory
   * @write_fault: whether we should get a writable host page
   * 2): @write_fault = false && @writable, @writable will tell the caller
   *     whether the mapping is writable.
   */
- kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-                    bool write_fault, bool *writable)
+ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
+                    bool *async, bool write_fault, bool *writable)
  {
        struct vm_area_struct *vma;
        kvm_pfn_t pfn;
        if (atomic)
                return KVM_PFN_ERR_FAULT;
  
-       npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+       npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
+                                writable, &pfn);
        if (npages == 1)
                return pfn;
+       if (npages == -EINTR)
+               return KVM_PFN_ERR_SIGPENDING;
  
        mmap_read_lock(current->mm);
        if (npages == -EHWPOISON ||
@@@ -2695,8 -2711,8 +2709,8 @@@ exit
  }
  
  kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
-                              bool atomic, bool *async, bool write_fault,
-                              bool *writable, hva_t *hva)
+                              bool atomic, bool interruptible, bool *async,
+                              bool write_fault, bool *writable, hva_t *hva)
  {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
  
                writable = NULL;
        }
  
-       return hva_to_pfn(addr, atomic, async, write_fault,
+       return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
                          writable);
  }
  EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                      bool *writable)
  {
-       return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-                                   write_fault, writable, NULL);
+       return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
+                                   NULL, write_fault, writable, NULL);
  }
  EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
  
  kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
  {
-       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
+                                   NULL, NULL);
  }
  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
  
  kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
  {
-       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
+                                   NULL, NULL);
  }
  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
  
@@@ -3303,18 -3321,19 +3319,19 @@@ void mark_page_dirty_in_slot(struct kv
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  
  #ifdef CONFIG_HAVE_KVM_DIRTY_RING
-       if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
+       if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
                return;
+       WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
  #endif
  
        if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;
                u32 slot = (memslot->as_id << 16) | memslot->id;
  
-               if (kvm->dirty_ring_size)
-                       kvm_dirty_ring_push(&vcpu->dirty_ring,
-                                           slot, rel_gfn);
-               else
+               if (kvm->dirty_ring_size && vcpu)
+                       kvm_dirty_ring_push(vcpu, slot, rel_gfn);
+               else if (memslot->dirty_bitmap)
                        set_bit_le(rel_gfn, memslot->dirty_bitmap);
        }
  }
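
With the change above, a write attributed to a running vcpu is pushed to that vcpu's dirty ring, while a write with no running vcpu (allowed only where kvm_arch_allow_write_without_running_vcpu() permits it) falls back to the memslot bitmap. In the ring-plus-bitmap mode, userspace therefore still collects the bitmap side through the classic ioctl. A hedged sketch, assuming vm_fd is an open VM descriptor, slot is a memslot id, and bitmap is sized for the slot:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Fetch the dirty bitmap for one memslot; ring entries are harvested
 * separately from each vcpu's mapped kvm_dirty_gfn ring. */
static int fetch_bitmap_dirty(int vm_fd, __u32 slot, void *bitmap)
{
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};

	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}
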
@@@ -3375,6 -3394,9 +3392,6 @@@ static void grow_halt_poll_ns(struct kv
        if (val < grow_start)
                val = grow_start;
  
 -      if (val > vcpu->kvm->max_halt_poll_ns)
 -              val = vcpu->kvm->max_halt_poll_ns;
 -
        vcpu->halt_poll_ns = val;
  out:
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
@@@ -3478,24 -3500,6 +3495,24 @@@ static inline void update_halt_poll_sta
        }
  }
  
 +static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
 +{
 +      struct kvm *kvm = vcpu->kvm;
 +
 +      if (kvm->override_halt_poll_ns) {
 +              /*
 +               * Ensure kvm->max_halt_poll_ns is not read before
 +               * kvm->override_halt_poll_ns.
 +               *
 +               * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
 +               */
 +              smp_rmb();
 +              return READ_ONCE(kvm->max_halt_poll_ns);
 +      }
 +
 +      return READ_ONCE(halt_poll_ns);
 +}
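
The smp_rmb() in kvm_vcpu_max_halt_poll_ns() pairs with the smp_wmb() added in the KVM_CAP_HALT_POLL handling later in this diff: the per-VM max_halt_poll_ns is written first, then override_halt_poll_ns is published, so a reader that observes the flag is guaranteed to see the value. A C11 analogue of that publish pattern using release/acquire; the names mirror the kernel fields, but everything else is illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned int max_halt_poll_ns;
static atomic_bool override_halt_poll_ns;

static void enable_halt_poll_cap(unsigned int ns)
{
	max_halt_poll_ns = ns;				/* data first... */
	atomic_store_explicit(&override_halt_poll_ns, true,
			      memory_order_release);	/* ...then publish (smp_wmb) */
}

static unsigned int vcpu_max_halt_poll_ns(unsigned int module_default)
{
	if (atomic_load_explicit(&override_halt_poll_ns,
				 memory_order_acquire))	/* pairs with release (smp_rmb) */
		return max_halt_poll_ns;
	return module_default;
}

int main(void)
{
	printf("before override: %u\n", vcpu_max_halt_poll_ns(500000));
	enable_halt_poll_cap(200000);
	printf("after override:  %u\n", vcpu_max_halt_poll_ns(500000));
	return 0;
}
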
 +
  /*
   * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
   * polling is enabled, busy wait for a short time before blocking to avoid the
   */
  void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
  {
 +      unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
        bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
 -      bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
        ktime_t start, cur, poll_end;
        bool waited = false;
 +      bool do_halt_poll;
        u64 halt_ns;
  
 +      if (vcpu->halt_poll_ns > max_halt_poll_ns)
 +              vcpu->halt_poll_ns = max_halt_poll_ns;
 +
 +      do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
 +
        start = cur = poll_end = ktime_get();
        if (do_halt_poll) {
                ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
  
                do {
-                       /*
-                        * This sets KVM_REQ_UNHALT if an interrupt
-                        * arrives.
-                        */
                        if (kvm_vcpu_check_block(vcpu) < 0)
                                goto out;
                        cpu_relax();
                update_halt_poll_stats(vcpu, start, poll_end, !waited);
  
        if (halt_poll_allowed) {
 +              /* Recompute the max halt poll time in case it changed. */
 +              max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
 +
                if (!vcpu_valid_wakeup(vcpu)) {
                        shrink_halt_poll_ns(vcpu);
 -              } else if (vcpu->kvm->max_halt_poll_ns) {
 +              } else if (max_halt_poll_ns) {
                        if (halt_ns <= vcpu->halt_poll_ns)
                                ;
                        /* we had a long block, shrink polling */
                        else if (vcpu->halt_poll_ns &&
 -                               halt_ns > vcpu->kvm->max_halt_poll_ns)
 +                               halt_ns > max_halt_poll_ns)
                                shrink_halt_poll_ns(vcpu);
                        /* we had a short halt and our poll time is too small */
 -                      else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
 -                               halt_ns < vcpu->kvm->max_halt_poll_ns)
 +                      else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
 +                               halt_ns < max_halt_poll_ns)
                                grow_halt_poll_ns(vcpu);
                } else {
                        vcpu->halt_poll_ns = 0;
@@@ -4505,6 -4496,9 +4518,9 @@@ static long kvm_vm_ioctl_check_extensio
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
  #else
                return 0;
+ #endif
+ #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
+       case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
  #endif
        case KVM_CAP_BINARY_STATS_FD:
        case KVM_CAP_SYSTEM_EVENT_DATA:
@@@ -4581,6 -4575,20 +4597,20 @@@ int __attribute__((weak)) kvm_vm_ioctl_
        return -EINVAL;
  }
  
+ static bool kvm_are_all_memslots_empty(struct kvm *kvm)
+ {
+       int i;
+       lockdep_assert_held(&kvm->slots_lock);
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
+                       return false;
+       }
+       return true;
+ }
  static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                                           struct kvm_enable_cap *cap)
  {
                        return -EINVAL;
  
                kvm->max_halt_poll_ns = cap->args[0];
 +
 +              /*
 +               * Ensure kvm->override_halt_poll_ns does not become visible
 +               * before kvm->max_halt_poll_ns.
 +               *
 +               * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
 +               */
 +              smp_wmb();
 +              kvm->override_halt_poll_ns = true;
 +
                return 0;
        }
        case KVM_CAP_DIRTY_LOG_RING:
                        return -EINVAL;
  
                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+       case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
+               int r = -EINVAL;
+               if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
+                   !kvm->dirty_ring_size || cap->flags)
+                       return r;
+               mutex_lock(&kvm->slots_lock);
+               /*
+                * For simplicity, allow enabling ring+bitmap if and only if
+                * there are no memslots, e.g. to ensure all memslots allocate
+                * a bitmap after the capability is enabled.
+                */
+               if (kvm_are_all_memslots_empty(kvm)) {
+                       kvm->dirty_ring_with_bitmap = true;
+                       r = 0;
+               }
+               mutex_unlock(&kvm->slots_lock);
+               return r;
+       }
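
The new capability above can only be enabled while the VM has no memslots, and only after the dirty ring itself has been enabled. A userspace sketch of that ordering, assuming a <linux/kvm.h> recent enough to define both capabilities and that vm_fd is a freshly created VM with no memory regions yet:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_ring_with_bitmap(int vm_fd, __u64 ring_bytes)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;	/* or KVM_CAP_DIRTY_LOG_RING */
	cap.args[0] = ring_bytes;			/* per-vcpu ring size in bytes */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return -1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP;
	/* Must run before any KVM_SET_USER_MEMORY_REGION, per the check above. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
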
        default:
                return kvm_vm_ioctl_enable_cap(kvm, cap);
        }
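
For comparison, the KVM_CAP_HALT_POLL handling earlier in this hunk (where kvm->max_halt_poll_ns is set from cap->args[0]) means that, once enabled, the VM follows its own maximum, including 0 to disable polling, even if the halt_poll_ns module parameter later changes. A minimal userspace sketch, assuming vm_fd is an existing VM descriptor:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_vm_max_halt_poll_ns(int vm_fd, __u64 ns)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HALT_POLL;
	cap.args[0] = ns;	/* new per-VM maximum poll time, in nanoseconds */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
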