Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <[email protected]>
Sun, 24 Nov 2024 00:00:50 +0000 (16:00 -0800)
committer Linus Torvalds <[email protected]>
Sun, 24 Nov 2024 00:00:50 +0000 (16:00 -0800)
Pull kvm updates from Paolo Bonzini:
 "The biggest change here is eliminating the awful idea that KVM had of
  essentially guessing which pfns are refcounted pages.

  The reason to do so was that KVM needs to map both non-refcounted
  pages (for example BARs of VFIO devices) and VM_PFNMAP/VM_MIXEDMAP
  VMAs that contain refcounted pages.

  However, the result was security issues in the past, and more recently
  the inability to map VM_IO and VM_PFNMAP memory that _is_ backed by
  struct page but is not refcounted. In particular this broke virtio-gpu
  blob resources (which directly map host graphics buffers into the
  guest as "vram" for the virtio-gpu device) with the amdgpu driver,
  because amdgpu allocates non-compound higher order pages and the tail
  pages could not be mapped into KVM.

  This requires adjusting all uses of struct page in the
  per-architecture code, to always work on the pfn whenever possible.
  The large series that did this, from David Stevens and Sean
  Christopherson, also cleaned up substantially the set of functions
  that provided arch code with the pfn for a host virtual address.

  The previous maze of twisty little passages, all different, is
  replaced by five functions (__gfn_to_page, __kvm_faultin_pfn, the
  non-__ versions of these two, and kvm_prefetch_pages) saving almost
  200 lines of code.
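
  A condensed sketch of the resulting pattern (the wrapper
  faultin_sketch() below is purely illustrative; the real call sites are
  in the per-arch hunks further down, e.g. arch/arm64/kvm/mmu.c) looks
  roughly like this:

      /*
       * Illustrative only: slot lookup, mmu_lock handling and the actual
       * installation of the mapping are elided.
       */
      static int faultin_sketch(struct kvm *kvm, struct kvm_memory_slot *slot,
                                gfn_t gfn, bool write_fault)
      {
              struct page *page;
              bool writable;
              kvm_pfn_t pfn;
              int ret = 0;

              pfn = __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0,
                                      &writable, &page);
              if (is_error_noslot_pfn(pfn))
                      return -EFAULT;

              /* ... map @pfn into the secondary / stage-2 page tables ... */

              /* one release helper replaces the old pfn clean/dirty variants */
              kvm_release_faultin_page(kvm, page, !!ret, writable);
              return ret;
      }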

  ARM:

   - Support for stage-1 permission indirection (FEAT_S1PIE) and
     permission overlays (FEAT_S1POE), including nested virt + the
     emulated page table walker

   - Introduce PSCI SYSTEM_OFF2 support to KVM + client driver. This
     call was introduced in PSCIv1.3 as a mechanism to request
     hibernation, similar to the S4 state in ACPI

   - Explicitly trap + hide FEAT_MPAM (QoS controls) from KVM guests. As
     part of it, introduce trivial initialization of the host's MPAM
     context so KVM can use the corresponding traps

   - PMU support under nested virtualization, honoring the guest
     hypervisor's trap configuration and event filtering when running a
     nested guest

   - Fixes to vgic ITS serialization where stale device/interrupt table
     entries are not zeroed when the mapping is invalidated by the VM

   - Avoid emulated MMIO completion if userspace has requested
     synchronous external abort injection

   - Various fixes and cleanups affecting pKVM, vCPU initialization, and
     selftests

  LoongArch:

   - Add iocsr and mmio bus simulation in kernel.

   - Add in-kernel interrupt controller emulation.

   - Add support for virtualization extensions to the eiointc irqchip.

  PPC:

   - Drop lingering and utterly obsolete references to PPC970 KVM, which
     was removed 10 years ago.

   - Fix incorrect documentation references to non-existing ioctls

  RISC-V:

   - Accelerate KVM RISC-V when running as a guest

   - Perf support to collect KVM guest statistics from host side

  s390:

   - New selftests: more ucontrol selftests and CPU model sanity checks

   - Support for the gen17 CPU model

   - List registers supported by KVM_GET/SET_ONE_REG in the
     documentation

  x86:

   - Cleanup KVM's handling of Accessed and Dirty bits to dedup code,
     improve documentation, harden against unexpected changes.

     Even if the hardware A/D tracking is disabled, it is possible to
     use the hardware-defined A/D bits to track if a PFN is Accessed
     and/or Dirty, and that removes a lot of special cases.

   - Elide TLB flushes when aging secondary PTEs, as has been done in
     x86's primary MMU for over 10 years.

   - Recover huge pages in-place in the TDP MMU when dirty page logging
     is toggled off, instead of zapping them and waiting until the page
     is re-accessed to create a huge mapping. This reduces vCPU jitter.

   - Batch TLB flushes when dirty page logging is toggled off. This
     reduces the time it takes to disable dirty logging by ~3x.

   - Remove the shrinker that was (poorly) attempting to reclaim shadow
     page tables in low-memory situations.

   - Clean up and optimize KVM's handling of writes to
     MSR_IA32_APICBASE.

   - Advertise CPUIDs for new instructions in Clearwater Forest

   - Quirk KVM's misguided behavior of initializing certain feature MSRs
     to their maximum supported feature set, which can result in KVM
     creating invalid vCPU state. E.g. initializing PERF_CAPABILITIES to
     a non-zero value results in the vCPU having invalid state if
     userspace hides PDCM from the guest, which in turn can lead to
     save/restore failures.

   - Fix KVM's handling of non-canonical checks for vCPUs that support
     LA57 to better follow the "architecture", in quotes because the
     actual behavior is poorly documented. E.g. most MSR writes and
     descriptor table loads ignore CR4.LA57 and operate purely on
     whether the CPU supports LA57.

   - Bypass the register cache when querying CPL from kvm_sched_out(),
     as filling the cache from IRQ context is generally unsafe; harden
     the cache accessors to try to prevent similar issues from occurring
     in the future. The issue that triggered this change was already
     fixed in 6.12, but was still kinda latent.

   - Advertise AMD_IBPB_RET to userspace, and fix a related bug where
     KVM over-advertises SPEC_CTRL when trying to support cross-vendor
     VMs.

   - Minor cleanups

   - Switch hugepage recovery thread to use vhost_task.

     These kthreads can consume significant amounts of CPU time on
     behalf of a VM or in response to how the VM behaves (for example
     how it accesses its memory); therefore KVM tried to place the
     thread in the VM's cgroups and charge the CPU time consumed by that
     work to the VM's container.

     However the kthreads did not process SIGSTOP/SIGCONT, and therefore
     cgroups which had KVM instances inside could not complete freezing.

     Fix this by replacing the kthread with a PF_USER_WORKER thread, via
     the vhost_task abstraction. Another 100+ lines removed, with
     generally better behavior too like having these threads properly
     parented in the process tree.

   - Revert a workaround for an old CPU erratum (Nehalem/Westmere) that
     didn't really work; there was really nothing to work around anyway:
     the broken patch was meant to fix nested virtualization, but the
     PERF_GLOBAL_CTRL MSR is virtualized and therefore unaffected by the
     erratum.

   - Fix 6.12 regression where CONFIG_KVM will be built as a module even
     if asked to be builtin, as long as neither KVM_INTEL nor KVM_AMD is
     'y'.

  x86 selftests:

   - x86 selftests can now use AVX.

  Documentation:

   - Use rST internal links

   - Reorganize the introduction to the API document

  Generic:

   - Protect vcpu->pid accesses outside of vcpu->mutex with a rwlock
     instead of RCU, so that running a vCPU on a different task doesn't
     encounter long stalls due to having to wait for all CPUs to become
     quiescent.

     In general both reads and writes are rare, but userspace that
     supports confidential computing is introducing the use of "helper"
     vCPUs that may jump from one host processor to another. Those will
     be very happy to trigger a synchronize_rcu(), and the effect on
     performance is quite the disaster"
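
The rwlock conversion in that last item amounts to the usual pattern sketched
below; all names here (vcpu_sketch, sketch_get_pid, sketch_set_pid) are
illustrative placeholders rather than KVM's actual fields or helpers, and only
the locking and pid refcounting primitives are real kernel APIs:

    /*
     * Generic sketch of "rwlock instead of RCU" for a rarely written
     * pointer; names are illustrative, not KVM's actual fields.
     */
    struct vcpu_sketch {
            rwlock_t        pid_lock;       /* rwlock_init() at vCPU creation */
            struct pid      *pid;
    };

    static struct pid *sketch_get_pid(struct vcpu_sketch *v)
    {
            struct pid *pid;

            /* readers only hold the lock for the duration of the copy */
            read_lock(&v->pid_lock);
            pid = get_pid(v->pid);
            read_unlock(&v->pid_lock);
            return pid;
    }

    static void sketch_set_pid(struct vcpu_sketch *v, struct pid *newpid)
    {
            struct pid *old;

            /* writers swap the pointer without waiting for an RCU grace period */
            write_lock(&v->pid_lock);
            old = v->pid;
            v->pid = newpid;
            write_unlock(&v->pid_lock);
            put_pid(old);
    }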

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (298 commits)
  KVM: x86: Break CONFIG_KVM_X86's direct dependency on KVM_INTEL || KVM_AMD
  KVM: x86: add back X86_LOCAL_APIC dependency
  Revert "KVM: VMX: Move LOAD_IA32_PERF_GLOBAL_CTRL errata handling out of setup_vmcs_config()"
  KVM: x86: switch hugepage recovery thread to vhost_task
  KVM: x86: expose MSR_PLATFORM_INFO as a feature MSR
  x86: KVM: Advertise CPUIDs for new instructions in Clearwater Forest
  Documentation: KVM: fix malformed table
  irqchip/loongson-eiointc: Add virt extension support
  LoongArch: KVM: Add irqfd support
  LoongArch: KVM: Add PCHPIC user mode read and write functions
  LoongArch: KVM: Add PCHPIC read and write functions
  LoongArch: KVM: Add PCHPIC device support
  LoongArch: KVM: Add EIOINTC user mode read and write functions
  LoongArch: KVM: Add EIOINTC read and write functions
  LoongArch: KVM: Add EIOINTC device support
  LoongArch: KVM: Add IPI user mode read and write function
  LoongArch: KVM: Add IPI read and write function
  LoongArch: KVM: Add IPI device support
  LoongArch: KVM: Add iocsr and mmio bus simulation in kernel
  KVM: arm64: Pass on SVE mapping failures
  ...

19 files changed:
arch/arm64/include/asm/cpucaps.h
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/el2_setup.h
arch/arm64/include/asm/sysreg.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/cpuinfo.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/mmu.c
arch/arm64/tools/cpucaps
arch/arm64/tools/sysreg
arch/powerpc/kvm/book3s_hv_nested.c
arch/powerpc/kvm/powerpc.c
arch/riscv/include/asm/perf_event.h
arch/riscv/kernel/perf_callchain.c
arch/s390/include/asm/kvm_host.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/vsie.c
arch/x86/include/asm/cpufeatures.h
arch/x86/kvm/svm/sev.c

index a08a1212ffbb80bd851d3cb4f2f90f52e0e57002,f20e6e4212a77dce5b22dce8736f4120b0c54dca..201a46efd9188e51daa833578ea241faf866d720
@@@ -42,8 -42,6 +42,8 @@@ cpucap_is_possible(const unsigned int c
                return IS_ENABLED(CONFIG_ARM64_BTI);
        case ARM64_HAS_TLB_RANGE:
                return IS_ENABLED(CONFIG_ARM64_TLB_RANGE);
 +      case ARM64_HAS_S1POE:
 +              return IS_ENABLED(CONFIG_ARM64_POE);
        case ARM64_UNMAP_KERNEL_AT_EL0:
                return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
        case ARM64_WORKAROUND_843419:
                return IS_ENABLED(CONFIG_ARM64_WORKAROUND_REPEAT_TLBI);
        case ARM64_WORKAROUND_SPECULATIVE_SSBS:
                return IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386);
+       case ARM64_MPAM:
+               /*
+                * KVM MPAM support doesn't rely on the host kernel supporting MPAM.
+               */
+               return true;
        }
  
        return true;
index 3d63c20ccefcd4391b37723b5b17c41f1d2bdcbe,93fe8e6beb64911eca4344d2980f030fd927bb3f..b64e49bd9d106109e3319e596aec5917b65b0783
@@@ -12,7 -12,7 +12,7 @@@
  #include <asm/hwcap.h>
  #include <asm/sysreg.h>
  
 -#define MAX_CPU_FEATURES      128
 +#define MAX_CPU_FEATURES      192
  #define cpu_feature(x)                KERNEL_HWCAP_ ## x
  
  #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR     0
@@@ -438,7 -438,6 +438,7 @@@ void cpu_set_feature(unsigned int num)
  bool cpu_have_feature(unsigned int num);
  unsigned long cpu_get_elf_hwcap(void);
  unsigned long cpu_get_elf_hwcap2(void);
 +unsigned long cpu_get_elf_hwcap3(void);
  
  #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name))
  #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
@@@ -613,6 -612,13 +613,13 @@@ static inline bool id_aa64pfr1_sme(u64 
        return val > 0;
  }
  
+ static inline bool id_aa64pfr0_mpam(u64 pfr0)
+ {
+       u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT);
+       return val > 0;
+ }
  static inline bool id_aa64pfr1_mte(u64 pfr1)
  {
        u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT);
@@@ -835,21 -841,20 +842,31 @@@ static inline bool system_supports_lpa2
  
  static inline bool system_supports_poe(void)
  {
 -      return IS_ENABLED(CONFIG_ARM64_POE) &&
 -              alternative_has_cap_unlikely(ARM64_HAS_S1POE);
 +      return alternative_has_cap_unlikely(ARM64_HAS_S1POE);
 +}
 +
 +static inline bool system_supports_gcs(void)
 +{
 +      return IS_ENABLED(CONFIG_ARM64_GCS) &&
 +              alternative_has_cap_unlikely(ARM64_HAS_GCS);
 +}
 +
 +static inline bool system_supports_haft(void)
 +{
 +      return IS_ENABLED(CONFIG_ARM64_HAFT) &&
 +              cpus_have_final_cap(ARM64_HAFT);
  }
  
+ static __always_inline bool system_supports_mpam(void)
+ {
+       return alternative_has_cap_unlikely(ARM64_MPAM);
+ }
+ static __always_inline bool system_supports_mpam_hcr(void)
+ {
+       return alternative_has_cap_unlikely(ARM64_MPAM_HCR);
+ }
  int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
  bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
  
index 27086a81eae34483a682681ab1be1959a339527a,4cd41464be3f25e271c4ba808d5f4f77ef48ee82..85ef966c08cd231a5872d2cf988470391ad2973e
        ubfx    x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
        cbz     x0, .Lskip_hcrx_\@
        mov_q   x0, HCRX_HOST_FLAGS
 +
 +        /* Enable GCS if supported */
 +      mrs_s   x1, SYS_ID_AA64PFR1_EL1
 +      ubfx    x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
 +      cbz     x1, .Lset_hcrx_\@
 +      orr     x0, x0, #HCRX_EL2_GCSEn
 +
 +.Lset_hcrx_\@:
        msr_s   SYS_HCRX_EL2, x0
  .Lskip_hcrx_\@:
  .endm
        orr     x0, x0, #HFGxTR_EL2_nPOR_EL0
  
  .Lskip_poe_fgt_\@:
 +      /* GCS depends on PIE so we don't check it if PIE is absent */
 +      mrs_s   x1, SYS_ID_AA64PFR1_EL1
 +      ubfx    x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
 +      cbz     x1, .Lset_fgt_\@
 +
 +      /* Disable traps of access to GCS registers at EL0 and EL1 */
 +      orr     x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK
 +      orr     x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK
 +
 +.Lset_fgt_\@:
        msr_s   SYS_HFGRTR_EL2, x0
        msr_s   SYS_HFGWTR_EL2, x0
        msr_s   SYS_HFGITR_EL2, xzr
  .Lskip_fgt_\@:
  .endm
  
 +.macro __init_el2_gcs
 +      mrs_s   x1, SYS_ID_AA64PFR1_EL1
 +      ubfx    x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
 +      cbz     x1, .Lskip_gcs_\@
 +
 +      /* Ensure GCS is not enabled when we start trying to do BLs */
 +      msr_s   SYS_GCSCR_EL1, xzr
 +      msr_s   SYS_GCSCRE0_EL1, xzr
 +.Lskip_gcs_\@:
 +.endm
 +
  .macro __init_el2_nvhe_prepare_eret
        mov     x0, #INIT_PSTATE_EL1
        msr     spsr_el2, x0
  .endm
  
+ .macro __init_el2_mpam
+       /* Memory Partitioning And Monitoring: disable EL2 traps */
+       mrs     x1, id_aa64pfr0_el1
+       ubfx    x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4
+       cbz     x0, .Lskip_mpam_\@              // skip if no MPAM
+       msr_s   SYS_MPAM2_EL2, xzr              // use the default partition
+                                               // and disable lower traps
+       mrs_s   x0, SYS_MPAMIDR_EL1
+       tbz     x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@  // skip if no MPAMHCR reg
+       msr_s   SYS_MPAMHCR_EL2, xzr            // clear TRAP_MPAMIDR_EL1 -> EL2
+ .Lskip_mpam_\@:
+ .endm
  /**
   * Initialize EL2 registers to sane values. This should be called early on all
   * cores that were booted in EL2. Note that everything gets initialised as
        __init_el2_stage2
        __init_el2_gicv3
        __init_el2_hstr
+       __init_el2_mpam
        __init_el2_nvhe_idregs
        __init_el2_cptr
        __init_el2_fgt
 +        __init_el2_gcs
  .endm
  
  #ifndef __KVM_NVHE_HYPERVISOR__
index 9c98ff448bd9e824cdc94422cf56e46997076a55,345e81e0d2b3299ff26cb5b7031cbcdc5142b7cb..b8303a83c0bff5fbac56f467a11ec4fc735fa8db
  
  #define SYS_MAIR_EL2                  sys_reg(3, 4, 10, 2, 0)
  #define SYS_AMAIR_EL2                 sys_reg(3, 4, 10, 3, 0)
- #define SYS_MPAMHCR_EL2                       sys_reg(3, 4, 10, 4, 0)
- #define SYS_MPAMVPMV_EL2              sys_reg(3, 4, 10, 4, 1)
- #define SYS_MPAM2_EL2                 sys_reg(3, 4, 10, 5, 0)
- #define __SYS__MPAMVPMx_EL2(x)                sys_reg(3, 4, 10, 6, x)
- #define SYS_MPAMVPM0_EL2              __SYS__MPAMVPMx_EL2(0)
- #define SYS_MPAMVPM1_EL2              __SYS__MPAMVPMx_EL2(1)
- #define SYS_MPAMVPM2_EL2              __SYS__MPAMVPMx_EL2(2)
- #define SYS_MPAMVPM3_EL2              __SYS__MPAMVPMx_EL2(3)
- #define SYS_MPAMVPM4_EL2              __SYS__MPAMVPMx_EL2(4)
- #define SYS_MPAMVPM5_EL2              __SYS__MPAMVPMx_EL2(5)
- #define SYS_MPAMVPM6_EL2              __SYS__MPAMVPMx_EL2(6)
- #define SYS_MPAMVPM7_EL2              __SYS__MPAMVPMx_EL2(7)
  
  #define SYS_VBAR_EL2                  sys_reg(3, 4, 12, 0, 0)
  #define SYS_RVBAR_EL2                 sys_reg(3, 4, 12, 0, 1)
  /* Initial value for Permission Overlay Extension for EL0 */
  #define POR_EL0_INIT  POE_RXW
  
 +/*
 + * Definitions for Guarded Control Stack
 + */
 +
 +#define GCS_CAP_ADDR_MASK             GENMASK(63, 12)
 +#define GCS_CAP_ADDR_SHIFT            12
 +#define GCS_CAP_ADDR_WIDTH            52
 +#define GCS_CAP_ADDR(x)                       FIELD_GET(GCS_CAP_ADDR_MASK, x)
 +
 +#define GCS_CAP_TOKEN_MASK            GENMASK(11, 0)
 +#define GCS_CAP_TOKEN_SHIFT           0
 +#define GCS_CAP_TOKEN_WIDTH           12
 +#define GCS_CAP_TOKEN(x)              FIELD_GET(GCS_CAP_TOKEN_MASK, x)
 +
 +#define GCS_CAP_VALID_TOKEN           0x1
 +#define GCS_CAP_IN_PROGRESS_TOKEN     0x5
 +
 +#define GCS_CAP(x)    ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
 +                                             GCS_CAP_VALID_TOKEN)
 +
  #define ARM64_FEATURE_FIELD_BITS      4
  
  /* Defined for compatibility only, do not add new users. */
index 351aa825ec40bdb5e040258db6879646bdf5032b,33dce6b9c49bbb70622845d7dec07b694b61bb9d..6ce71f444ed84f9056196bb21bbfac61c9687e30
@@@ -103,7 -103,6 +103,7 @@@ static DECLARE_BITMAP(elf_hwcap, MAX_CP
                                 COMPAT_HWCAP_LPAE)
  unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
  unsigned int compat_elf_hwcap2 __read_mostly;
 +unsigned int compat_elf_hwcap3 __read_mostly;
  #endif
  
  DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS);
@@@ -229,7 -228,6 +229,7 @@@ static const struct arm64_ftr_bits ftr_
  };
  
  static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 +      ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0),
@@@ -293,8 -291,6 +293,8 @@@ static const struct arm64_ftr_bits ftr_
  };
  
  static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
 +      ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
 +                     FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0),
@@@ -688,6 -684,14 +688,14 @@@ static const struct arm64_ftr_bits ftr_
        ARM64_FTR_END,
  };
  
+ static const struct arm64_ftr_bits ftr_mpamidr[] = {
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0),
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0),
+       ARM64_FTR_END,
+ };
  /*
   * Common ftr bits for a 32bit register with all hidden, strict
   * attributes, with 4bit feature fields and a default safe value of
@@@ -808,6 -812,9 +816,9 @@@ static const struct __ftr_reg_entry 
        ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3),
        ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4),
  
+       /* Op1 = 0, CRn = 10, CRm = 4 */
+       ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr),
        /* Op1 = 1, CRn = 0, CRm = 0 */
        ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid),
  
@@@ -1167,6 -1174,9 +1178,9 @@@ void __init init_cpu_features(struct cp
                cpacr_restore(cpacr);
        }
  
+       if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0))
+               init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr);
        if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
                init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid);
  }
@@@ -1423,6 -1433,11 +1437,11 @@@ void update_cpu_features(int cpu
                cpacr_restore(cpacr);
        }
  
+       if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) {
+               taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu,
+                                       info->reg_mpamidr, boot->reg_mpamidr);
+       }
        /*
         * The kernel uses the LDGM/STGM instructions and the number of tags
         * they read/write depends on the GMID_EL1.BS field. Check that the
@@@ -2362,14 -2377,6 +2381,14 @@@ static void cpu_enable_poe(const struc
  }
  #endif
  
 +#ifdef CONFIG_ARM64_GCS
 +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused)
 +{
 +      /* GCSPR_EL0 is always readable */
 +      write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
 +}
 +#endif
 +
  /* Internal helper functions to match cpu capability type */
  static bool
  cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@@ -2389,6 -2396,36 +2408,36 @@@ cpucap_panic_on_conflict(const struct a
        return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT);
  }
  
+ static bool
+ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope)
+ {
+       if (!has_cpuid_feature(entry, scope))
+               return false;
+       /* Check firmware actually enabled MPAM on this cpu. */
+       return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN);
+ }
+ static void
+ cpu_enable_mpam(const struct arm64_cpu_capabilities *entry)
+ {
+       /*
+        * Access by the kernel (at EL1) should use the reserved PARTID
+        * which is configured unrestricted. This avoids priority-inversion
+        * where latency sensitive tasks have to wait for a task that has
+        * been throttled to release the lock.
+        */
+       write_sysreg_s(0, SYS_MPAM1_EL1);
+ }
+ static bool
+ test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope)
+ {
+       u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1);
+       return idr & MPAMIDR_EL1_HAS_HCR;
+ }
  static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .capability = ARM64_ALWAYS_BOOT,
                .cpus = &dbm_cpus,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM)
        },
 +#endif
 +#ifdef CONFIG_ARM64_HAFT
 +      {
 +              .desc = "Hardware managed Access Flag for Table Descriptors",
 +              /*
 +               * Contrary to the page/block access flag, the table access flag
 +               * cannot be emulated in software (no access fault will occur).
 +               * Therefore this should be used only if it's supported system
 +               * wide.
 +               */
 +              .type = ARM64_CPUCAP_SYSTEM_FEATURE,
 +              .capability = ARM64_HAFT,
 +              .matches = has_cpuid_feature,
 +              ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT)
 +      },
  #endif
        {
                .desc = "CRC32 instructions",
  #endif
        },
  #endif
+       {
+               .desc = "Memory Partitioning And Monitoring",
+               .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+               .capability = ARM64_MPAM,
+               .matches = test_has_mpam,
+               .cpu_enable = cpu_enable_mpam,
+               ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1)
+       },
+       {
+               .desc = "Memory Partitioning And Monitoring Virtualisation",
+               .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+               .capability = ARM64_MPAM_HCR,
+               .matches = test_has_mpam_hcr,
+       },
        {
                .desc = "NV1",
                .capability = ARM64_HAS_HCR_NV1,
                .cpu_enable = cpu_enable_poe,
                ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP)
        },
 +#endif
 +#ifdef CONFIG_ARM64_GCS
 +      {
 +              .desc = "Guarded Control Stack (GCS)",
 +              .capability = ARM64_HAS_GCS,
 +              .type = ARM64_CPUCAP_SYSTEM_FEATURE,
 +              .cpu_enable = cpu_enable_gcs,
 +              .matches = has_cpuid_feature,
 +              ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP)
 +      },
  #endif
        {},
  };
@@@ -3042,9 -3068,6 +3105,9 @@@ static const struct arm64_cpu_capabilit
        HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM),
        HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM),
        HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM),
 +#endif
 +#ifdef CONFIG_ARM64_GCS
 +      HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS),
  #endif
        HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS),
  #ifdef CONFIG_ARM64_BTI
@@@ -3436,6 -3459,36 +3499,36 @@@ static void verify_hyp_capabilities(voi
        }
  }
  
+ static void verify_mpam_capabilities(void)
+ {
+       u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1);
+       u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+       u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max;
+       if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) !=
+           FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) {
+               pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id());
+               cpu_die_early();
+       }
+       cpu_idr = read_cpuid(MPAMIDR_EL1);
+       sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1);
+       if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) !=
+           FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) {
+               pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id());
+               cpu_die_early();
+       }
+       cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr);
+       cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr);
+       sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr);
+       sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr);
+       if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) {
+               pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id());
+               cpu_die_early();
+       }
+ }
  /*
   * Run through the enabled system capabilities and enable() it on this CPU.
   * The capabilities were decided based on the available CPUs at the boot time.
@@@ -3462,6 -3515,9 +3555,9 @@@ static void verify_local_cpu_capabiliti
  
        if (is_hyp_mode_available())
                verify_hyp_capabilities();
+       if (system_supports_mpam())
+               verify_mpam_capabilities();
  }
  
  void check_local_cpu_capabilities(void)
@@@ -3539,11 -3595,6 +3635,11 @@@ unsigned long cpu_get_elf_hwcap2(void
        return elf_hwcap[1];
  }
  
 +unsigned long cpu_get_elf_hwcap3(void)
 +{
 +      return elf_hwcap[2];
 +}
 +
  static void __init setup_boot_cpu_capabilities(void)
  {
        /*
index f2f92c6b1c850fa28145e6828d7fd42936482c1a,46ba30d42b9bd09345fcde580125b70df8f938d2..d79e88fccdfce427507e7a34c5959ce6309cbd12
@@@ -80,7 -80,6 +80,7 @@@ static const char *const hwcap_str[] = 
        [KERNEL_HWCAP_SB]               = "sb",
        [KERNEL_HWCAP_PACA]             = "paca",
        [KERNEL_HWCAP_PACG]             = "pacg",
 +      [KERNEL_HWCAP_GCS]              = "gcs",
        [KERNEL_HWCAP_DCPODP]           = "dcpodp",
        [KERNEL_HWCAP_SVE2]             = "sve2",
        [KERNEL_HWCAP_SVEAES]           = "sveaes",
@@@ -479,6 -478,9 +479,9 @@@ static void __cpuinfo_store_cpu(struct 
        if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
                __cpuinfo_store_cpu_32bit(&info->aarch32);
  
+       if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0))
+               info->reg_mpamidr = read_cpuid(MPAMIDR_EL1);
        cpuinfo_detect_icache_policy(info);
  }
  
diff --combined arch/arm64/kvm/guest.c
index e738a353b20e43c192ac2b15688e720851077ba8,4cd7ffa7679460cb2424324558e1d58105277f36..12dad841f2a51276eee4d4da7400c1b2a5732ff8
@@@ -1051,60 -1051,47 +1051,58 @@@ int kvm_vm_ioctl_mte_copy_tags(struct k
        }
  
        while (length > 0) {
-               kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+               struct page *page = __gfn_to_page(kvm, gfn, write);
                void *maddr;
                unsigned long num_tags;
-               struct page *page;
 +              struct folio *folio;
  
-               if (is_error_noslot_pfn(pfn)) {
+               if (!page) {
                        ret = -EFAULT;
                        goto out;
                }
  
-               page = pfn_to_online_page(pfn);
-               if (!page) {
+               if (!pfn_to_online_page(page_to_pfn(page))) {
                        /* Reject ZONE_DEVICE memory */
-                       kvm_release_pfn_clean(pfn);
+                       kvm_release_page_unused(page);
                        ret = -EFAULT;
                        goto out;
                }
 +              folio = page_folio(page);
                maddr = page_address(page);
  
                if (!write) {
 -                      if (page_mte_tagged(page))
 +                      if ((folio_test_hugetlb(folio) &&
 +                           folio_test_hugetlb_mte_tagged(folio)) ||
 +                           page_mte_tagged(page))
                                num_tags = mte_copy_tags_to_user(tags, maddr,
                                                        MTE_GRANULES_PER_PAGE);
                        else
                                /* No tags in memory, so write zeros */
                                num_tags = MTE_GRANULES_PER_PAGE -
                                        clear_user(tags, MTE_GRANULES_PER_PAGE);
-                       kvm_release_pfn_clean(pfn);
+                       kvm_release_page_clean(page);
                } else {
                        /*
                         * Only locking to serialise with a concurrent
                         * __set_ptes() in the VMM but still overriding the
                         * tags, hence ignoring the return value.
                         */
 -                      try_page_mte_tagging(page);
 +                      if (folio_test_hugetlb(folio))
 +                              folio_try_hugetlb_mte_tagging(folio);
 +                      else
 +                              try_page_mte_tagging(page);
                        num_tags = mte_copy_tags_from_user(maddr, tags,
                                                        MTE_GRANULES_PER_PAGE);
  
                        /* uaccess failed, don't leave stale tags */
                        if (num_tags != MTE_GRANULES_PER_PAGE)
                                mte_clear_page_tags(maddr);
 -                      set_page_mte_tagged(page);
 +                      if (folio_test_hugetlb(folio))
 +                              folio_set_hugetlb_mte_tagged(folio);
 +                      else
 +                              set_page_mte_tagged(page);
 +
-                       kvm_release_pfn_dirty(pfn);
+                       kvm_release_page_dirty(page);
                }
  
                if (num_tags != MTE_GRANULES_PER_PAGE) {
diff --combined arch/arm64/kvm/mmu.c
index 56d9a7f414fe15a7e8eab87550ee98058996c52e,a71fe6f6bd90f258378ca40a527e7bee736f591c..c9d46ad57e52d3e06b58089dfe4a7838165ddf2d
@@@ -1402,21 -1402,10 +1402,21 @@@ static void sanitise_mte_tags(struct kv
  {
        unsigned long i, nr_pages = size >> PAGE_SHIFT;
        struct page *page = pfn_to_page(pfn);
 +      struct folio *folio = page_folio(page);
  
        if (!kvm_has_mte(kvm))
                return;
  
 +      if (folio_test_hugetlb(folio)) {
 +              /* Hugetlb has MTE flags set on head page only */
 +              if (folio_try_hugetlb_mte_tagging(folio)) {
 +                      for (i = 0; i < nr_pages; i++, page++)
 +                              mte_clear_page_tags(page_address(page));
 +                      folio_set_hugetlb_mte_tagged(folio);
 +              }
 +              return;
 +      }
 +
        for (i = 0; i < nr_pages; i++, page++) {
                if (try_page_mte_tagging(page)) {
                        mte_clear_page_tags(page_address(page));
@@@ -1451,6 -1440,7 +1451,7 @@@ static int user_mem_abort(struct kvm_vc
        long vma_pagesize, fault_granule;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
        struct kvm_pgtable *pgt;
+       struct page *page;
  
        if (fault_is_perm)
                fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
  
        /*
         * Read mmu_invalidate_seq so that KVM can detect if the results of
-        * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+        * vma_lookup() or __kvm_faultin_pfn() become stale prior to
         * acquiring kvm->mmu_lock.
         *
         * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
        mmu_seq = vcpu->kvm->mmu_invalidate_seq;
        mmap_read_unlock(current->mm);
  
-       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
-                                  write_fault, &writable, NULL);
+       pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+                               &writable, &page);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
                 * If the page was identified as device early by looking at
                 * the VMA flags, vma_pagesize is already representing the
                 * largest quantity we can map.  If instead it was mapped
-                * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+                * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
                 * and must not be upgraded.
                 *
                 * In both cases, we don't let transparent_hugepage_adjust()
        }
  
  out_unlock:
+       kvm_release_faultin_page(kvm, page, !!ret, writable);
        read_unlock(&kvm->mmu_lock);
  
        /* Mark the page dirty only if the fault is handled successfully */
-       if (writable && !ret) {
-               kvm_set_pfn_dirty(pfn);
+       if (writable && !ret)
                mark_page_dirty_in_slot(kvm, memslot, gfn);
-       }
  
-       kvm_release_pfn_clean(pfn);
        return ret != -EAGAIN ? ret : 0;
  }
  
  /* Resolve the access fault by making the page young again. */
  static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
  {
-       kvm_pte_t pte;
        struct kvm_s2_mmu *mmu;
  
        trace_kvm_access_fault(fault_ipa);
  
        read_lock(&vcpu->kvm->mmu_lock);
        mmu = vcpu->arch.hw_mmu;
-       pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
+       kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
        read_unlock(&vcpu->kvm->mmu_lock);
-       if (kvm_pte_valid(pte))
-               kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
  }
  
  /**
diff --combined arch/arm64/tools/cpucaps
index 8dfb2fa51d1274f558b1d107803f7cf50db22c5c,dfac646430ac8d22f7f7a1f1003bb0229fc6ea96..eb17f59e543c49b7f5f254f9aa73dd4e5c94de6e
@@@ -29,7 -29,6 +29,7 @@@ HAS_EV
  HAS_FPMR
  HAS_FGT
  HAS_FPSIMD
 +HAS_GCS
  HAS_GENERIC_AUTH
  HAS_GENERIC_AUTH_ARCH_QARMA3
  HAS_GENERIC_AUTH_ARCH_QARMA5
@@@ -57,11 -56,12 +57,13 @@@ HAS_TLB_RANG
  HAS_VA52
  HAS_VIRT_HOST_EXTN
  HAS_WFXT
 +HAFT
  HW_DBM
  KVM_HVHE
  KVM_PROTECTED_MODE
  MISMATCHED_CACHE_TYPE
+ MPAM
+ MPAM_HCR
  MTE
  MTE_ASYMM
  SME
diff --combined arch/arm64/tools/sysreg
index 283279af932ce60aed4d4c623a4fbff619014a9b,ed3bf6a0f5c18701dfe94f863ed19a67232f909b..b081b54d6d227ed8300a6f129896647316f0b673
@@@ -1200,7 -1200,7 +1200,7 @@@ UnsignedEnum    55:52   BRB
        0b0001  IMP
        0b0010  BRBE_V1P1
  EndEnum
Enum  51:48   MTPMU
SignedEnum    51:48   MTPMU
        0b0000  NI_IMPDEF
        0b0001  IMP
        0b1111  NI
@@@ -1208,6 -1208,7 +1208,7 @@@ EndEnu
  UnsignedEnum  47:44   TraceBuffer
        0b0000  NI
        0b0001  IMP
+       0b0010  TRBE_V1P1
  EndEnum
  UnsignedEnum  43:40   TraceFilt
        0b0000  NI
@@@ -1224,11 -1225,18 +1225,18 @@@ UnsignedEnum 35:32   PMSVe
        0b0011  V1P2
        0b0100  V1P3
        0b0101  V1P4
+       0b0110  V1P5
  EndEnum
  Field 31:28   CTX_CMPs
- Res0  27:24
+ UnsignedEnum  27:24   SEBEP
+       0b0000  NI
+       0b0001  IMP
+ EndEnum
  Field 23:20   WRPs
- Res0  19:16
+ UnsignedEnum  19:16   PMSS
+       0b0000  NI
+       0b0001  IMP
+ EndEnum
  Field 15:12   BRPs
  UnsignedEnum  11:8    PMUVer
        0b0000  NI
@@@ -1288,6 -1296,32 +1296,32 @@@ Field 15:8    BRP
  Field 7:0     SYSPMUID
  EndSysreg
  
+ Sysreg        ID_AA64DFR2_EL1 3       0       0       5       2
+ Res0  63:28
+ UnsignedEnum  27:24   TRBE_EXC
+       0b0000  NI
+       0b0001  IMP
+ EndEnum
+ UnsignedEnum  23:20   SPE_nVM
+       0b0000  NI
+       0b0001  IMP
+ EndEnum
+ UnsignedEnum  19:16   SPE_EXC
+       0b0000  NI
+       0b0001  IMP
+ EndEnum
+ Res0  15:8
+ UnsignedEnum  7:4     BWE
+       0b0000  NI
+       0b0001  FEAT_BWE
+       0b0010  FEAT_BWE2
+ EndEnum
+ UnsignedEnum  3:0     STEP
+       0b0000  NI
+       0b0001  IMP
+ EndEnum
+ EndSysreg
  Sysreg        ID_AA64AFR0_EL1 3       0       0       5       4
  Res0  63:32
  Field 31:28   IMPDEF7
@@@ -1649,8 -1683,6 +1683,8 @@@ EndEnu
  UnsignedEnum  39:36   ETS
        0b0000  NI
        0b0001  IMP
 +      0b0010  ETS2
 +      0b0011  ETS3
  EndEnum
  UnsignedEnum  35:32   TWED
        0b0000  NI
@@@ -1692,7 -1724,6 +1726,7 @@@ UnsignedEnum    3:0     HAFDB
        0b0001  AF
        0b0010  DBM
        0b0011  HAFT
 +      0b0100  HDBSS
  EndEnum
  EndSysreg
  
@@@ -2183,13 -2214,6 +2217,13 @@@ Field 4       
  Field 3:0     ALIGN
  EndSysreg
  
 +Sysreg        PMUACR_EL1      3       0       9       14      4
 +Res0  63:33
 +Field 32      F0
 +Field 31      C
 +Field 30:0    P
 +EndSysreg
 +
  Sysreg        PMSELR_EL0      3       3       9       12      5
  Res0  63:5
  Field 4:0     SEL
@@@ -2400,6 -2424,41 +2434,41 @@@ Field 1       AFSR1_EL
  Field 0       AFSR0_EL1
  EndSysregFields
  
+ Sysreg MDCR_EL2               3       4       1       1       1
+ Res0  63:51
+ Field 50      EnSTEPOP
+ Res0  49:44
+ Field 43      EBWE
+ Res0  42
+ Field 41:40   PMEE
+ Res0  39:37
+ Field 36      HPMFZS
+ Res0  35:32
+ Field 31:30   PMSSE
+ Field 29      HPMFZO
+ Field 28      MTPME
+ Field 27      TDCC
+ Field 26      HLP
+ Field 25:24   E2TB
+ Field 23      HCCD
+ Res0  22:20
+ Field 19      TTRF
+ Res0  18
+ Field 17      HPMD
+ Res0  16
+ Field 15      EnSPM
+ Field 14      TPMS
+ Field 13:12   E2PB
+ Field 11      TDRA
+ Field 10      TDOSA
+ Field 9       TDA
+ Field 8       TDE
+ Field 7       HPME
+ Field 6       TPM
+ Field 5       TPMCR
+ Field 4:0     HPMN
+ EndSysreg
  Sysreg HFGRTR_EL2     3       4       1       1       4
  Fields        HFGxTR_EL2
  EndSysreg
@@@ -2749,6 -2808,126 +2818,126 @@@ Field       1       E2SP
  Field 0       E0HSPE
  EndSysreg
  
+ Sysreg        MPAMHCR_EL2     3       4       10      4       0
+ Res0  63:32
+ Field 31      TRAP_MPAMIDR_EL1
+ Res0  30:9
+ Field 8       GSTAPP_PLK
+ Res0  7:2
+ Field 1       EL1_VPMEN
+ Field 0       EL0_VPMEN
+ EndSysreg
+ Sysreg        MPAMVPMV_EL2    3       4       10      4       1
+ Res0  63:32
+ Field 31      VPM_V31
+ Field 30      VPM_V30
+ Field 29      VPM_V29
+ Field 28      VPM_V28
+ Field 27      VPM_V27
+ Field 26      VPM_V26
+ Field 25      VPM_V25
+ Field 24      VPM_V24
+ Field 23      VPM_V23
+ Field 22      VPM_V22
+ Field 21      VPM_V21
+ Field 20      VPM_V20
+ Field 19      VPM_V19
+ Field 18      VPM_V18
+ Field 17      VPM_V17
+ Field 16      VPM_V16
+ Field 15      VPM_V15
+ Field 14      VPM_V14
+ Field 13      VPM_V13
+ Field 12      VPM_V12
+ Field 11      VPM_V11
+ Field 10      VPM_V10
+ Field 9       VPM_V9
+ Field 8       VPM_V8
+ Field 7       VPM_V7
+ Field 6       VPM_V6
+ Field 5       VPM_V5
+ Field 4       VPM_V4
+ Field 3       VPM_V3
+ Field 2       VPM_V2
+ Field 1       VPM_V1
+ Field 0       VPM_V0
+ EndSysreg
+ Sysreg        MPAM2_EL2       3       4       10      5       0
+ Field 63      MPAMEN
+ Res0  62:59
+ Field 58      TIDR
+ Res0  57
+ Field 56      ALTSP_HFC
+ Field 55      ALTSP_EL2
+ Field 54      ALTSP_FRCD
+ Res0  53:51
+ Field 50      EnMPAMSM
+ Field 49      TRAPMPAM0EL1
+ Field 48      TRAPMPAM1EL1
+ Field 47:40   PMG_D
+ Field 39:32   PMG_I
+ Field 31:16   PARTID_D
+ Field 15:0    PARTID_I
+ EndSysreg
+ Sysreg        MPAMVPM0_EL2    3       4       10      6       0
+ Field 63:48   PhyPARTID3
+ Field 47:32   PhyPARTID2
+ Field 31:16   PhyPARTID1
+ Field 15:0    PhyPARTID0
+ EndSysreg
+ Sysreg        MPAMVPM1_EL2    3       4       10      6       1
+ Field 63:48   PhyPARTID7
+ Field 47:32   PhyPARTID6
+ Field 31:16   PhyPARTID5
+ Field 15:0    PhyPARTID4
+ EndSysreg
+ Sysreg        MPAMVPM2_EL2    3       4       10      6       2
+ Field 63:48   PhyPARTID11
+ Field 47:32   PhyPARTID10
+ Field 31:16   PhyPARTID9
+ Field 15:0    PhyPARTID8
+ EndSysreg
+ Sysreg        MPAMVPM3_EL2    3       4       10      6       3
+ Field 63:48   PhyPARTID15
+ Field 47:32   PhyPARTID14
+ Field 31:16   PhyPARTID13
+ Field 15:0    PhyPARTID12
+ EndSysreg
+ Sysreg        MPAMVPM4_EL2    3       4       10      6       4
+ Field 63:48   PhyPARTID19
+ Field 47:32   PhyPARTID18
+ Field 31:16   PhyPARTID17
+ Field 15:0    PhyPARTID16
+ EndSysreg
+ Sysreg        MPAMVPM5_EL2    3       4       10      6       5
+ Field 63:48   PhyPARTID23
+ Field 47:32   PhyPARTID22
+ Field 31:16   PhyPARTID21
+ Field 15:0    PhyPARTID20
+ EndSysreg
+ Sysreg        MPAMVPM6_EL2    3       4       10      6       6
+ Field 63:48   PhyPARTID27
+ Field 47:32   PhyPARTID26
+ Field 31:16   PhyPARTID25
+ Field 15:0    PhyPARTID24
+ EndSysreg
+ Sysreg        MPAMVPM7_EL2    3       4       10      6       7
+ Field 63:48   PhyPARTID31
+ Field 47:32   PhyPARTID30
+ Field 31:16   PhyPARTID29
+ Field 15:0    PhyPARTID28
+ EndSysreg
  Sysreg        CONTEXTIDR_EL2  3       4       13      0       1
  Fields        CONTEXTIDR_ELx
  EndSysreg
@@@ -2781,6 -2960,10 +2970,10 @@@ Sysreg        FAR_EL12        3       5       6       0       
  Field 63:0    ADDR
  EndSysreg
  
+ Sysreg        MPAM1_EL12      3       5       10      5       0
+ Fields        MPAM1_ELx
+ EndSysreg
  Sysreg        CONTEXTIDR_EL12 3       5       13      0       1
  Fields        CONTEXTIDR_ELx
  EndSysreg
@@@ -2831,8 -3014,7 +3024,7 @@@ Field   13      AMEC
  Field 12      AMEC0
  Field 11      HAFT
  Field 10      PTTWI
- Field 9:8     SKL1
- Field 7:6     SKL0
+ Res0  9:6
  Field 5       D128
  Field 4       AIE
  Field 3       POE
@@@ -2895,6 -3077,10 +3087,10 @@@ Sysreg        PIRE0_EL12      3       5       10      2       
  Fields        PIRx_ELx
  EndSysreg
  
+ Sysreg        PIRE0_EL2       3       4       10      2       2
+ Fields        PIRx_ELx
+ EndSysreg
  Sysreg        PIR_EL1         3       0       10      2       3
  Fields        PIRx_ELx
  EndSysreg
@@@ -2915,6 -3101,10 +3111,10 @@@ Sysreg        POR_EL1         3       0       10      2       
  Fields        PIRx_ELx
  EndSysreg
  
+ Sysreg        POR_EL2         3       4       10      2       4
+ Fields        PIRx_ELx
+ EndSysreg
  Sysreg        POR_EL12        3       5       10      2       4
  Fields        PIRx_ELx
  EndSysreg
@@@ -2953,6 -3143,22 +3153,22 @@@ Res0  
  Field 0       EN
  EndSysreg
  
+ Sysreg        MPAMIDR_EL1     3       0       10      4       4
+ Res0  63:62
+ Field 61      HAS_SDEFLT
+ Field 60      HAS_FORCE_NS
+ Field 59      SP4
+ Field 58      HAS_TIDR
+ Field 57      HAS_ALTSP
+ Res0  56:40
+ Field 39:32   PMG_MAX
+ Res0  31:21
+ Field 20:18   VPMR_MAX
+ Field 17      HAS_HCR
+ Res0  16
+ Field 15:0    PARTID_MAX
+ EndSysreg
  Sysreg        LORID_EL1       3       0       10      4       7
  Res0  63:24
  Field 23:16   LD
@@@ -2960,6 -3166,27 +3176,27 @@@ Res0  15:
  Field 7:0     LR
  EndSysreg
  
+ Sysreg        MPAM1_EL1       3       0       10      5       0
+ Field 63      MPAMEN
+ Res0  62:61
+ Field 60 FORCED_NS
+ Res0  59:55
+ Field 54      ALTSP_FRCD
+ Res0  53:48
+ Field 47:40   PMG_D
+ Field 39:32   PMG_I
+ Field 31:16   PARTID_D
+ Field 15:0    PARTID_I
+ EndSysreg
+ Sysreg        MPAM0_EL1       3       0       10      5       1
+ Res0  63:48
+ Field 47:40   PMG_D
+ Field 39:32   PMG_I
+ Field 31:16   PARTID_D
+ Field 15:0    PARTID_I
+ EndSysreg
  Sysreg        ISR_EL1 3       0       12      1       0
  Res0  63:11
  Field 10      IS
index ef97f58d0d9738d00255335f418865806b6d5e8a,771173509617a7fb4be923a2350bdaee6c31284b..5f8c2321cfb52094d02c8c82cf59ede518e552dd
@@@ -32,7 -32,7 +32,7 @@@ void kvmhv_save_hv_regs(struct kvm_vcp
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
  
        hr->pcr = vc->pcr | PCR_MASK;
 -      hr->dpdes = vc->dpdes;
 +      hr->dpdes = vcpu->arch.doorbell_request;
        hr->hfscr = vcpu->arch.hfscr;
        hr->tb_offset = vc->tb_offset;
        hr->dawr0 = vcpu->arch.dawr0;
@@@ -105,7 -105,7 +105,7 @@@ static void save_hv_return_state(struc
  {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
  
 -      hr->dpdes = vc->dpdes;
 +      hr->dpdes = vcpu->arch.doorbell_request;
        hr->purr = vcpu->arch.purr;
        hr->spurr = vcpu->arch.spurr;
        hr->ic = vcpu->arch.ic;
@@@ -143,7 -143,7 +143,7 @@@ static void restore_hv_regs(struct kvm_
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
  
        vc->pcr = hr->pcr | PCR_MASK;
 -      vc->dpdes = hr->dpdes;
 +      vcpu->arch.doorbell_request = hr->dpdes;
        vcpu->arch.hfscr = hr->hfscr;
        vcpu->arch.dawr0 = hr->dawr0;
        vcpu->arch.dawrx0 = hr->dawrx0;
@@@ -170,13 -170,7 +170,13 @@@ void kvmhv_restore_hv_return_state(stru
  {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
  
 -      vc->dpdes = hr->dpdes;
 +      /*
 +       * This L2 vCPU might have received a doorbell while H_ENTER_NESTED was being handled.
 +       * Make sure we preserve the doorbell if it was either:
 +       *   a) Sent after H_ENTER_NESTED was called on this vCPU (arch.doorbell_request would be 1)
 +       *   b) Doorbell was not handled and L2 exited for some other reason (hr->dpdes would be 1)
 +       */
 +      vcpu->arch.doorbell_request = vcpu->arch.doorbell_request | hr->dpdes;
        vcpu->arch.hfscr = hr->hfscr;
        vcpu->arch.purr = hr->purr;
        vcpu->arch.spurr = hr->spurr;
@@@ -451,8 -445,6 +451,8 @@@ long kvmhv_nested_init(void
        if (rc == H_SUCCESS) {
                unsigned long capabilities = 0;
  
 +              if (cpu_has_feature(CPU_FTR_P11_PVR))
 +                      capabilities |= H_GUEST_CAP_POWER11;
                if (cpu_has_feature(CPU_FTR_ARCH_31))
                        capabilities |= H_GUEST_CAP_POWER10;
                if (cpu_has_feature(CPU_FTR_ARCH_300))
@@@ -1535,7 -1527,6 +1535,6 @@@ static long int __kvmhv_nested_page_fau
        unsigned long n_gpa, gpa, gfn, perm = 0UL;
        unsigned int shift, l1_shift, level;
        bool writing = !!(dsisr & DSISR_ISSTORE);
-       bool kvm_ro = false;
        long int ret;
  
        if (!gp->l1_gr_to_hr) {
                                        ea, DSISR_ISSTORE | DSISR_PROTFAULT);
                        return RESUME_GUEST;
                }
-               kvm_ro = true;
        }
  
        /* 2. Find the host pte for this L1 guest real address */
        if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
                /* No suitable pte found -> try to insert a mapping */
                ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
-                                       writing, kvm_ro, &pte, &level);
+                                       writing, &pte, &level);
                if (ret == -EAGAIN)
                        return RESUME_GUEST;
                else if (ret)
index b3b37ea778496ee9126f8de6263e2b504aa4506d,76446604332c99099445df59bbed815cda34f566..ce1d91eed231bcf9e5835b63adc57c7ed2646bf1
@@@ -612,9 -612,6 +612,6 @@@ int kvm_vm_ioctl_check_extension(struc
                                r = 8 | 4 | 2 | 1;
                }
                break;
-       case KVM_CAP_PPC_RMA:
-               r = 0;
-               break;
        case KVM_CAP_PPC_HWRNG:
                r = kvmppc_hwrng_present();
                break;
@@@ -1933,11 -1930,12 +1930,11 @@@ static int kvm_vcpu_ioctl_enable_cap(st
  #endif
  #ifdef CONFIG_KVM_MPIC
        case KVM_CAP_IRQ_MPIC: {
 -              struct fd f;
 +              CLASS(fd, f)(cap->args[0]);
                struct kvm_device *dev;
  
                r = -EBADF;
 -              f = fdget(cap->args[0]);
 -              if (!fd_file(f))
 +              if (fd_empty(f))
                        break;
  
                r = -EPERM;
                if (dev)
                        r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
  
 -              fdput(f);
                break;
        }
  #endif
  #ifdef CONFIG_KVM_XICS
        case KVM_CAP_IRQ_XICS: {
 -              struct fd f;
 +              CLASS(fd, f)(cap->args[0]);
                struct kvm_device *dev;
  
                r = -EBADF;
 -              f = fdget(cap->args[0]);
 -              if (!fd_file(f))
 +              if (fd_empty(f))
                        break;
  
                r = -EPERM;
                        else
                                r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
                }
 -
 -              fdput(f);
                break;
        }
  #endif /* CONFIG_KVM_XICS */
  #ifdef CONFIG_KVM_XIVE
        case KVM_CAP_PPC_IRQ_XIVE: {
 -              struct fd f;
 +              CLASS(fd, f)(cap->args[0]);
                struct kvm_device *dev;
  
                r = -EBADF;
 -              f = fdget(cap->args[0]);
 -              if (!fd_file(f))
 +              if (fd_empty(f))
                        break;
  
                r = -ENXIO;
 -              if (!xive_enabled()) {
 -                      fdput(f);
 +              if (!xive_enabled())
                        break;
 -              }
  
                r = -EPERM;
                dev = kvm_device_from_filp(fd_file(f));
                if (dev)
                        r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
                                                            cap->args[1]);
 -
 -              fdput(f);
                break;
        }
  #endif /* CONFIG_KVM_XIVE */
index 665bbc9b2f840a5272eb493d799f21dab73dfd25,38926b4a902d3c813d85108913870d77a8f2751a..bcc928fd3785c16b54a9a802525bad07c86f3690
@@@ -8,7 -8,11 +8,8 @@@
  #ifndef _ASM_RISCV_PERF_EVENT_H
  #define _ASM_RISCV_PERF_EVENT_H
  
+ #ifdef CONFIG_PERF_EVENTS
  #include <linux/perf_event.h>
 -extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 -extern unsigned long perf_misc_flags(struct pt_regs *regs);
 -#define perf_misc_flags(regs) perf_misc_flags(regs)
  #define perf_arch_bpf_user_pt_regs(regs) (struct user_regs_struct *)regs
  
  #define perf_arch_fetch_caller_regs(regs, __ip) { \
@@@ -17,4 -21,6 +18,6 @@@
        (regs)->sp = current_stack_pointer; \
        (regs)->status = SR_PP; \
  }
+ #endif
  #endif /* _ASM_RISCV_PERF_EVENT_H */
index c7468af77c663ab30d5728476e882d0882d1afa5,c2c81a80f81640fc58ad5f96dd1a569aa567a301..b465bc9eb870ec1ab8a638d07f3fd9edcd047bc3
@@@ -28,11 -28,49 +28,21 @@@ static bool fill_callchain(void *entry
  void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                         struct pt_regs *regs)
  {
+       if (perf_guest_state()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
        arch_stack_walk_user(fill_callchain, entry, regs);
  }
  
  void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
  {
+       if (perf_guest_state()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
        walk_stackframe(NULL, regs, fill_callchain, entry);
  }
 -
 -unsigned long perf_instruction_pointer(struct pt_regs *regs)
 -{
 -      if (perf_guest_state())
 -              return perf_guest_get_ip();
 -
 -      return instruction_pointer(regs);
 -}
 -
 -unsigned long perf_misc_flags(struct pt_regs *regs)
 -{
 -      unsigned int guest_state = perf_guest_state();
 -      unsigned long misc = 0;
 -
 -      if (guest_state) {
 -              if (guest_state & PERF_GUEST_USER)
 -                      misc |= PERF_RECORD_MISC_GUEST_USER;
 -              else
 -                      misc |= PERF_RECORD_MISC_GUEST_KERNEL;
 -      } else {
 -              if (user_mode(regs))
 -                      misc |= PERF_RECORD_MISC_USER;
 -              else
 -                      misc |= PERF_RECORD_MISC_KERNEL;
 -      }
 -
 -      return misc;
 -}
index 51201b4ac93a5d1903c6b60d64326c5530147ea6,851cfe5042f389574b00d07e6491fb5d86f3dc1d..1cd8eaebd3c0d781b09ec0e6172e3710ab302819
@@@ -356,6 -356,7 +356,7 @@@ struct kvm_s390_sie_block 
  #define ECD_MEF               0x08000000
  #define ECD_ETOKENF   0x02000000
  #define ECD_ECC               0x00200000
+ #define ECD_HMAC      0x00004000
        __u32   ecd;                    /* 0x01c8 */
        __u8    reserved1cc[18];        /* 0x01cc */
        __u64   pp;                     /* 0x01de */
@@@ -527,9 -528,6 +528,9 @@@ struct kvm_vcpu_stat 
  #define PGM_REGION_FIRST_TRANS                0x39
  #define PGM_REGION_SECOND_TRANS               0x3a
  #define PGM_REGION_THIRD_TRANS                0x3b
 +#define PGM_SECURE_STORAGE_ACCESS     0x3d
 +#define PGM_NON_SECURE_STORAGE_ACCESS 0x3e
 +#define PGM_SECURE_STORAGE_VIOLATION  0x3f
  #define PGM_MONITOR                   0x40
  #define PGM_PER                               0x80
  #define PGM_CRYPTO_OPERATION          0x119
@@@ -750,6 -748,8 +751,6 @@@ struct kvm_vcpu_arch 
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
        struct gmap *gmap;
 -      /* backup location for the currently enabled gmap when scheduled out */
 -      struct gmap *enabled_gmap;
        struct kvm_guestdbg_info_arch guestdbg;
        unsigned long pfault_token;
        unsigned long pfault_select;
diff --combined arch/s390/kvm/kvm-s390.c
index deeb32034ad5e98779795df61666afffa73549a9,0676c41ac9b8f30f0380f037e6a06ae26b4f18a9..442d4a227c0e68e69d5522c79f9a1c2d23af1d2f
@@@ -43,7 -43,6 +43,7 @@@
  #include <asm/sclp.h>
  #include <asm/cpacf.h>
  #include <asm/timex.h>
 +#include <asm/asm.h>
  #include <asm/fpu.h>
  #include <asm/ap.h>
  #include <asm/uv.h>
@@@ -341,13 -340,24 +341,23 @@@ static inline int plo_test_bit(unsigne
                "       lgr     0,%[function]\n"
                /* Parameter registers are ignored for "test bit" */
                "       plo     0,0,0,0(0)\n"
 -              "       ipm     %0\n"
 -              "       srl     %0,28\n"
 -              : "=d" (cc)
 +              CC_IPM(cc)
 +              : CC_OUT(cc, cc)
                : [function] "d" (function)
 -              : "cc", "0");
 -      return cc == 0;
 +              : CC_CLOBBER_LIST("0"));
 +      return CC_TRANSFORM(cc) == 0;
  }
  
+ static __always_inline void pfcr_query(u8 (*query)[16])
+ {
+       asm volatile(
+               "       lghi    0,0\n"
+               "       .insn   rsy,0xeb0000000016,0,0,%[query]\n"
+               : [query] "=QS" (*query)
+               :
+               : "cc", "0");
+ }
  static __always_inline void __sortl_query(u8 (*query)[32])
  {
        asm volatile(
@@@ -429,6 -439,9 +439,9 @@@ static void __init kvm_s390_cpu_feat_in
        if (test_facility(151)) /* DFLTCC */
                __dfltcc_query(&kvm_s390_available_subfunc.dfltcc);
  
+       if (test_facility(201)) /* PFCR */
+               pfcr_query(&kvm_s390_available_subfunc.pfcr);
        if (MACHINE_HAS_ESOP)
                allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
        /*
@@@ -799,6 -812,14 +812,14 @@@ int kvm_vm_ioctl_enable_cap(struct kvm 
                                set_kvm_facility(kvm->arch.model.fac_mask, 192);
                                set_kvm_facility(kvm->arch.model.fac_list, 192);
                        }
+                       if (test_facility(198)) {
+                               set_kvm_facility(kvm->arch.model.fac_mask, 198);
+                               set_kvm_facility(kvm->arch.model.fac_list, 198);
+                       }
+                       if (test_facility(199)) {
+                               set_kvm_facility(kvm->arch.model.fac_mask, 199);
+                               set_kvm_facility(kvm->arch.model.fac_list, 199);
+                       }
                        r = 0;
                } else
                        r = -EINVAL;
@@@ -1543,6 -1564,9 +1564,9 @@@ static int kvm_s390_set_processor_subfu
                 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1],
                 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2],
                 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]);
+       VM_EVENT(kvm, 3, "GET: guest PFCR   subfunc 0x%16.16lx.%16.16lx",
+                ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0],
+                ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]);
  
        return 0;
  }
@@@ -1757,6 -1781,9 +1781,9 @@@ static int kvm_s390_get_processor_subfu
                 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1],
                 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2],
                 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]);
+       VM_EVENT(kvm, 3, "GET: guest PFCR   subfunc 0x%16.16lx.%16.16lx",
+                ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0],
+                ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]);
  
        return 0;
  }
@@@ -1825,6 -1852,9 +1852,9 @@@ static int kvm_s390_get_machine_subfunc
                 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1],
                 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2],
                 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]);
+       VM_EVENT(kvm, 3, "GET: host  PFCR   subfunc 0x%16.16lx.%16.16lx",
+                ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0],
+                ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]);
  
        return 0;
  }
@@@ -3719,6 -3749,7 +3749,6 @@@ __u64 kvm_s390_get_cpu_timer(struct kvm
  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
  
 -      gmap_enable(vcpu->arch.enabled_gmap);
        kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING);
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __start_cpu_timer_accounting(vcpu);
@@@ -3731,6 -3762,8 +3761,6 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __stop_cpu_timer_accounting(vcpu);
        kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING);
 -      vcpu->arch.enabled_gmap = gmap_get_enabled();
 -      gmap_disable(vcpu->arch.enabled_gmap);
  
  }
  
@@@ -3748,6 -3781,8 +3778,6 @@@ void kvm_arch_vcpu_postcreate(struct kv
        }
        if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
                vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
 -      /* make vcpu_load load the right gmap on the first trigger */
 -      vcpu->arch.enabled_gmap = vcpu->arch.gmap;
  }
  
  static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr)
@@@ -3769,6 -3804,13 +3799,13 @@@ static bool kvm_has_pckmo_ecc(struct kv
  
  }
  
+ static bool kvm_has_pckmo_hmac(struct kvm *kvm)
+ {
+       /* At least one HMAC subfunction must be present */
+       return kvm_has_pckmo_subfunc(kvm, 118) ||
+              kvm_has_pckmo_subfunc(kvm, 122);
+ }
+ 
  static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
  {
        /*
        vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
        vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
        vcpu->arch.sie_block->eca &= ~ECA_APIE;
-       vcpu->arch.sie_block->ecd &= ~ECD_ECC;
+       vcpu->arch.sie_block->ecd &= ~(ECD_ECC | ECD_HMAC);
  
        if (vcpu->kvm->arch.crypto.apie)
                vcpu->arch.sie_block->eca |= ECA_APIE;
        /* Set up protected key support */
        if (vcpu->kvm->arch.crypto.aes_kw) {
                vcpu->arch.sie_block->ecb3 |= ECB3_AES;
-               /* ecc is also wrapped with AES key */
+               /* ecc/hmac is also wrapped with AES key */
                if (kvm_has_pckmo_ecc(vcpu->kvm))
                        vcpu->arch.sie_block->ecd |= ECD_ECC;
+               if (kvm_has_pckmo_hmac(vcpu->kvm))
+                       vcpu->arch.sie_block->ecd |= ECD_HMAC;
        }
  
        if (vcpu->kvm->arch.crypto.dea_kw)
@@@ -4574,6 -4618,22 +4613,6 @@@ int kvm_s390_try_set_tod_clock(struct k
        return 1;
  }
  
 -/**
 - * kvm_arch_fault_in_page - fault-in guest page if necessary
 - * @vcpu: The corresponding virtual cpu
 - * @gpa: Guest physical address
 - * @writable: Whether the page should be writable or not
 - *
 - * Make sure that a guest page has been faulted-in on the host.
 - *
 - * Return: Zero on success, negative error code otherwise.
 - */
 -long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable)
 -{
 -      return gmap_fault(vcpu->arch.gmap, gpa,
 -                        writable ? FAULT_FLAG_WRITE : 0);
 -}
 -
  static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
                                      unsigned long token)
  {
@@@ -4641,11 -4701,12 +4680,11 @@@ static bool kvm_arch_setup_async_pf(str
        if (!vcpu->arch.gmap->pfault_enabled)
                return false;
  
 -      hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
 -      hva += current->thread.gmap_addr & ~PAGE_MASK;
 +      hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr);
        if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
                return false;
  
 -      return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
 +      return kvm_setup_async_pf(vcpu, current->thread.gmap_teid.addr * PAGE_SIZE, hva, &arch);
  }
  
  static int vcpu_pre_run(struct kvm_vcpu *vcpu)
        clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
  
        vcpu->arch.sie_block->icptcode = 0;
 +      current->thread.gmap_int_code = 0;
        cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
        VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
        trace_kvm_s390_sie_enter(vcpu, cpuflags);
        return 0;
  }
  
 -static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
 +static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu)
  {
        struct kvm_s390_pgm_info pgm_info = {
                .code = PGM_ADDRESSING,
        return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
  }
  
 +static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
 +{
 +      unsigned int flags = 0;
 +      unsigned long gaddr;
 +      int rc = 0;
 +
 +      gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
 +      if (kvm_s390_cur_gmap_fault_is_write())
 +              flags = FAULT_FLAG_WRITE;
 +
 +      switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) {
 +      case 0:
 +              vcpu->stat.exit_null++;
 +              break;
 +      case PGM_NON_SECURE_STORAGE_ACCESS:
 +              KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
 +                      "Unexpected program interrupt 0x%x, TEID 0x%016lx",
 +                      current->thread.gmap_int_code, current->thread.gmap_teid.val);
 +              /*
 +               * This is normal operation; a page belonging to a protected
 +               * guest has not been imported yet. Try to import the page into
 +               * the protected guest.
 +               */
 +              if (gmap_convert_to_secure(vcpu->arch.gmap, gaddr) == -EINVAL)
 +                      send_sig(SIGSEGV, current, 0);
 +              break;
 +      case PGM_SECURE_STORAGE_ACCESS:
 +      case PGM_SECURE_STORAGE_VIOLATION:
 +              KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
 +                      "Unexpected program interrupt 0x%x, TEID 0x%016lx",
 +                      current->thread.gmap_int_code, current->thread.gmap_teid.val);
 +              /*
 +               * This can happen after a reboot with asynchronous teardown;
 +               * the new guest (normal or protected) will run on top of the
 +               * previous protected guest. The old pages need to be destroyed
 +               * so the new guest can use them.
 +               */
 +              if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) {
 +                      /*
 +                       * Either KVM messed up the secure guest mapping or the
 +                       * same page is mapped into multiple secure guests.
 +                       *
 +                       * This exception is only triggered when a guest 2 is
 +                       * running and can therefore never occur in kernel
 +                       * context.
 +                       */
 +                      pr_warn_ratelimited("Secure storage violation (%x) in task: %s, pid %d\n",
 +                                          current->thread.gmap_int_code, current->comm,
 +                                          current->pid);
 +                      send_sig(SIGSEGV, current, 0);
 +              }
 +              break;
 +      case PGM_PROTECTION:
 +      case PGM_SEGMENT_TRANSLATION:
 +      case PGM_PAGE_TRANSLATION:
 +      case PGM_ASCE_TYPE:
 +      case PGM_REGION_FIRST_TRANS:
 +      case PGM_REGION_SECOND_TRANS:
 +      case PGM_REGION_THIRD_TRANS:
 +              KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
 +                      "Unexpected program interrupt 0x%x, TEID 0x%016lx",
 +                      current->thread.gmap_int_code, current->thread.gmap_teid.val);
 +              if (vcpu->arch.gmap->pfault_enabled) {
 +                      rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT);
 +                      if (rc == -EFAULT)
 +                              return vcpu_post_run_addressing_exception(vcpu);
 +                      if (rc == -EAGAIN) {
 +                              trace_kvm_s390_major_guest_pfault(vcpu);
 +                              if (kvm_arch_setup_async_pf(vcpu))
 +                                      return 0;
 +                              vcpu->stat.pfault_sync++;
 +                      } else {
 +                              return rc;
 +                      }
 +              }
 +              rc = gmap_fault(vcpu->arch.gmap, gaddr, flags);
 +              if (rc == -EFAULT) {
 +                      if (kvm_is_ucontrol(vcpu->kvm)) {
 +                              vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
 +                              vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
 +                              vcpu->run->s390_ucontrol.pgm_code = 0x10;
 +                              return -EREMOTE;
 +                      }
 +                      return vcpu_post_run_addressing_exception(vcpu);
 +              }
 +              break;
 +      default:
 +              KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
 +                      current->thread.gmap_int_code, current->thread.gmap_teid.val);
 +              send_sig(SIGSEGV, current, 0);
 +              break;
 +      }
 +      return rc;
 +}
 +
  static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
  {
        struct mcck_volatile_info *mcck_info;
        struct sie_page *sie_page;
 +      int rc;
  
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
                   vcpu->arch.sie_block->icptcode);
        }
  
        if (vcpu->arch.sie_block->icptcode > 0) {
 -              int rc = kvm_handle_sie_intercept(vcpu);
 +              rc = kvm_handle_sie_intercept(vcpu);
  
                if (rc != -EOPNOTSUPP)
                        return rc;
                vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa;
                vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb;
                return -EREMOTE;
 -      } else if (exit_reason != -EFAULT) {
 -              vcpu->stat.exit_null++;
 -              return 0;
 -      } else if (kvm_is_ucontrol(vcpu->kvm)) {
 -              vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
 -              vcpu->run->s390_ucontrol.trans_exc_code =
 -                                              current->thread.gmap_addr;
 -              vcpu->run->s390_ucontrol.pgm_code = 0x10;
 -              return -EREMOTE;
 -      } else if (current->thread.gmap_pfault) {
 -              trace_kvm_s390_major_guest_pfault(vcpu);
 -              current->thread.gmap_pfault = 0;
 -              if (kvm_arch_setup_async_pf(vcpu))
 -                      return 0;
 -              vcpu->stat.pfault_sync++;
 -              return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
        }
 -      return vcpu_post_run_fault_in_sie(vcpu);
 +
 +      return vcpu_post_run_handle_fault(vcpu);
  }
  
  #define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
@@@ -4895,7 -4874,7 +4934,7 @@@ static int __vcpu_run(struct kvm_vcpu *
                }
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs,
 -                                   gmap_get_enabled()->asce);
 +                                   vcpu->arch.gmap->asce);
                if (kvm_s390_pv_cpu_is_protected(vcpu)) {
                        memcpy(vcpu->run->s.regs.gprs,
                               sie_page->pv_grregs,
diff --combined arch/s390/kvm/vsie.c
index d3cdde1b18e5b10e3fe86ec92d3a7cfce5e2a34f,f3ae697089ee6acec492179b76423393ca0a7286..150b9387860ad2e511a5e8b4a0e9c73b8a12ddbb
@@@ -335,7 -335,8 +335,8 @@@ static int shadow_crycb(struct kvm_vcp
        /* we may only allow it if enabled for guest 2 */
        ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
                     (ECB3_AES | ECB3_DEA);
-       ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC;
+       ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd &
+                    (ECD_ECC | ECD_HMAC);
        if (!ecb3_flags && !ecd_flags)
                goto end;
  
@@@ -661,7 -662,7 +662,7 @@@ static int pin_guest_page(struct kvm *k
        struct page *page;
  
        page = gfn_to_page(kvm, gpa_to_gfn(gpa));
-       if (is_error_page(page))
+       if (!page)
                return -EINVAL;
        *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
        return 0;
  /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
  static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
  {
-       kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
+       kvm_release_page_dirty(pfn_to_page(hpa >> PAGE_SHIFT));
        /* mark the page always as dirty for migration */
        mark_page_dirty(kvm, gpa_to_gfn(gpa));
  }
@@@ -922,19 -923,19 +923,19 @@@ static int handle_fault(struct kvm_vcp
  {
        int rc;
  
 -      if (current->thread.gmap_int_code == PGM_PROTECTION)
 +      if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
                /* we can directly forward all protection exceptions */
                return inject_fault(vcpu, PGM_PROTECTION,
 -                                  current->thread.gmap_addr, 1);
 +                                  current->thread.gmap_teid.addr * PAGE_SIZE, 1);
  
        rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
 -                                 current->thread.gmap_addr, NULL);
 +                                 current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
        if (rc > 0) {
                rc = inject_fault(vcpu, rc,
 -                                current->thread.gmap_addr,
 -                                current->thread.gmap_write_flag);
 +                                current->thread.gmap_teid.addr * PAGE_SIZE,
 +                                kvm_s390_cur_gmap_fault_is_write());
                if (rc >= 0)
 -                      vsie_page->fault_addr = current->thread.gmap_addr;
 +                      vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
        }
        return rc;
  }
@@@ -1148,10 -1149,9 +1149,10 @@@ static int do_vsie_run(struct kvm_vcpu 
         * also kick the vSIE.
         */
        vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
 +      current->thread.gmap_int_code = 0;
        barrier();
        if (!kvm_s390_vcpu_sie_inhibited(vcpu))
 -              rc = sie64a(scb_s, vcpu->run->s.regs.gprs, gmap_get_enabled()->asce);
 +              rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
        barrier();
        vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
  
  
        if (rc > 0)
                rc = 0; /* we could still have an icpt */
 -      else if (rc == -EFAULT)
 +      else if (current->thread.gmap_int_code)
                return handle_fault(vcpu, vsie_page);
  
        switch (scb_s->icptcode) {
@@@ -1296,8 -1296,10 +1297,8 @@@ static int vsie_run(struct kvm_vcpu *vc
                if (!rc)
                        rc = map_prefix(vcpu, vsie_page);
                if (!rc) {
 -                      gmap_enable(vsie_page->gmap);
                        update_intervention_requests(vsie_page);
                        rc = do_vsie_run(vcpu, vsie_page);
 -                      gmap_enable(vcpu->arch.gmap);
                }
                atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
  
diff --combined arch/x86/include/asm/cpufeatures.h
index ea33439a5d006481fd0e960a01568662d196e3be,d96277dceabfe9b07e1b8dbe308aa0fc85e28087..17b6590748c00cc11f4a527255679d3eb2475a31
  #define X86_FEATURE_ZEN1              (11*32+31) /* CPU based on Zen1 microarchitecture */
  
  /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_SHA512            (12*32+ 0) /* SHA512 instructions */
+ #define X86_FEATURE_SM3                       (12*32+ 1) /* SM3 instructions */
+ #define X86_FEATURE_SM4                       (12*32+ 2) /* SM4 instructions */
  #define X86_FEATURE_AVX_VNNI          (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */
  #define X86_FEATURE_AVX512_BF16               (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */
  #define X86_FEATURE_CMPCCXADD           (12*32+ 7) /* CMPccXADD instructions */
  #define X86_FEATURE_BHI_CTRL          (21*32+ 2) /* BHI_DIS_S HW control available */
  #define X86_FEATURE_CLEAR_BHB_HW      (21*32+ 3) /* BHI_DIS_S HW control enabled */
  #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
 -#define X86_FEATURE_FAST_CPPC         (21*32 + 5) /* AMD Fast CPPC */
 +#define X86_FEATURE_AMD_FAST_CPPC     (21*32 + 5) /* Fast CPPC */
 +#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */
 +#define X86_FEATURE_AMD_WORKLOAD_CLASS        (21*32 + 7) /* Workload Classification */
  
  /*
   * BUG word(s)
diff --combined arch/x86/kvm/svm/sev.c
index 92d4711fd1e44f387f550ece48afc0b0c3ce2dce,72674b8825c47e39ff04e7d6562c50b1a9fc333e..943bd074a5d37212a1fdf1f01e42f1c8f0e416c6
@@@ -533,12 -533,17 +533,12 @@@ static int sev_bind_asid(struct kvm *kv
  
  static int __sev_issue_cmd(int fd, int id, void *data, int *error)
  {
 -      struct fd f;
 -      int ret;
 +      CLASS(fd, f)(fd);
  
 -      f = fdget(fd);
 -      if (!fd_file(f))
 +      if (fd_empty(f))
                return -EBADF;
  
 -      ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
 -
 -      fdput(f);
 -      return ret;
 +      return sev_issue_cmd_external_user(fd_file(f), id, data, error);
  }
  
  static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
@@@ -2071,21 -2076,23 +2071,21 @@@ int sev_vm_move_enc_context_from(struc
  {
        struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_info *src_sev, *cg_cleanup_sev;
 -      struct fd f = fdget(source_fd);
 +      CLASS(fd, f)(source_fd);
        struct kvm *source_kvm;
        bool charged = false;
        int ret;
  
 -      if (!fd_file(f))
 +      if (fd_empty(f))
                return -EBADF;
  
 -      if (!file_is_kvm(fd_file(f))) {
 -              ret = -EBADF;
 -              goto out_fput;
 -      }
 +      if (!file_is_kvm(fd_file(f)))
 +              return -EBADF;
  
        source_kvm = fd_file(f)->private_data;
        ret = sev_lock_two_vms(kvm, source_kvm);
        if (ret)
 -              goto out_fput;
 +              return ret;
  
        if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
            sev_guest(kvm) || !sev_guest(source_kvm)) {
@@@ -2132,6 -2139,8 +2132,6 @@@ out_dst_cgroup
        cg_cleanup_sev->misc_cg = NULL;
  out_unlock:
        sev_unlock_two_vms(kvm, source_kvm);
 -out_fput:
 -      fdput(f);
        return ret;
  }
  
@@@ -2792,21 -2801,23 +2792,21 @@@ failed
  
  int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
  {
 -      struct fd f = fdget(source_fd);
 +      CLASS(fd, f)(source_fd);
        struct kvm *source_kvm;
        struct kvm_sev_info *source_sev, *mirror_sev;
        int ret;
  
 -      if (!fd_file(f))
 +      if (fd_empty(f))
                return -EBADF;
  
 -      if (!file_is_kvm(fd_file(f))) {
 -              ret = -EBADF;
 -              goto e_source_fput;
 -      }
 +      if (!file_is_kvm(fd_file(f)))
 +              return -EBADF;
  
        source_kvm = fd_file(f)->private_data;
        ret = sev_lock_two_vms(kvm, source_kvm);
        if (ret)
 -              goto e_source_fput;
 +              return ret;
  
        /*
         * Mirrors of mirrors should work, but let's not get silly.  Also
  
  e_unlock:
        sev_unlock_two_vms(kvm, source_kvm);
 -e_source_fput:
 -      fdput(f);
        return ret;
  }
  
@@@ -3458,7 -3471,7 +3458,7 @@@ void sev_es_unmap_ghcb(struct vcpu_svm 
  
        sev_es_sync_to_ghcb(svm);
  
-       kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true);
+       kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
        svm->sev_es.ghcb = NULL;
  }
  
@@@ -3839,6 -3852,7 +3839,7 @@@ static int __sev_snp_update_protected_g
        if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
                gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
                struct kvm_memory_slot *slot;
+               struct page *page;
                kvm_pfn_t pfn;
  
                slot = gfn_to_memslot(vcpu->kvm, gfn);
                 * The new VMSA will be private memory guest memory, so
                 * retrieve the PFN from the gmem backend.
                 */
-               if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
+               if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
                        return -EINVAL;
  
                /*
                 * changes then care should be taken to ensure
                 * svm->sev_es.vmsa is pinned through some other means.
                 */
-               kvm_release_pfn_clean(pfn);
+               kvm_release_page_clean(page);
        }
  
        /*
@@@ -4678,6 -4692,7 +4679,7 @@@ void sev_handle_rmp_fault(struct kvm_vc
        struct kvm_memory_slot *slot;
        struct kvm *kvm = vcpu->kvm;
        int order, rmp_level, ret;
+       struct page *page;
        bool assigned;
        kvm_pfn_t pfn;
        gfn_t gfn;
                return;
        }
  
-       ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order);
+       ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
        if (ret) {
                pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
                                    gpa);
  out:
        trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
  out_no_trace:
-       put_page(pfn_to_page(pfn));
+       kvm_release_page_unused(page);
  }
  
  static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)