Merge tag 'x86_sev_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author		Linus Torvalds <[email protected]>	Tue, 24 May 2022 00:38:01 +0000 (17:38 -0700)
committer	Linus Torvalds <[email protected]>	Tue, 24 May 2022 00:38:01 +0000 (17:38 -0700)
Pull AMD SEV-SNP support from Borislav Petkov:
 "The third AMD confidential computing feature called Secure Nested
  Paging.

  Add to confidential guests the necessary memory integrity protection
  against malicious hypervisor-based attacks like data replay, memory
  remapping and others, thus achieving stronger isolation from the
  hypervisor.

  At the core of the functionality is a new structure called a reverse
  map table (RMP), with which the guest has a say in which pages get
  assigned to it, and gets notified when a page it owns is accessed or
  modified under the covers, so that it can take appropriate action.

  In addition, add support for the whole machinery needed to launch an
  SNP guest, details of which are properly explained in each patch.

  And last but not least, the series refactors and improves parts of
  the previous SEV support so that the new code is accommodated
  properly and not just bolted on"

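At the instruction level, the guest drives RMP validation with the new PVALIDATE instruction. Below is a minimal sketch of the guest-side wrapper, modeled on pvalidate() in arch/x86/include/asm/sev.h (raw opcode bytes stand in for the mnemonic, which older assemblers lack); treat it as illustrative rather than the authoritative implementation:

    /* Validate or rescind a guest page in the RMP.  vaddr must be a
     * guest virtual address mapping the page, rmp_psize selects 4K vs
     * 2M, and validate toggles the Validated bit.  Carry flag set
     * means the RMP entry was already in the requested state. */
    static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
    {
            bool no_rmpupdate;
            int rc;

            /* "pvalidate" encoded by hand: F2 0F 01 FF */
            asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t"
                         CC_SET(c)
                         : CC_OUT(c) (no_rmpupdate), "=a"(rc)
                         : "a"(vaddr), "c"(rmp_psize), "d"(validate)
                         : "memory", "cc");

            if (no_rmpupdate)
                    return PVALIDATE_FAIL_NOUPDATE;

            return rc;
    }
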
* tag 'x86_sev_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  x86/entry: Fixup objtool/ibt validation
  x86/sev: Mark the code returning to user space as syscall gap
  x86/sev: Annotate stack change in the #VC handler
  x86/sev: Remove duplicated assignment to variable info
  x86/sev: Fix address space sparse warning
  x86/sev: Get the AP jump table address from secrets page
  x86/sev: Add missing __init annotations to SEV init routines
  virt: sevguest: Rename the sevguest dir and files to sev-guest
  virt: sevguest: Change driver name to reflect generic SEV support
  x86/boot: Put globals that are accessed early into the .data section
  x86/boot: Add an efi.h header for the decompressor
  virt: sevguest: Fix bool function returning negative value
  virt: sevguest: Fix return value check in alloc_shared_pages()
  x86/sev-es: Replace open-coded hlt-loop with sev_es_terminate()
  virt: sevguest: Add documentation for SEV-SNP CPUID Enforcement
  virt: sevguest: Add support to get extended report
  virt: sevguest: Add support to derive key
  virt: Add SEV-SNP guest driver
  x86/sev: Register SEV-SNP guest request platform device
  x86/sev: Provide support for SNP guest request NAEs
  ...

Documentation/admin-guide/kernel-parameters.txt
arch/x86/entry/entry_64.S
arch/x86/include/asm/msr-index.h
arch/x86/kernel/cpu/common.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
drivers/virt/Kconfig
drivers/virt/Makefile
include/linux/efi.h

index 297c852c53f8c8b4e0014daa8ec53999aba05c3d,48ad2ec0770b7a8ec3b14cc756db7a056f8b225e..0c648a867dfa22907ba778269371c79ef128c4a4
                        when set.
                        Format: <int>
  
 -      libata.force=   [LIBATA] Force configurations.  The format is comma-
 -                      separated list of "[ID:]VAL" where ID is
 -                      PORT[.DEVICE].  PORT and DEVICE are decimal numbers
 -                      matching port, link or device.  Basically, it matches
 -                      the ATA ID string printed on console by libata.  If
 -                      the whole ID part is omitted, the last PORT and DEVICE
 -                      values are used.  If ID hasn't been specified yet, the
 -                      configuration applies to all ports, links and devices.
 +      libata.force=   [LIBATA] Force configurations.  The format is comma-
 +                      separated list of "[ID:]VAL" where ID is PORT[.DEVICE].
 +                      PORT and DEVICE are decimal numbers matching port, link
 +                      or device.  Basically, it matches the ATA ID string
 +                      printed on console by libata.  If the whole ID part is
 +                      omitted, the last PORT and DEVICE values are used.  If
 +                      ID hasn't been specified yet, the configuration applies
 +                      to all ports, links and devices.
  
                        If only DEVICE is omitted, the parameter applies to
                        the port and all links and devices behind it.  DEVICE
                        host link and device attached to it.
  
                        The VAL specifies the configuration to force.  As long
 -                      as there's no ambiguity shortcut notation is allowed.
 +                      as there is no ambiguity, shortcut notation is allowed.
                        For example, both 1.5 and 1.5G would work for 1.5Gbps.
                        The following configurations can be forced.
  
                          udma[/][16,25,33,44,66,100,133] notation is also
                          allowed.
  
 +                      * nohrst, nosrst, norst: suppress hard, soft and both
 +                        resets.
 +
 +                      * rstonce: only attempt one reset during hot-unplug
 +                        link recovery.
 +
 +                      * [no]dbdelay: Enable or disable the extra 200ms delay
 +                        before debouncing a link PHY and device presence
 +                        detection.
 +
                        * [no]ncq: Turn on or off NCQ.
  
 -                      * [no]ncqtrim: Turn off queued DSM TRIM.
 +                      * [no]ncqtrim: Enable or disable queued DSM TRIM.
 +
 +                      * [no]ncqati: Enable or disable NCQ trim on ATI chipset.
 +
 +                      * [no]trim: Enable or disable (unqueued) TRIM.
 +
 +                      * trim_zero: Indicate that TRIM command zeroes data.
 +
 +                      * max_trim_128m: Set 128M maximum trim size limit.
 +
 +                      * [no]dma: Turn on or off DMA transfers.
 +
 +                      * atapi_dmadir: Enable ATAPI DMADIR bridge support.
 +
 +                      * atapi_mod16_dma: Enable the use of ATAPI DMA for
 +                        commands that are not a multiple of 16 bytes.
  
 -                      * nohrst, nosrst, norst: suppress hard, soft
 -                        and both resets.
 +                      * [no]dmalog: Enable or disable the use of the
 +                        READ LOG DMA EXT command to access logs.
  
 -                      * rstonce: only attempt one reset during
 -                        hot-unplug link recovery
 +                      * [no]iddevlog: Enable or disable access to the
 +                        identify device data log.
  
 -                      * dump_id: dump IDENTIFY data.
 +                      * [no]logdir: Enable or disable access to the general
 +                        purpose log directory.
  
 -                      * atapi_dmadir: Enable ATAPI DMADIR bridge support
 +                      * max_sec_128: Set transfer size limit to 128 sectors.
 +
 +                      * max_sec_1024: Set or clear transfer size limit to
 +                        1024 sectors.
 +
 +                      * max_sec_lba48: Set or clear transfer size limit to
 +                        65535 sectors.
 +
 +                      * [no]lpm: Enable or disable link power management.
 +
 +                      * [no]setxfer: Indicate if transfer speed mode setting
 +                        should be skipped.
 +
 +                      * dump_id: Dump IDENTIFY data.
  
                        * disable: Disable this device.
  
  
        rcupdate.rcu_cpu_stall_timeout= [KNL]
                        Set timeout for RCU CPU stall warning messages.
 +                      The value is in seconds and the maximum allowed
 +                      value is 300 seconds.
 +
 +      rcupdate.rcu_exp_cpu_stall_timeout= [KNL]
 +                      Set timeout for expedited RCU CPU stall warning
 +                      messages.  The value is in milliseconds
 +                      and the maximum allowed value is 21000
 +                      milliseconds. Please note that this value is
 +                      adjusted to an arch timer tick resolution.
 +                      Setting this to zero causes the value from
 +                      rcupdate.rcu_cpu_stall_timeout to be used (after
 +                      conversion from seconds to milliseconds).
  
        rcupdate.rcu_expedited= [KNL]
                        Use expedited grace-period primitives, for
                        number avoids disturbing real-time workloads,
                        but lengthens grace periods.
  
 +      rcupdate.rcu_task_stall_info= [KNL]
 +                      Set initial timeout in jiffies for RCU task stall
 +                      informational messages, which give some indication
 +                      of the problem for those not patient enough to
 +                      wait for ten minutes.  Informational messages are
 +                      only printed prior to the stall-warning message
 +                      for a given grace period. Disable with a value
 +                      less than or equal to zero.  Defaults to ten
 +                      seconds.  A change in value does not take effect
 +                      until the beginning of the next grace period.
 +
 +      rcupdate.rcu_task_stall_info_mult= [KNL]
 +                      Multiplier for time interval between successive
 +                      RCU task stall informational messages for a given
 +                      RCU tasks grace period.  This value is clamped
 +                      to one through ten, inclusive.  It defaults to
 +                      the value three, so that the first informational
 +                      message is printed 10 seconds into the grace
 +                      period, the second at 40 seconds, the third at
 +                      160 seconds, and then the stall warning at 600
 +                      seconds would prevent a fourth at 640 seconds.
 +
        rcupdate.rcu_task_stall_timeout= [KNL]
 -                      Set timeout in jiffies for RCU task stall warning
 -                      messages.  Disable with a value less than or equal
 -                      to zero.
 +                      Set timeout in jiffies for RCU task stall
 +                      warning messages.  Disable with a value less
 +                      than or equal to zero.  Defaults to ten minutes.
 +                      A change in value does not take effect until
 +                      the beginning of the next grace period.
  
        rcupdate.rcu_self_test= [KNL]
                        Run the RCU early boot self tests
  
        serialnumber    [BUGS=X86-32]
  
+       sev=option[,option...] [X86-64] See Documentation/x86/x86_64/boot-options.rst
        shapers=        [NET]
                        Maximal number of shapers.
  
        smart2=         [HW]
                        Format: <io1>[,<io2>[,...,<io8>]]
  
 +      smp.csd_lock_timeout= [KNL]
 +                      Specify the period of time in milliseconds
 +                      that smp_call_function() and friends will wait
 +                      for a CPU to release the CSD lock.  This is
 +                      useful when diagnosing bugs involving CPUs
 +                      disabling interrupts for extended periods
 +                      of time.  Defaults to 5,000 milliseconds, and
 +                      setting a value of zero disables this feature.
 +                      This feature may be more efficiently disabled
 +                      using the csdlock_debug- kernel parameter.
 +
        smsc-ircc2.nopnp        [HW] Don't use PNP to discover SMC devices
        smsc-ircc2.ircc_cfg=    [HW] Device configuration I/O port
        smsc-ircc2.ircc_sir=    [HW] SIR base I/O port
                        off:    Disable mitigation and remove
                                performance impact to RDRAND and RDSEED
  
 +      srcutree.big_cpu_lim [KNL]
 +                      Specifies the number of CPUs constituting a
 +                      large system, such that srcu_struct structures
 +                      should immediately allocate an srcu_node array.
 +                      This kernel-boot parameter defaults to 128,
 +                      but takes effect only when the low-order four
 +                      bits of srcutree.convert_to_big is equal to 3
 +                      (decide at boot).
 +
 +      srcutree.convert_to_big [KNL]
 +                      Specifies under what conditions an SRCU tree
 +                      srcu_struct structure will be converted to big
 +                      form, that is, with an rcu_node tree:
 +
 +                                 0:  Never.
 +                                 1:  At init_srcu_struct() time.
 +                                 2:  When rcutorture decides to.
 +                                 3:  Decide at boot time (default).
 +                              0x1X:  Above plus if high contention.
 +
 +                      Either way, the srcu_node tree will be sized based
 +                      on the actual runtime number of CPUs (nr_cpu_ids)
 +                      instead of the compile-time CONFIG_NR_CPUS.
 +
        srcutree.counter_wrap_check [KNL]
                        Specifies how frequently to check for
                        grace-period sequence counter wrap for the
                        expediting.  Set to zero to disable automatic
                        expediting.
  
 +      srcutree.small_contention_lim [KNL]
 +                      Specifies the number of update-side contention
 +                      events per jiffy will be tolerated before
 +                      initiating a conversion of an srcu_struct
 +                      structure to big form.  Note that the value of
 +                      srcutree.convert_to_big must have the 0x10 bit
 +                      set for contention-based conversions to occur.
 +
        ssbd=           [ARM64,HW]
                        Speculative Store Bypass Disable control
  
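
For illustration, the new diagnostics knobs documented above could be combined on a boot command line as follows (values are examples only, chosen within the documented limits):

    rcupdate.rcu_cpu_stall_timeout=60 rcupdate.rcu_exp_cpu_stall_timeout=5000 \
    rcupdate.rcu_task_stall_info_mult=2 smp.csd_lock_timeout=10000
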
index 73d958522b6a47e285b9cf05c8562fdd58894141,58a2d764fa39b540f4a06525100e96efd3c96084..c8119dd975565c437cf1982da3ddca098cbc33ab
@@@ -215,8 -215,13 +215,13 @@@ syscall_return_via_sysret
  
        popq    %rdi
        popq    %rsp
+ SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+       ANNOTATE_NOENDBR
        swapgs
        sysretq
+ SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+       ANNOTATE_NOENDBR
+       int3
  SYM_CODE_END(entry_SYSCALL_64)
  
  /*
@@@ -337,9 -342,6 +342,9 @@@ SYM_CODE_END(ret_from_fork
  
        call    \cfunc
  
 +      /* For some configurations \cfunc ends up being a noreturn. */
 +      REACHABLE
 +
        jmp     error_return
  .endm
  
@@@ -508,6 -510,7 +513,7 @@@ SYM_CODE_START(\asmsym
        call    vc_switch_off_ist
        movq    %rax, %rsp              /* Switch to new stack */
  
+       ENCODE_FRAME_POINTER
        UNWIND_HINT_REGS
  
        /* Update pt_regs */
index ee15311b6be1d99e2bea11bd4c03a8a36fd8c706,ef96f166b1b6498af3e09b38e20d95383bf93cfe..5555b2f9af462e4e67da0a4b69b0df6fe6cd2f52
  #define TSX_CTRL_RTM_DISABLE          BIT(0)  /* Disable RTM feature */
  #define TSX_CTRL_CPUID_CLEAR          BIT(1)  /* Disable TSX enumeration */
  
 -/* SRBDS support */
  #define MSR_IA32_MCU_OPT_CTRL         0x00000123
 -#define RNGDS_MITG_DIS                        BIT(0)
 +#define RNGDS_MITG_DIS                        BIT(0)  /* SRBDS support */
 +#define RTM_ALLOW                     BIT(1)  /* TSX development mode */
  
  #define MSR_IA32_SYSENTER_CS          0x00000174
  #define MSR_IA32_SYSENTER_ESP         0x00000175
  #define MSR_AMD64_SEV                 0xc0010131
  #define MSR_AMD64_SEV_ENABLED_BIT     0
  #define MSR_AMD64_SEV_ES_ENABLED_BIT  1
+ #define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
  #define MSR_AMD64_SEV_ENABLED         BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
  #define MSR_AMD64_SEV_ES_ENABLED      BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+ #define MSR_AMD64_SEV_SNP_ENABLED     BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
  
  #define MSR_AMD64_VIRT_SPEC_CTRL      0xc001011f
  
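
The guest discovers which SEV flavor it is running under by reading this status MSR; a hedged sketch follows (the kernel's real detection also consults CPUID leaf 0x8000001f and, for SNP, the CC blob):

    /* Sketch: report the active SEV feature level from MSR 0xc0010131.
     * Illustrative only — assumes SEV is already known to be present. */
    u64 sev_status = __rdmsr(MSR_AMD64_SEV);

    if (sev_status & MSR_AMD64_SEV_SNP_ENABLED)
            pr_info("AMD SEV-SNP guest\n");
    else if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
            pr_info("AMD SEV-ES guest\n");
    else if (sev_status & MSR_AMD64_SEV_ENABLED)
            pr_info("AMD SEV guest\n");
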
index e342ae4db3c4de5e88456b72e727cc0def5e6978,9e45521338721f05cc3c8b92997e5ed64a966daf..f0baf1b7522eea9865be08c770cb122ba7866571
@@@ -60,6 -60,7 +60,7 @@@
  #include <asm/uv/uv.h>
  #include <asm/sigframe.h>
  #include <asm/traps.h>
+ #include <asm/sev.h>
  
  #include "cpu.h"
  
@@@ -1855,8 -1856,6 +1856,8 @@@ void identify_secondary_cpu(struct cpui
        validate_apic_and_package_id(c);
        x86_spec_ctrl_setup_ap();
        update_srbds_msr();
 +
 +      tsx_ap_init();
  }
  
  static __init int setup_noclflush(char *arg)
@@@ -2126,6 -2125,9 +2127,9 @@@ void cpu_init_exception_handling(void
  
        load_TR_desc();
  
+       /* GHCB needs to be setup to handle #VC. */
+       setup_ghcb();
        /* Finally load the IDT */
        load_current_idt();
  }
diff --combined arch/x86/kvm/cpuid.c
index 0c1ba6aa07651f4d2698b004c35c27aa1b406118,4b62d80bb22f5b1ea87c8fea4eac87cb038c99ea..de6d44e07e348fcdae81841f8dd516fdea9781bf
@@@ -19,6 -19,7 +19,7 @@@
  #include <asm/user.h>
  #include <asm/fpu/xstate.h>
  #include <asm/sgx.h>
+ #include <asm/cpuid.h>
  #include "cpuid.h"
  #include "lapic.h"
  #include "mmu.h"
@@@ -744,24 -745,8 +745,8 @@@ static struct kvm_cpuid_entry2 *do_host
        cpuid_count(entry->function, entry->index,
                    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
  
-       switch (function) {
-       case 4:
-       case 7:
-       case 0xb:
-       case 0xd:
-       case 0xf:
-       case 0x10:
-       case 0x12:
-       case 0x14:
-       case 0x17:
-       case 0x18:
-       case 0x1d:
-       case 0x1e:
-       case 0x1f:
-       case 0x8000001d:
+       if (cpuid_function_is_indexed(function))
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               break;
-       }
  
        return entry;
  }
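
The replacement helper lives in the new <asm/cpuid.h>; judging from the switch it replaces, it presumably looks roughly like this:

    /* True for CPUID leaves whose output varies with the ECX index.
     * Derived from the leaf list removed above; illustrative sketch. */
    static __always_inline bool cpuid_function_is_indexed(u32 function)
    {
            switch (function) {
            case 4: case 7: case 0xb: case 0xd: case 0xf: case 0x10:
            case 0x12: case 0x14: case 0x17: case 0x18: case 0x1d:
            case 0x1e: case 0x1f: case 0x8000001d:
                    return true;
            }

            return false;
    }
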
@@@ -887,11 -872,6 +872,11 @@@ static inline int __do_cpuid_func(struc
                union cpuid10_eax eax;
                union cpuid10_edx edx;
  
 +              if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
 +                      entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 +                      break;
 +              }
 +
                perf_get_x86_pmu_capability(&cap);
  
                /*
        case 0x80000000:
                entry->eax = min(entry->eax, 0x80000021);
                /*
 -               * Serializing LFENCE is reported in a multitude of ways,
 -               * and NullSegClearsBase is not reported in CPUID on Zen2;
 -               * help userspace by providing the CPUID leaf ourselves.
 +               * Serializing LFENCE is reported in a multitude of ways, and
 +               * NullSegClearsBase is not reported in CPUID on Zen2; help
 +               * userspace by providing the CPUID leaf ourselves.
 +               *
 +               * However, only do it if the host has CPUID leaf 0x8000001d.
 +               * QEMU thinks that it can query the host blindly for that
 +               * CPUID leaf if KVM reports that it supports 0x8000001d or
 +               * above.  The processor merrily returns values from the
 +               * highest Intel leaf which QEMU tries to use as the guest's
 +               * 0x8000001d.  Even worse, this can result in an infinite
 +               * loop if said highest leaf has no subleaves indexed by ECX.
                 */
 -              if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
 -                  || !static_cpu_has_bug(X86_BUG_NULL_SEG))
 +              if (entry->eax >= 0x8000001d &&
 +                  (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
 +                   || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
                        entry->eax = max(entry->eax, 0x80000021);
                break;
        case 0x80000001:
diff --combined arch/x86/kvm/svm/sev.c
index 7c392873626fd6e1cbc17a4559e736201e6325d7,6e18ec1839f0a9eb222be8e601af7d7984ffbc09..636c77ef55fc3be26d9b6e2cd8b19f5cfecbc4cc
@@@ -260,8 -260,6 +260,8 @@@ static int sev_guest_init(struct kvm *k
        INIT_LIST_HEAD(&sev->regions_list);
        INIT_LIST_HEAD(&sev->mirror_vms);
  
 +      kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
 +
        return 0;
  
  e_free:
@@@ -467,7 -465,6 +467,7 @@@ static void sev_clflush_pages(struct pa
                page_virtual = kmap_atomic(pages[i]);
                clflush_cache_range(page_virtual, PAGE_SIZE);
                kunmap_atomic(page_virtual);
 +              cond_resched();
        }
  }
  
@@@ -562,12 -559,20 +562,20 @@@ e_unpin
  
  static int sev_es_sync_vmsa(struct vcpu_svm *svm)
  {
-       struct vmcb_save_area *save = &svm->vmcb->save;
+       struct sev_es_save_area *save = svm->sev_es.vmsa;
  
        /* Check some debug related fields before encrypting the VMSA */
-       if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1))
+       if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
                return -EINVAL;
  
+       /*
+        * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+        * the traditional VMSA that is part of the VMCB. Copy the
+        * traditional VMSA as it has been built so far (in prep
+        * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+        */
+       memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
        /* Sync registers */
        save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
        save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
        save->xss  = svm->vcpu.arch.ia32_xss;
        save->dr6  = svm->vcpu.arch.dr6;
  
-       /*
-        * SEV-ES will use a VMSA that is pointed to by the VMCB, not
-        * the traditional VMSA that is part of the VMCB. Copy the
-        * traditional VMSA as it has been built so far (in prep
-        * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
-        */
-       memcpy(svm->sev_es.vmsa, save, sizeof(*save));
        return 0;
  }
  
@@@ -1594,51 -1591,24 +1594,51 @@@ static void sev_unlock_two_vms(struct k
        atomic_set_release(&src_sev->migration_in_progress, 0);
  }
  
 +/* vCPU mutex subclasses.  */
 +enum sev_migration_role {
 +      SEV_MIGRATION_SOURCE = 0,
 +      SEV_MIGRATION_TARGET,
 +      SEV_NR_MIGRATION_ROLES,
 +};
  
 -static int sev_lock_vcpus_for_migration(struct kvm *kvm)
 +static int sev_lock_vcpus_for_migration(struct kvm *kvm,
 +                                      enum sev_migration_role role)
  {
        struct kvm_vcpu *vcpu;
        unsigned long i, j;
 +      bool first = true;
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
 -              if (mutex_lock_killable(&vcpu->mutex))
 +              if (mutex_lock_killable_nested(&vcpu->mutex, role))
                        goto out_unlock;
 +
 +              if (first) {
 +                      /*
 +                       * Reset the role to one that avoids colliding with
 +                       * the role used for the first vcpu mutex.
 +                       */
 +                      role = SEV_NR_MIGRATION_ROLES;
 +                      first = false;
 +              } else {
 +                      mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
 +              }
        }
  
        return 0;
  
  out_unlock:
 +
 +      first = true;
        kvm_for_each_vcpu(j, vcpu, kvm) {
                if (i == j)
                        break;
  
 +              if (first)
 +                      first = false;
 +              else
 +                      mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
 +
 +
                mutex_unlock(&vcpu->mutex);
        }
        return -EINTR;
@@@ -1648,15 -1618,8 +1648,15 @@@ static void sev_unlock_vcpus_for_migrat
  {
        struct kvm_vcpu *vcpu;
        unsigned long i;
 +      bool first = true;
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
 +              if (first)
 +                      first = false;
 +              else
 +                      mutex_acquire(&vcpu->mutex.dep_map,
 +                                    SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
 +
                mutex_unlock(&vcpu->mutex);
        }
  }
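
The first/role bookkeeping above is a lockdep workaround: every vCPU mutex shares one lock class, and lockdep can only track a bounded number of held locks. The pattern in miniature (illustrative, simplified from the code above):

    /* Take every vCPU mutex of one VM under a single lockdep subclass
     * ("role"), but keep only the first acquisition tracked so the
     * held-lock table is not exhausted on many-vCPU guests. */
    static int lock_all_vcpus(struct kvm *kvm, unsigned int role)
    {
            struct kvm_vcpu *vcpu;
            unsigned long i, j;

            kvm_for_each_vcpu(i, vcpu, kvm) {
                    if (mutex_lock_killable_nested(&vcpu->mutex, role))
                            goto out_unlock;
                    if (i)  /* only the first lock stays tracked */
                            mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
            }
            return 0;

    out_unlock:
            kvm_for_each_vcpu(j, vcpu, kvm) {
                    if (j == i)
                            break;
                    if (j)  /* re-acquire dep_map before unlocking */
                            mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
                    mutex_unlock(&vcpu->mutex);
            }
            return -EINTR;
    }
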
@@@ -1782,10 -1745,10 +1782,10 @@@ int sev_vm_move_enc_context_from(struc
                charged = true;
        }
  
 -      ret = sev_lock_vcpus_for_migration(kvm);
 +      ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
        if (ret)
                goto out_dst_cgroup;
 -      ret = sev_lock_vcpus_for_migration(source_kvm);
 +      ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
        if (ret)
                goto out_dst_vcpu;
  
@@@ -2260,47 -2223,51 +2260,47 @@@ int sev_cpu_init(struct svm_cpu_data *s
   * Pages used by hardware to hold guest encrypted state must be flushed before
   * returning them to the system.
   */
 -static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
 -                                 unsigned long len)
 +static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
  {
 +      int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
 +
        /*
 -       * If hardware enforced cache coherency for encrypted mappings of the
 -       * same physical page is supported, nothing to do.
 +       * Note!  The address must be a kernel address, as regular page walk
 +       * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
 +       * address is non-deterministic and unsafe.  This function deliberately
 +       * takes a pointer to deter passing in a user address.
         */
 -      if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
 -              return;
 +      unsigned long addr = (unsigned long)va;
  
        /*
 -       * If the VM Page Flush MSR is supported, use it to flush the page
 -       * (using the page virtual address and the guest ASID).
 +       * If CPU enforced cache coherency for encrypted mappings of the
 +       * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
 +       * flush is still needed in order to work properly with DMA devices.
         */
 -      if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
 -              struct kvm_sev_info *sev;
 -              unsigned long va_start;
 -              u64 start, stop;
 -
 -              /* Align start and stop to page boundaries. */
 -              va_start = (unsigned long)va;
 -              start = (u64)va_start & PAGE_MASK;
 -              stop = PAGE_ALIGN((u64)va_start + len);
 -
 -              if (start < stop) {
 -                      sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
 +      if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
 +              clflush_cache_range(va, PAGE_SIZE);
 +              return;
 +      }
  
 -                      while (start < stop) {
 -                              wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
 -                                     start | sev->asid);
 +      /*
 +       * VM Page Flush takes a host virtual address and a guest ASID.  Fall
 +       * back to WBINVD if this faults so as not to make any problems worse
 +       * by leaving stale encrypted data in the cache.
 +       */
 +      if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
 +              goto do_wbinvd;
  
 -                              start += PAGE_SIZE;
 -                      }
 +      return;
  
 -                      return;
 -              }
 +do_wbinvd:
 +      wbinvd_on_all_cpus();
 +}
  
 -              WARN(1, "Address overflow, using WBINVD\n");
 -      }
 +void sev_guest_memory_reclaimed(struct kvm *kvm)
 +{
 +      if (!sev_guest(kvm))
 +              return;
  
 -      /*
 -       * Hardware should always have one of the above features,
 -       * but if not, use WBINVD and issue a warning.
 -       */
 -      WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
        wbinvd_on_all_cpus();
  }
  
@@@ -2314,8 -2281,7 +2314,8 @@@ void sev_free_vcpu(struct kvm_vcpu *vcp
        svm = to_svm(vcpu);
  
        if (vcpu->arch.guest_state_protected)
 -              sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
 +              sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
 +
        __free_page(virt_to_page(svm->sev_es.vmsa));
  
        if (svm->sev_es.ghcb_sa_free)
@@@ -2966,7 -2932,7 +2966,7 @@@ void sev_es_vcpu_reset(struct vcpu_svm 
                                            sev_enc_bit));
  }
  
- void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
+ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
  {
        /*
         * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
diff --combined arch/x86/kvm/svm/svm.c
index 7e45d03cd018a5cc354936fcebc5b14d43c2cbcc,6ff595f74e3ab5e736e6f997ebf1b978c9021607..17d334ef54308229fe64141f3bff7562181af66f
@@@ -1270,8 -1270,8 +1270,8 @@@ static void svm_prepare_switch_to_guest
         */
        vmsave(__sme_page_pa(sd->save_area));
        if (sev_es_guest(vcpu->kvm)) {
-               struct vmcb_save_area *hostsa;
-               hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+               struct sev_es_save_area *hostsa;
+               hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
  
                sev_es_prepare_switch_to_guest(hostsa);
        }
@@@ -3117,8 -3117,8 +3117,8 @@@ static void dump_vmcb(struct kvm_vcpu *
               "tr:",
               save01->tr.selector, save01->tr.attrib,
               save01->tr.limit, save01->tr.base);
-       pr_err("cpl:            %d                efer:         %016llx\n",
-               save->cpl, save->efer);
+       pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
+              save->vmpl, save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cr0:", save->cr0, "cr2:", save->cr2);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@@ -4620,7 -4620,6 +4620,7 @@@ static struct kvm_x86_ops svm_x86_ops _
        .mem_enc_ioctl = sev_mem_enc_ioctl,
        .mem_enc_register_region = sev_mem_enc_register_region,
        .mem_enc_unregister_region = sev_mem_enc_unregister_region,
 +      .guest_memory_reclaimed = sev_guest_memory_reclaimed,
  
        .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
        .vm_move_enc_context_from = sev_vm_move_enc_context_from,
diff --combined arch/x86/kvm/svm/svm.h
index f76deff71002cbbd3403f43faf7a773bfb14280d,cc857deaee5e7d85b0889594693e29d5b287b42c..2d83845b9032fe858f60ba026d20fccc399f7526
@@@ -181,7 -181,7 +181,7 @@@ struct svm_nested_state 
  
  struct vcpu_sev_es_state {
        /* SEV-ES support */
-       struct vmcb_save_area *vmsa;
+       struct sev_es_save_area *vmsa;
        struct ghcb *ghcb;
        struct kvm_host_map ghcb_map;
        bool received_first_sipi;
@@@ -609,8 -609,6 +609,8 @@@ int sev_mem_enc_unregister_region(struc
                                  struct kvm_enc_region *range);
  int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
  int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
 +void sev_guest_memory_reclaimed(struct kvm *kvm);
 +
  void pre_sev_run(struct vcpu_svm *svm, int cpu);
  void __init sev_set_cpu_caps(void);
  void __init sev_hardware_setup(void);
@@@ -622,7 -620,7 +622,7 @@@ int sev_es_string_io(struct vcpu_svm *s
  void sev_es_init_vmcb(struct vcpu_svm *svm);
  void sev_es_vcpu_reset(struct vcpu_svm *svm);
  void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
- void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
+ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
  void sev_es_unmap_ghcb(struct vcpu_svm *svm);
  
  /* vmenter.S */
diff --combined drivers/virt/Kconfig
index c877da072d4dc44c9602edbac100bfcfbcdf24a7,0c1bba7c5c66b851d59e66c9e868de4c3f6e23c8..87ef258cec64839802079305a79bcd167f2c0df3
@@@ -48,6 -48,6 +48,8 @@@ source "drivers/virt/nitro_enclaves/Kco
  
  source "drivers/virt/acrn/Kconfig"
  
 +source "drivers/virt/coco/efi_secret/Kconfig"
 +
+ source "drivers/virt/coco/sev-guest/Kconfig"
  endif
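
The newly sourced Kconfig presumably declares the guest driver along these lines (a sketch; the real entry may select additional crypto dependencies for the guest-to-PSP channel):

    config SEV_GUEST
            tristate "AMD SEV Guest driver"
            default m
            depends on AMD_MEM_ENCRYPT
            help
              Allows an SEV-SNP guest to communicate with the PSP
              firmware, e.g. to request attestation reports or derive
              keys.
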
diff --combined drivers/virt/Makefile
index 067b5427f40fc93de8d7493e6c225722fa2c7e43,b2e6e864ebbe32cca48c518b0d3ee500e4e6407c..093674e05c40f2b5ba5fd9629e8c6a306d678cd1
@@@ -9,4 -9,4 +9,5 @@@ obj-y                            += vboxguest
  
  obj-$(CONFIG_NITRO_ENCLAVES)  += nitro_enclaves/
  obj-$(CONFIG_ACRN_HSM)                += acrn/
 +obj-$(CONFIG_EFI_SECRET)      += coco/efi_secret/
+ obj-$(CONFIG_SEV_GUEST)               += coco/sev-guest/
diff --combined include/linux/efi.h
index 0412304ce34ed9ebb28626468d2b73cd496a1957,984aa688997a4d7f2f3ac8f56d4341e435934d04..db424f3dc3f2fc3550ffceceef226439a0cd2c8c
@@@ -213,8 -213,6 +213,8 @@@ struct capsule_info 
        size_t                  page_bytes_remain;
  };
  
 +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
 +                           size_t hdr_bytes);
  int __efi_capsule_setup_info(struct capsule_info *cap_info);
  
  /*
@@@ -385,7 -383,6 +385,7 @@@ void efi_native_runtime_setup(void)
  #define EFI_LOAD_FILE_PROTOCOL_GUID           EFI_GUID(0x56ec3091, 0x954c, 0x11d2,  0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
  #define EFI_LOAD_FILE2_PROTOCOL_GUID          EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e,  0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d)
  #define EFI_RT_PROPERTIES_TABLE_GUID          EFI_GUID(0xeb66918a, 0x7eef, 0x402a,  0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9)
 +#define EFI_DXE_SERVICES_TABLE_GUID           EFI_GUID(0x05ad34ba, 0x6f02, 0x4214,  0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9)
  
  #define EFI_IMAGE_SECURITY_DATABASE_GUID      EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596,  0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
  #define EFI_SHIM_LOCK_GUID                    EFI_GUID(0x605dab50, 0xe046, 0x4300,  0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23)
  #define EFI_CERT_SHA256_GUID                  EFI_GUID(0xc1c41626, 0x504c, 0x4092, 0xac, 0xa9, 0x41, 0xf9, 0x36, 0x93, 0x43, 0x28)
  #define EFI_CERT_X509_GUID                    EFI_GUID(0xa5c059a1, 0x94e4, 0x4aa7, 0x87, 0xb5, 0xab, 0x15, 0x5c, 0x2b, 0xf0, 0x72)
  #define EFI_CERT_X509_SHA256_GUID             EFI_GUID(0x3bd2a492, 0x96c0, 0x4079, 0xb4, 0x20, 0xfc, 0xf9, 0x8e, 0xf1, 0x03, 0xed)
+ #define EFI_CC_BLOB_GUID                      EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
  
  /*
   * This GUID is used to pass to the kernel proper the struct screen_info
  #define LINUX_EFI_MEMRESERVE_TABLE_GUID               EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5,  0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2)
  #define LINUX_EFI_INITRD_MEDIA_GUID           EFI_GUID(0x5568e427, 0x68fc, 0x4f3d,  0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68)
  #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID     EFI_GUID(0xc451ed2b, 0x9694, 0x45d3,  0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89)
 +#define LINUX_EFI_COCO_SECRET_AREA_GUID               EFI_GUID(0xadf956ad, 0xe98c, 0x484c,  0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47)
 +
 +#define RISCV_EFI_BOOT_PROTOCOL_GUID          EFI_GUID(0xccd15fec, 0x6f73, 0x4eec,  0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf)
 +
 +/*
 + * This GUID may be installed onto the kernel image's handle as a NULL protocol
 + * to signal to the stub that the placement of the image should be respected,
 + * and moving the image in physical memory is undesirable. To ensure
 + * compatibility with 64k pages kernels with virtually mapped stacks, and to
 + * avoid defeating physical randomization, this protocol should only be
 + * installed if the image was placed at a randomized 128k aligned address in
 + * memory.
 + */
 +#define LINUX_EFI_LOADED_IMAGE_FIXED_GUID     EFI_GUID(0xf5a37b6d, 0x3344, 0x42a5,  0xb6, 0xbb, 0x97, 0x86, 0x48, 0xc1, 0x89, 0x0a)
  
  /* OEM GUIDs */
  #define DELLEMC_EFI_RCI2_TABLE_GUID           EFI_GUID(0x2d9f28a2, 0xa886, 0x456a,  0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55)
@@@ -452,7 -436,6 +453,7 @@@ typedef struct 
  } efi_config_table_type_t;
  
  #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL)
 +#define EFI_DXE_SERVICES_TABLE_SIGNATURE ((u64)0x565245535f455844ULL)
  
  #define EFI_2_30_SYSTEM_TABLE_REVISION  ((2 << 16) | (30))
  #define EFI_2_20_SYSTEM_TABLE_REVISION  ((2 << 16) | (20))
@@@ -614,7 -597,6 +615,7 @@@ extern struct efi 
        unsigned long                   tpm_log;                /* TPM2 Event Log table */
        unsigned long                   tpm_final_log;          /* TPM2 Final Events Log table */
        unsigned long                   mokvar_table;           /* MOK variable config table */
 +      unsigned long                   coco_secret;            /* Confidential computing secret table */
  
        efi_get_time_t                  *get_time;
        efi_set_time_t                  *set_time;
@@@ -1354,12 -1336,4 +1355,12 @@@ extern void efifb_setup_from_dmi(struc
  static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { }
  #endif
  
 +struct linux_efi_coco_secret_area {
 +      u64     base_pa;
 +      u64     size;
 +};
 +
 +/* Header of a populated EFI secret area */
 +#define EFI_SECRET_TABLE_HEADER_GUID  EFI_GUID(0x1e74f542, 0x71dd, 0x4d66,  0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b)
 +
  #endif /* _LINUX_EFI_H */
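
Tying the EFI additions together, a consumer of the new coco_secret table might look roughly like the following sketch (struct efi_secret_hdr is hypothetical; only base_pa/size and the header GUID check are implied by the definitions above):

    /* Sketch: map the confidential computing secret area advertised in
     * the EFI config tables and verify its header GUID.  Layout beyond
     * struct linux_efi_coco_secret_area is assumed for illustration. */
    struct efi_secret_hdr {
            efi_guid_t guid;
            u32 len;
    };

    static int __init peek_secret_area(void)
    {
            struct linux_efi_coco_secret_area *area;
            struct efi_secret_hdr *hdr;
            int ret = -EINVAL;

            if (!efi_enabled(EFI_CONFIG_TABLES) ||
                efi.coco_secret == EFI_INVALID_TABLE_ADDR)
                    return -ENODEV;

            area = memremap(efi.coco_secret, sizeof(*area), MEMREMAP_WB);
            if (!area)
                    return -ENOMEM;

            hdr = memremap(area->base_pa, sizeof(*hdr), MEMREMAP_WB);
            if (hdr && !efi_guidcmp(hdr->guid, EFI_SECRET_TABLE_HEADER_GUID)) {
                    pr_info("populated EFI secret area, %u bytes\n", hdr->len);
                    ret = 0;
            }

            if (hdr)
                    memunmap(hdr);
            memunmap(area);
            return ret;
    }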