Merge tag 'x86_sev_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author		Linus Torvalds <[email protected]>	Tue, 24 May 2022 00:38:01 +0000 (17:38 -0700)
committer	Linus Torvalds <[email protected]>	Tue, 24 May 2022 00:38:01 +0000 (17:38 -0700)
Pull AMD SEV-SNP support from Borislav Petkov:
 "The third AMD confidential computing feature called Secure Nested
  Paging.

  Add to confidential guests the necessary memory integrity protection
  against malicious hypervisor-based attacks like data replay, memory
  remapping and others, thus achieving stronger isolation from the
  hypervisor.

  At the core of the functionality is a new structure called a reverse
  map table (RMP), with which the guest has a say in which pages get
  assigned to it, and gets notified when a page it owns is accessed or
  modified under the covers, so that it can take appropriate action.

  In addition, add support for the whole machinery needed to launch an
  SNP guest, details of which are properly explained in each patch.

  And last but not least, the series refactors and improves parts of
  the previous SEV support so that the new code is accommodated
  properly and not just bolted on"

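At the instruction level, the guest drives RMP validation with the new PVALIDATE instruction. Below is a minimal sketch of the guest-side wrapper, modeled on pvalidate() in arch/x86/include/asm/sev.h (raw opcode bytes stand in for the mnemonic, which older assemblers lack); treat it as illustrative rather than the authoritative implementation:

    /* Validate or rescind a guest page in the RMP.  vaddr must be a
     * guest virtual address mapping the page, rmp_psize selects 4K vs
     * 2M, and validate toggles the Validated bit.  Carry flag set
     * means the RMP entry was already in the requested state. */
    static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
    {
            bool no_rmpupdate;
            int rc;

            /* "pvalidate" encoded by hand: F2 0F 01 FF */
            asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t"
                         CC_SET(c)
                         : CC_OUT(c) (no_rmpupdate), "=a"(rc)
                         : "a"(vaddr), "c"(rmp_psize), "d"(validate)
                         : "memory", "cc");

            if (no_rmpupdate)
                    return PVALIDATE_FAIL_NOUPDATE;

            return rc;
    }
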
* tag 'x86_sev_for_v5.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  x86/entry: Fixup objtool/ibt validation
  x86/sev: Mark the code returning to user space as syscall gap
  x86/sev: Annotate stack change in the #VC handler
  x86/sev: Remove duplicated assignment to variable info
  x86/sev: Fix address space sparse warning
  x86/sev: Get the AP jump table address from secrets page
  x86/sev: Add missing __init annotations to SEV init routines
  virt: sevguest: Rename the sevguest dir and files to sev-guest
  virt: sevguest: Change driver name to reflect generic SEV support
  x86/boot: Put globals that are accessed early into the .data section
  x86/boot: Add an efi.h header for the decompressor
  virt: sevguest: Fix bool function returning negative value
  virt: sevguest: Fix return value check in alloc_shared_pages()
  x86/sev-es: Replace open-coded hlt-loop with sev_es_terminate()
  virt: sevguest: Add documentation for SEV-SNP CPUID Enforcement
  virt: sevguest: Add support to get extended report
  virt: sevguest: Add support to derive key
  virt: Add SEV-SNP guest driver
  x86/sev: Register SEV-SNP guest request platform device
  x86/sev: Provide support for SNP guest request NAEs
  ...

Documentation/admin-guide/kernel-parameters.txt
arch/x86/entry/entry_64.S
arch/x86/include/asm/msr-index.h
arch/x86/kernel/cpu/common.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
drivers/virt/Kconfig
drivers/virt/Makefile
include/linux/efi.h

index 297c852c53f8c8b4e0014daa8ec53999aba05c3d,48ad2ec0770b7a8ec3b14cc756db7a056f8b225e..0c648a867dfa22907ba778269371c79ef128c4a4
                        when set.
                        Format: <int>
  
 -      libata.force=   [LIBATA] Force configurations.  The format is comma-
 -                      separated list of "[ID:]VAL" where ID is
 -                      PORT[.DEVICE].  PORT and DEVICE are decimal numbers
 -                      matching port, link or device.  Basically, it matches
 -                      the ATA ID string printed on console by libata.  If
 -                      the whole ID part is omitted, the last PORT and DEVICE
 -                      values are used.  If ID hasn't been specified yet, the
 -                      configuration applies to all ports, links and devices.
 +      libata.force=   [LIBATA] Force configurations.  The format is comma-
 +                      separated list of "[ID:]VAL" where ID is PORT[.DEVICE].
 +                      PORT and DEVICE are decimal numbers matching port, link
 +                      or device.  Basically, it matches the ATA ID string
 +                      printed on console by libata.  If the whole ID part is
 +                      omitted, the last PORT and DEVICE values are used.  If
 +                      ID hasn't been specified yet, the configuration applies
 +                      to all ports, links and devices.
  
                        If only DEVICE is omitted, the parameter applies to
                        the port and all links and devices behind it.  DEVICE
                        host link and device attached to it.
  
                        The VAL specifies the configuration to force.  As long
 -                      as there's no ambiguity shortcut notation is allowed.
 +                      as there is no ambiguity, shortcut notation is allowed.
                        For example, both 1.5 and 1.5G would work for 1.5Gbps.
                        The following configurations can be forced.
  
                          udma[/][16,25,33,44,66,100,133] notation is also
                          allowed.
  
 +                      * nohrst, nosrst, norst: suppress hard, soft and both
 +                        resets.
 +
 +                      * rstonce: only attempt one reset during hot-unplug
 +                        link recovery.
 +
 +                      * [no]dbdelay: Enable or disable the extra 200ms delay
 +                        before debouncing a link PHY and device presence
 +                        detection.
 +
                        * [no]ncq: Turn on or off NCQ.
  
 -                      * [no]ncqtrim: Turn off queued DSM TRIM.
 +                      * [no]ncqtrim: Enable or disable queued DSM TRIM.
 +
 +                      * [no]ncqati: Enable or disable NCQ trim on ATI chipset.
 +
 +                      * [no]trim: Enable or disable (unqueued) TRIM.
 +
 +                      * trim_zero: Indicate that TRIM command zeroes data.
 +
 +                      * max_trim_128m: Set 128M maximum trim size limit.
 +
 +                      * [no]dma: Turn on or off DMA transfers.
 +
 +                      * atapi_dmadir: Enable ATAPI DMADIR bridge support.
 +
 +                      * atapi_mod16_dma: Enable the use of ATAPI DMA for
 +                        commands that are not a multiple of 16 bytes.
  
 -                      * nohrst, nosrst, norst: suppress hard, soft
 -                        and both resets.
 +                      * [no]dmalog: Enable or disable the use of the
 +                        READ LOG DMA EXT command to access logs.
  
 -                      * rstonce: only attempt one reset during
 -                        hot-unplug link recovery
 +                      * [no]iddevlog: Enable or disable access to the
 +                        identify device data log.
  
 -                      * dump_id: dump IDENTIFY data.
 +                      * [no]logdir: Enable or disable access to the general
 +                        purpose log directory.
  
 -                      * atapi_dmadir: Enable ATAPI DMADIR bridge support
 +                      * max_sec_128: Set transfer size limit to 128 sectors.
 +
 +                      * max_sec_1024: Set or clear transfer size limit to
 +                        1024 sectors.
 +
 +                      * max_sec_lba48: Set or clear transfer size limit to
 +                        65535 sectors.
 +
 +                      * [no]lpm: Enable or disable link power management.
 +
 +                      * [no]setxfer: Indicate if transfer speed mode setting
 +                        should be skipped.
 +
 +                      * dump_id: Dump IDENTIFY data.
  
                        * disable: Disable this device.
  
  
        rcupdate.rcu_cpu_stall_timeout= [KNL]
                        Set timeout for RCU CPU stall warning messages.
 +                      The value is in seconds and the maximum allowed
 +                      value is 300 seconds.
 +
 +      rcupdate.rcu_exp_cpu_stall_timeout= [KNL]
 +                      Set timeout for expedited RCU CPU stall warning
 +                      messages.  The value is in milliseconds
 +                      and the maximum allowed value is 21000
 +                      milliseconds. Please note that this value is
 +                      adjusted to an arch timer tick resolution.
 +                      Setting this to zero causes the value from
 +                      rcupdate.rcu_cpu_stall_timeout to be used (after
 +                      conversion from seconds to milliseconds).
  
        rcupdate.rcu_expedited= [KNL]
                        Use expedited grace-period primitives, for
                        number avoids disturbing real-time workloads,
                        but lengthens grace periods.
  
 +      rcupdate.rcu_task_stall_info= [KNL]
 +                      Set initial timeout in jiffies for RCU task stall
 +                      informational messages, which give some indication
 +                      of the problem for those not patient enough to
 +                      wait for ten minutes.  Informational messages are
 +                      only printed prior to the stall-warning message
 +                      for a given grace period. Disable with a value
 +                      less than or equal to zero.  Defaults to ten
 +                      seconds.  A change in value does not take effect
 +                      until the beginning of the next grace period.
 +
 +      rcupdate.rcu_task_stall_info_mult= [KNL]
 +                      Multiplier for time interval between successive
 +                      RCU task stall informational messages for a given
 +                      RCU tasks grace period.  This value is clamped
 +                      to one through ten, inclusive.  It defaults to
 +                      the value three, so that the first informational
 +                      message is printed 10 seconds into the grace
 +                      period, the second at 40 seconds, the third at
 +                      160 seconds, and then the stall warning at 600
 +                      seconds would prevent a fourth at 640 seconds.
 +
        rcupdate.rcu_task_stall_timeout= [KNL]
 -                      Set timeout in jiffies for RCU task stall warning
 -                      messages.  Disable with a value less than or equal
 -                      to zero.
 +                      Set timeout in jiffies for RCU task stall
 +                      warning messages.  Disable with a value less
 +                      than or equal to zero.  Defaults to ten minutes.
 +                      A change in value does not take effect until
 +                      the beginning of the next grace period.
  
        rcupdate.rcu_self_test= [KNL]
                        Run the RCU early boot self tests
  
        serialnumber    [BUGS=X86-32]
  
+       sev=option[,option...] [X86-64] See Documentation/x86/x86_64/boot-options.rst
        shapers=        [NET]
                        Maximal number of shapers.
  
        smart2=         [HW]
                        Format: <io1>[,<io2>[,...,<io8>]]
  
 +      smp.csd_lock_timeout= [KNL]
 +                      Specify the period of time in milliseconds
 +                      that smp_call_function() and friends will wait
 +                      for a CPU to release the CSD lock.  This is
 +                      useful when diagnosing bugs involving CPUs
 +                      disabling interrupts for extended periods
 +                      of time.  Defaults to 5,000 milliseconds, and
 +                      setting a value of zero disables this feature.
 +                      This feature may be more efficiently disabled
 +                      using the csdlock_debug- kernel parameter.
 +
        smsc-ircc2.nopnp        [HW] Don't use PNP to discover SMC devices
        smsc-ircc2.ircc_cfg=    [HW] Device configuration I/O port
        smsc-ircc2.ircc_sir=    [HW] SIR base I/O port
                        off:    Disable mitigation and remove
                                performance impact to RDRAND and RDSEED
  
 +      srcutree.big_cpu_lim [KNL]
 +                      Specifies the number of CPUs constituting a
 +                      large system, such that srcu_struct structures
 +                      should immediately allocate an srcu_node array.
 +                      This kernel-boot parameter defaults to 128,
 +                      but takes effect only when the low-order four
 +                      bits of srcutree.convert_to_big is equal to 3
 +                      (decide at boot).
 +
 +      srcutree.convert_to_big [KNL]
 +                      Specifies under what conditions an SRCU tree
 +                      srcu_struct structure will be converted to big
 +                      form, that is, with an rcu_node tree:
 +
 +                                 0:  Never.
 +                                 1:  At init_srcu_struct() time.
 +                                 2:  When rcutorture decides to.
 +                                 3:  Decide at boot time (default).
 +                              0x1X:  Above plus if high contention.
 +
 +                      Either way, the srcu_node tree will be sized based
 +                      on the actual runtime number of CPUs (nr_cpu_ids)
 +                      instead of the compile-time CONFIG_NR_CPUS.
 +
        srcutree.counter_wrap_check [KNL]
                        Specifies how frequently to check for
                        grace-period sequence counter wrap for the
                        expediting.  Set to zero to disable automatic
                        expediting.
  
 +      srcutree.small_contention_lim [KNL]
 +                      Specifies the number of update-side contention
 +                      events per jiffy will be tolerated before
 +                      initiating a conversion of an srcu_struct
 +                      structure to big form.  Note that the value of
 +                      srcutree.convert_to_big must have the 0x10 bit
 +                      set for contention-based conversions to occur.
 +
        ssbd=           [ARM64,HW]
                        Speculative Store Bypass Disable control
  
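
For illustration, the new diagnostics knobs documented above could be combined on a boot command line as follows (values are examples only, chosen within the documented limits):

    rcupdate.rcu_cpu_stall_timeout=60 rcupdate.rcu_exp_cpu_stall_timeout=5000 \
    rcupdate.rcu_task_stall_info_mult=2 smp.csd_lock_timeout=10000
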
index 73d958522b6a47e285b9cf05c8562fdd58894141,58a2d764fa39b540f4a06525100e96efd3c96084..c8119dd975565c437cf1982da3ddca098cbc33ab
@@@ -215,8 -215,13 +215,13 @@@ syscall_return_via_sysret
  
        popq    %rdi
        popq    %rsp
+ SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+       ANNOTATE_NOENDBR
        swapgs
        sysretq
+ SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+       ANNOTATE_NOENDBR
+       int3
  SYM_CODE_END(entry_SYSCALL_64)
  
  /*
@@@ -337,9 -342,6 +342,9 @@@ SYM_CODE_END(ret_from_fork
  
        call    \cfunc
  
 +      /* For some configurations \cfunc ends up being a noreturn. */
 +      REACHABLE
 +
        jmp     error_return
  .endm
  
@@@ -508,6 -510,7 +513,7 @@@ SYM_CODE_START(\asmsym
        call    vc_switch_off_ist
        movq    %rax, %rsp              /* Switch to new stack */
  
+       ENCODE_FRAME_POINTER
        UNWIND_HINT_REGS
  
        /* Update pt_regs */
index ee15311b6be1d99e2bea11bd4c03a8a36fd8c706,ef96f166b1b6498af3e09b38e20d95383bf93cfe..5555b2f9af462e4e67da0a4b69b0df6fe6cd2f52
  #define TSX_CTRL_RTM_DISABLE          BIT(0)  /* Disable RTM feature */
  #define TSX_CTRL_CPUID_CLEAR          BIT(1)  /* Disable TSX enumeration */
  
 -/* SRBDS support */
  #define MSR_IA32_MCU_OPT_CTRL         0x00000123
 -#define RNGDS_MITG_DIS                        BIT(0)
 +#define RNGDS_MITG_DIS                        BIT(0)  /* SRBDS support */
 +#define RTM_ALLOW                     BIT(1)  /* TSX development mode */
  
  #define MSR_IA32_SYSENTER_CS          0x00000174
  #define MSR_IA32_SYSENTER_ESP         0x00000175
  #define MSR_AMD64_SEV                 0xc0010131
  #define MSR_AMD64_SEV_ENABLED_BIT     0
  #define MSR_AMD64_SEV_ES_ENABLED_BIT  1
+ #define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
  #define MSR_AMD64_SEV_ENABLED         BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
  #define MSR_AMD64_SEV_ES_ENABLED      BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+ #define MSR_AMD64_SEV_SNP_ENABLED     BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
  
  #define MSR_AMD64_VIRT_SPEC_CTRL      0xc001011f
  
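
The guest discovers which SEV flavor it is running under by reading this status MSR; a hedged sketch follows (the kernel's real detection also consults CPUID leaf 0x8000001f and, for SNP, the CC blob):

    /* Sketch: report the active SEV feature level from MSR 0xc0010131.
     * Illustrative only — assumes SEV is already known to be present. */
    u64 sev_status = __rdmsr(MSR_AMD64_SEV);

    if (sev_status & MSR_AMD64_SEV_SNP_ENABLED)
            pr_info("AMD SEV-SNP guest\n");
    else if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
            pr_info("AMD SEV-ES guest\n");
    else if (sev_status & MSR_AMD64_SEV_ENABLED)
            pr_info("AMD SEV guest\n");
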
index e342ae4db3c4de5e88456b72e727cc0def5e6978,9e45521338721f05cc3c8b92997e5ed64a966daf..f0baf1b7522eea9865be08c770cb122ba7866571
@@@ -60,6 -60,7 +60,7 @@@
  #include <asm/uv/uv.h>
  #include <asm/sigframe.h>
  #include <asm/traps.h>
+ #include <asm/sev.h>
  
  #include "cpu.h"
  
@@@ -1855,8 -1856,6 +1856,8 @@@ void identify_secondary_cpu(struct cpui
        validate_apic_and_package_id(c);
        x86_spec_ctrl_setup_ap();
        update_srbds_msr();
 +
 +      tsx_ap_init();
  }
  
  static __init int setup_noclflush(char *arg)
@@@ -2126,6 -2125,9 +2127,9 @@@ void cpu_init_exception_handling(void
  
        load_TR_desc();
  
+       /* GHCB needs to be setup to handle #VC. */
+       setup_ghcb();
        /* Finally load the IDT */
        load_current_idt();
  }
diff --combined arch/x86/kvm/cpuid.c
index 0c1ba6aa07651f4d2698b004c35c27aa1b406118,4b62d80bb22f5b1ea87c8fea4eac87cb038c99ea..de6d44e07e348fcdae81841f8dd516fdea9781bf
@@@ -19,6 -19,7 +19,7 @@@
  #include <asm/user.h>
  #include <asm/fpu/xstate.h>
  #include <asm/sgx.h>
+ #include <asm/cpuid.h>
  #include "cpuid.h"
  #include "lapic.h"
  #include "mmu.h"
@@@ -744,24 -745,8 +745,8 @@@ static struct kvm_cpuid_entry2 *do_host
        cpuid_count(entry->function, entry->index,
                    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
  
-       switch (function) {
-       case 4:
-       case 7:
-       case 0xb:
-       case 0xd:
-       case 0xf:
-       case 0x10:
-       case 0x12:
-       case 0x14:
-       case 0x17:
-       case 0x18:
-       case 0x1d:
-       case 0x1e:
-       case 0x1f:
-       case 0x8000001d:
+       if (cpuid_function_is_indexed(function))
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-               break;
-       }
  
        return entry;
  }
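
The replacement helper lives in the new <asm/cpuid.h>; judging from the switch it replaces, it presumably looks roughly like this:

    /* True for CPUID leaves whose output varies with the ECX index.
     * Derived from the leaf list removed above; illustrative sketch. */
    static __always_inline bool cpuid_function_is_indexed(u32 function)
    {
            switch (function) {
            case 4: case 7: case 0xb: case 0xd: case 0xf: case 0x10:
            case 0x12: case 0x14: case 0x17: case 0x18: case 0x1d:
            case 0x1e: case 0x1f: case 0x8000001d:
                    return true;
            }

            return false;
    }
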
@@@ -887,11 -872,6 +872,11 @@@ static inline int __do_cpuid_func(struc
                union cpuid10_eax eax;
                union cpuid10_edx edx;
  
 +              if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
 +                      entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 +                      break;
 +              }
 +
                perf_get_x86_pmu_capability(&cap);
  
                /*
        case 0x80000000:
                entry->eax = min(entry->eax, 0x80000021);
                /*
 -               * Serializing LFENCE is reported in a multitude of ways,
 -               * and NullSegClearsBase is not reported in CPUID on Zen2;
 -               * help userspace by providing the CPUID leaf ourselves.
 +               * Serializing LFENCE is reported in a multitude of ways, and
 +               * NullSegClearsBase is not reported in CPUID on Zen2; help
 +               * userspace by providing the CPUID leaf ourselves.
 +               *
 +               * However, only do it if the host has CPUID leaf 0x8000001d.
 +               * QEMU thinks that it can query the host blindly for that
 +               * CPUID leaf if KVM reports that it supports 0x8000001d or
 +               * above.  The processor merrily returns values from the
 +               * highest Intel leaf which QEMU tries to use as the guest's
 +               * 0x8000001d.  Even worse, this can result in an infinite
 +               * loop if said highest leaf has no subleaves indexed by ECX.
                 */
 -              if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
 -                  || !static_cpu_has_bug(X86_BUG_NULL_SEG))
 +              if (entry->eax >= 0x8000001d &&
 +                  (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
 +                   || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
                        entry->eax = max(entry->eax, 0x80000021);
                break;
        case 0x80000001:
diff --combined arch/x86/kvm/svm/sev.c
index 7c392873626fd6e1cbc17a4559e736201e6325d7,6e18ec1839f0a9eb222be8e601af7d7984ffbc09..636c77ef55fc3be26d9b6e2cd8b19f5cfecbc4cc
@@@ -260,8 -260,6 +260,8 @@@ static int sev_guest_init(struct kvm *k
        INIT_LIST_HEAD(&sev->regions_list);
        INIT_LIST_HEAD(&sev->mirror_vms);
  
 +      kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
 +
        return 0;
  
  e_free:
@@@ -467,7 -465,6 +467,7 @@@ static void sev_clflush_pages(struct pa
                page_virtual = kmap_atomic(pages[i]);
                clflush_cache_range(page_virtual, PAGE_SIZE);
                kunmap_atomic(page_virtual);
 +              cond_resched();
        }
  }
  
@@@ -562,12 -559,20 +562,20 @@@ e_unpin
  
  static int sev_es_sync_vmsa(struct vcpu_svm *svm)
  {
-       struct vmcb_save_area *save = &svm->vmcb->save;
+       struct sev_es_save_area *save = svm->sev_es.vmsa;
  
        /* Check some debug related fields before encrypting the VMSA */
-       if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1))
+       if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
                return -EINVAL;
  
+       /*
+        * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+        * the traditional VMSA that is part of the VMCB. Copy the
+        * traditional VMSA as it has been built so far (in prep
+        * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+        */
+       memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
        /* Sync registers */
        save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
        save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
        save->xss  = svm->vcpu.arch.ia32_xss;
        save->dr6  = svm->vcpu.arch.dr6;
  
-       /*
-        * SEV-ES will use a VMSA that is pointed to by the VMCB, not
-        * the traditional VMSA that is part of the VMCB. Copy the
-        * traditional VMSA as it has been built so far (in prep
-        * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
-        */
-       memcpy(svm->sev_es.vmsa, save, sizeof(*save));
        return 0;
  }
  
@@@ -1594,51 -1591,24 +1594,51 @@@ static void sev_unlock_two_vms(struct k
        atomic_set_release(&src_sev->migration_in_progress, 0);
  }
  
 +/* vCPU mutex subclasses.  */
 +enum sev_migration_role {
 +      SEV_MIGRATION_SOURCE = 0,
 +      SEV_MIGRATION_TARGET,
 +      SEV_NR_MIGRATION_ROLES,
 +};
  
 -static int sev_lock_vcpus_for_migration(struct kvm *kvm)
 +static int sev_lock_vcpus_for_migration(struct kvm *kvm,
 +                                      enum sev_migration_role role)
  {
        struct kvm_vcpu *vcpu;
        unsigned long i, j;
 +      bool first = true;
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
 -              if (mutex_lock_killable(&vcpu->mutex))
 +              if (mutex_lock_killable_nested(&vcpu->mutex, role))
                        goto out_unlock;
 +
 +              if (first) {
 +                      /*
 +                       * Reset the role to one that avoids colliding with
 +                       * the role used for the first vcpu mutex.
 +                       */
 +                      role = SEV_NR_MIGRATION_ROLES;
 +                      first = false;
 +              } else {
 +                      mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
 +              }
        }
  
        return 0;
  
  out_unlock:
 +
 +      first = true;
        kvm_for_each_vcpu(j, vcpu, kvm) {
                if (i == j)
                        break;
  
 +              if (first)
 +                      first = false;
 +              else
 +                      mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
 +
 +
                mutex_unlock(&vcpu->mutex);
        }
        return -EINTR;
@@@ -1648,15 -1618,8 +1648,15 @@@ static void sev_unlock_vcpus_for_migrat
  {
        struct kvm_vcpu *vcpu;
        unsigned long i;
 +      bool first = true;
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
 +              if (first)
 +                      first = false;
 +              else
 +                      mutex_acquire(&vcpu->mutex.dep_map,
 +                                    SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
 +
                mutex_unlock(&vcpu->mutex);
        }
  }
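
The first/role bookkeeping above is a lockdep workaround: every vCPU mutex shares one lock class, and lockdep can only track a bounded number of held locks. The pattern in miniature (illustrative, simplified from the code above):

    /* Take every vCPU mutex of one VM under a single lockdep subclass
     * ("role"), but keep only the first acquisition tracked so the
     * held-lock table is not exhausted on many-vCPU guests. */
    static int lock_all_vcpus(struct kvm *kvm, unsigned int role)
    {
            struct kvm_vcpu *vcpu;
            unsigned long i, j;

            kvm_for_each_vcpu(i, vcpu, kvm) {
                    if (mutex_lock_killable_nested(&vcpu->mutex, role))
                            goto out_unlock;
                    if (i)  /* only the first lock stays tracked */
                            mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
            }
            return 0;

    out_unlock:
            kvm_for_each_vcpu(j, vcpu, kvm) {
                    if (j == i)
                            break;
                    if (j)  /* re-acquire dep_map before unlocking */
                            mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
                    mutex_unlock(&vcpu->mutex);
            }
            return -EINTR;
    }
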
@@@ -1782,10 -1745,10 +1782,10 @@@ int sev_vm_move_enc_context_from(struc
                charged = true;
        }
  
 -      ret = sev_lock_vcpus_for_migration(kvm);
 +      ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
        if (ret)
                goto out_dst_cgroup;
 -      ret = sev_lock_vcpus_for_migration(source_kvm);
 +      ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
        if (ret)
                goto out_dst_vcpu;
  
@@@ -2260,47 -2223,51 +2260,47 @@@ int sev_cpu_init(struct svm_cpu_data *s
   * Pages used by hardware to hold guest encrypted state must be flushed before
   * returning them to the system.
   */
 -static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
 -                                 unsigned long len)
 +static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
  {
 +      int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
 +
        /*
 -       * If hardware enforced cache coherency for encrypted mappings of the
 -       * same physical page is supported, nothing to do.
 +       * Note!  The address must be a kernel address, as regular page walk
 +       * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
 +       * address is non-deterministic and unsafe.  This function deliberately
 +       * takes a pointer to deter passing in a user address.
         */
 -      if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
 -              return;
 +      unsigned long addr = (unsigned long)va;
  
        /*
 -       * If the VM Page Flush MSR is supported, use it to flush the page
 -       * (using the page virtual address and the guest ASID).
 +       * If CPU enforced cache coherency for encrypted mappings of the
 +       * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
 +       * flush is still needed in order to work properly with DMA devices.
         */
 -      if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
 -              struct kvm_sev_info *sev;
 -              unsigned long va_start;
 -              u64 start, stop;
 -
 -              /* Align start and stop to page boundaries. */
 -              va_start = (unsigned long)va;
 -              start = (u64)va_start & PAGE_MASK;
 -              stop = PAGE_ALIGN((u64)va_start + len);
 -
 -              if (start < stop) {
 -                      sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
 +      if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
 +              clflush_cache_range(va, PAGE_SIZE);
 +              return;
 +      }
  
 -                      while (start < stop) {
 -                              wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
 -                                     start | sev->asid);
 +      /*
 +       * VM Page Flush takes a host virtual address and a guest ASID.  Fall
 +       * back to WBINVD if this faults so as not to make any problems worse
 +       * by leaving stale encrypted data in the cache.
 +       */
 +      if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
 +              goto do_wbinvd;
  
 -                              start += PAGE_SIZE;
 -                      }
 +      return;
  
 -                      return;
 -              }
 +do_wbinvd:
 +      wbinvd_on_all_cpus();
 +}
  
 -              WARN(1, "Address overflow, using WBINVD\n");
 -      }
 +void sev_guest_memory_reclaimed(struct kvm *kvm)
 +{
 +      if (!sev_guest(kvm))
 +              return;
  
 -      /*
 -       * Hardware should always have one of the above features,
 -       * but if not, use WBINVD and issue a warning.
 -       */
 -      WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
        wbinvd_on_all_cpus();
  }
  
@@@ -2314,8 -2281,7 +2314,8 @@@ void sev_free_vcpu(struct kvm_vcpu *vcp
        svm = to_svm(vcpu);
  
        if (vcpu->arch.guest_state_protected)
 -              sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
 +              sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
 +
        __free_page(virt_to_page(svm->sev_es.vmsa));
  
        if (svm->sev_es.ghcb_sa_free)
@@@ -2966,7 -2932,7 +2966,7 @@@ void sev_es_vcpu_reset(struct vcpu_svm 
                                            sev_enc_bit));
  }
  
- void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
+ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
  {
        /*
         * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
diff --combined arch/x86/kvm/svm/svm.c
index 7e45d03cd018a5cc354936fcebc5b14d43c2cbcc,6ff595f74e3ab5e736e6f997ebf1b978c9021607..17d334ef54308229fe64141f3bff7562181af66f
@@@ -1270,8 -1270,8 +1270,8 @@@ static void svm_prepare_switch_to_guest
         */
        vmsave(__sme_page_pa(sd->save_area));
        if (sev_es_guest(vcpu->kvm)) {
-               struct vmcb_save_area *hostsa;
-               hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+               struct sev_es_save_area *hostsa;
+               hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
  
                sev_es_prepare_switch_to_guest(hostsa);
        }
@@@ -3117,8 -3117,8 +3117,8 @@@ static void dump_vmcb(struct kvm_vcpu *
               "tr:",
               save01->tr.selector, save01->tr.attrib,
               save01->tr.limit, save01->tr.base);
-       pr_err("cpl:            %d                efer:         %016llx\n",
-               save->cpl, save->efer);
+       pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
+              save->vmpl, save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cr0:", save->cr0, "cr2:", save->cr2);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@@ -4620,7 -4620,6 +4620,7 @@@ static struct kvm_x86_ops svm_x86_ops _
        .mem_enc_ioctl = sev_mem_enc_ioctl,
        .mem_enc_register_region = sev_mem_enc_register_region,
        .mem_enc_unregister_region = sev_mem_enc_unregister_region,
 +      .guest_memory_reclaimed = sev_guest_memory_reclaimed,
  
        .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
        .vm_move_enc_context_from = sev_vm_move_enc_context_from,
diff --combined arch/x86/kvm/svm/svm.h
index f76deff71002cbbd3403f43faf7a773bfb14280d,cc857deaee5e7d85b0889594693e29d5b287b42c..2d83845b9032fe858f60ba026d20fccc399f7526
@@@ -181,7 -181,7 +181,7 @@@ struct svm_nested_state 
  
  struct vcpu_sev_es_state {
        /* SEV-ES support */
-       struct vmcb_save_area *vmsa;
+       struct sev_es_save_area *vmsa;
        struct ghcb *ghcb;
        struct kvm_host_map ghcb_map;
        bool received_first_sipi;
@@@ -609,8 -609,6 +609,8 @@@ int sev_mem_enc_unregister_region(struc
                                  struct kvm_enc_region *range);
  int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
  int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
 +void sev_guest_memory_reclaimed(struct kvm *kvm);
 +
  void pre_sev_run(struct vcpu_svm *svm, int cpu);
  void __init sev_set_cpu_caps(void);
  void __init sev_hardware_setup(void);
@@@ -622,7 -620,7 +622,7 @@@ int sev_es_string_io(struct vcpu_svm *s
  void sev_es_init_vmcb(struct vcpu_svm *svm);
  void sev_es_vcpu_reset(struct vcpu_svm *svm);
  void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
- void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
+ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
  void sev_es_unmap_ghcb(struct vcpu_svm *svm);
  
  /* vmenter.S */
diff --combined drivers/virt/Kconfig
index c877da072d4dc44c9602edbac100bfcfbcdf24a7,0c1bba7c5c66b851d59e66c9e868de4c3f6e23c8..87ef258cec64839802079305a79bcd167f2c0df3
@@@ -48,6 -48,6 +48,8 @@@ source "drivers/virt/nitro_enclaves/Kco
  
  source "drivers/virt/acrn/Kconfig"
  
 +source "drivers/virt/coco/efi_secret/Kconfig"
 +
+ source "drivers/virt/coco/sev-guest/Kconfig"
  endif
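
The newly sourced Kconfig presumably declares the guest driver along these lines (a sketch; the real entry may select additional crypto dependencies for the guest-to-PSP channel):

    config SEV_GUEST
            tristate "AMD SEV Guest driver"
            default m
            depends on AMD_MEM_ENCRYPT
            help
              Allows an SEV-SNP guest to communicate with the PSP
              firmware, e.g. to request attestation reports or derive
              keys.
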
diff --combined drivers/virt/Makefile
index 067b5427f40fc93de8d7493e6c225722fa2c7e43,b2e6e864ebbe32cca48c518b0d3ee500e4e6407c..093674e05c40f2b5ba5fd9629e8c6a306d678cd1
@@@ -9,4 -9,4 +9,5 @@@ obj-y                            += vboxguest
  
  obj-$(CONFIG_NITRO_ENCLAVES)  += nitro_enclaves/
  obj-$(CONFIG_ACRN_HSM)                += acrn/
 +obj-$(CONFIG_EFI_SECRET)      += coco/efi_secret/
+ obj-$(CONFIG_SEV_GUEST)               += coco/sev-guest/
diff --combined include/linux/efi.h
index 0412304ce34ed9ebb28626468d2b73cd496a1957,984aa688997a4d7f2f3ac8f56d4341e435934d04..db424f3dc3f2fc3550ffceceef226439a0cd2c8c
@@@ -213,8 -213,6 +213,8 @@@ struct capsule_info 
        size_t                  page_bytes_remain;
  };
  
 +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
 +                           size_t hdr_bytes);
  int __efi_capsule_setup_info(struct capsule_info *cap_info);
  
  /*
@@@ -385,7 -383,6 +385,7 @@@ void efi_native_runtime_setup(void)
  #define EFI_LOAD_FILE_PROTOCOL_GUID           EFI_GUID(0x56ec3091, 0x954c, 0x11d2,  0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
  #define EFI_LOAD_FILE2_PROTOCOL_GUID          EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e,  0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d)
  #define EFI_RT_PROPERTIES_TABLE_GUID          EFI_GUID(0xeb66918a, 0x7eef, 0x402a,  0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9)
 +#define EFI_DXE_SERVICES_TABLE_GUID           EFI_GUID(0x05ad34ba, 0x6f02, 0x4214,  0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9)
  
  #define EFI_IMAGE_SECURITY_DATABASE_GUID      EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596,  0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f)
  #define EFI_SHIM_LOCK_GUID                    EFI_GUID(0x605dab50, 0xe046, 0x4300,  0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23)
  #define EFI_CERT_SHA256_GUID                  EFI_GUID(0xc1c41626, 0x504c, 0x4092, 0xac, 0xa9, 0x41, 0xf9, 0x36, 0x93, 0x43, 0x28)
  #define EFI_CERT_X509_GUID                    EFI_GUID(0xa5c059a1, 0x94e4, 0x4aa7, 0x87, 0xb5, 0xab, 0x15, 0x5c, 0x2b, 0xf0, 0x72)
  #define EFI_CERT_X509_SHA256_GUID             EFI_GUID(0x3bd2a492, 0x96c0, 0x4079, 0xb4, 0x20, 0xfc, 0xf9, 0x8e, 0xf1, 0x03, 0xed)
+ #define EFI_CC_BLOB_GUID                      EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
  
  /*
   * This GUID is used to pass to the kernel proper the struct screen_info
  #define LINUX_EFI_MEMRESERVE_TABLE_GUID               EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5,  0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2)
  #define LINUX_EFI_INITRD_MEDIA_GUID           EFI_GUID(0x5568e427, 0x68fc, 0x4f3d,  0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68)
  #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID     EFI_GUID(0xc451ed2b, 0x9694, 0x45d3,  0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89)
 +#define LINUX_EFI_COCO_SECRET_AREA_GUID               EFI_GUID(0xadf956ad, 0xe98c, 0x484c,  0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47)
 +
 +#define RISCV_EFI_BOOT_PROTOCOL_GUID          EFI_GUID(0xccd15fec, 0x6f73, 0x4eec,  0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf)
 +
 +/*
 + * This GUID may be installed onto the kernel image's handle as a NULL protocol
 + * to signal to the stub that the placement of the image should be respected,
 + * and moving the image in physical memory is undesirable. To ensure
 + * compatibility with 64k pages kernels with virtually mapped stacks, and to
 + * avoid defeating physical randomization, this protocol should only be
 + * installed if the image was placed at a randomized 128k aligned address in
 + * memory.
 + */
 +#define LINUX_EFI_LOADED_IMAGE_FIXED_GUID     EFI_GUID(0xf5a37b6d, 0x3344, 0x42a5,  0xb6, 0xbb, 0x97, 0x86, 0x48, 0xc1, 0x89, 0x0a)
  
  /* OEM GUIDs */
  #define DELLEMC_EFI_RCI2_TABLE_GUID           EFI_GUID(0x2d9f28a2, 0xa886, 0x456a,  0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55)
@@@ -452,7 -436,6 +453,7 @@@ typedef struct 
  } efi_config_table_type_t;
  
  #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL)
 +#define EFI_DXE_SERVICES_TABLE_SIGNATURE ((u64)0x565245535f455844ULL)
  
  #define EFI_2_30_SYSTEM_TABLE_REVISION  ((2 << 16) | (30))
  #define EFI_2_20_SYSTEM_TABLE_REVISION  ((2 << 16) | (20))
@@@ -614,7 -597,6 +615,7 @@@ extern struct efi 
        unsigned long                   tpm_log;                /* TPM2 Event Log table */
        unsigned long                   tpm_final_log;          /* TPM2 Final Events Log table */
        unsigned long                   mokvar_table;           /* MOK variable config table */
 +      unsigned long                   coco_secret;            /* Confidential computing secret table */
  
        efi_get_time_t                  *get_time;
        efi_set_time_t                  *set_time;
@@@ -1354,12 -1336,4 +1355,12 @@@ extern void efifb_setup_from_dmi(struc
  static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { }
  #endif
  
 +struct linux_efi_coco_secret_area {
 +      u64     base_pa;
 +      u64     size;
 +};
 +
 +/* Header of a populated EFI secret area */
 +#define EFI_SECRET_TABLE_HEADER_GUID  EFI_GUID(0x1e74f542, 0x71dd, 0x4d66,  0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b)
 +
  #endif /* _LINUX_EFI_H */
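
Tying the EFI additions together, a consumer of the new coco_secret table might look roughly like the following sketch (struct efi_secret_hdr is hypothetical; only base_pa/size and the header GUID check are implied by the definitions above):

    /* Sketch: map the confidential computing secret area advertised in
     * the EFI config tables and verify its header GUID.  Layout beyond
     * struct linux_efi_coco_secret_area is assumed for illustration. */
    struct efi_secret_hdr {
            efi_guid_t guid;
            u32 len;
    };

    static int __init peek_secret_area(void)
    {
            struct linux_efi_coco_secret_area *area;
            struct efi_secret_hdr *hdr;
            int ret = -EINVAL;

            if (!efi_enabled(EFI_CONFIG_TABLES) ||
                efi.coco_secret == EFI_INVALID_TABLE_ADDR)
                    return -ENODEV;

            area = memremap(efi.coco_secret, sizeof(*area), MEMREMAP_WB);
            if (!area)
                    return -ENOMEM;

            hdr = memremap(area->base_pa, sizeof(*hdr), MEMREMAP_WB);
            if (hdr && !efi_guidcmp(hdr->guid, EFI_SECRET_TABLE_HEADER_GUID)) {
                    pr_info("populated EFI secret area, %u bytes\n", hdr->len);
                    ret = 0;
            }

            if (hdr)
                    memunmap(hdr);
            memunmap(area);
            return ret;
    }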