arch/x86/kvm/vmx/nested.c
1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4 #include <linux/objtool.h>
5 #include <linux/percpu.h>
6
7 #include <asm/debugreg.h>
8 #include <asm/mmu_context.h>
9
10 #include "x86.h"
11 #include "cpuid.h"
12 #include "hyperv.h"
13 #include "mmu.h"
14 #include "nested.h"
15 #include "pmu.h"
16 #include "posted_intr.h"
17 #include "sgx.h"
18 #include "trace.h"
19 #include "vmx.h"
20 #include "smm.h"
21
22 static bool __read_mostly enable_shadow_vmcs = 1;
23 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
24
25 static bool __read_mostly nested_early_check = 0;
26 module_param(nested_early_check, bool, S_IRUGO);
27
28 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
29
30 /*
31  * Hyper-V requires all of these, so mark them as supported even though
32  * they are just treated the same as all-context.
33  */
34 #define VMX_VPID_EXTENT_SUPPORTED_MASK          \
35         (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
36         VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
37         VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
38         VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
39
40 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
41
42 enum {
43         VMX_VMREAD_BITMAP,
44         VMX_VMWRITE_BITMAP,
45         VMX_BITMAP_NR
46 };
47 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
48
49 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
50 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
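/*
 * A set bit in the VMREAD/VMWRITE bitmaps causes a VM-exit when L1 executes
 * VMREAD/VMWRITE on the corresponding field encoding; a cleared bit lets the
 * access go straight to the shadow VMCS.  init_vmcs_shadow_fields() starts
 * with every field intercepted and then clears the bits for the fields that
 * are actually shadowed.
 */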
51
52 struct shadow_vmcs_field {
53         u16     encoding;
54         u16     offset;
55 };
56 static struct shadow_vmcs_field shadow_read_only_fields[] = {
57 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
58 #include "vmcs_shadow_fields.h"
59 };
60 static int max_shadow_read_only_fields =
61         ARRAY_SIZE(shadow_read_only_fields);
62
63 static struct shadow_vmcs_field shadow_read_write_fields[] = {
64 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
65 #include "vmcs_shadow_fields.h"
66 };
67 static int max_shadow_read_write_fields =
68         ARRAY_SIZE(shadow_read_write_fields);
69
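/*
 * Build the VMREAD/VMWRITE bitmaps and compact the shadow-field tables,
 * dropping read/write fields that don't exist on the bare-metal CPU.  For
 * 64-bit VMCS fields, the odd encoding (field + 1) names the high 32 bits
 * of the field: on 64-bit kernels those entries are redundant and are
 * dropped, while on 32-bit kernels the vmcs12 offset is bumped by
 * sizeof(u32) so the entry maps to the high dword.
 */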
70 static void init_vmcs_shadow_fields(void)
71 {
72         int i, j;
73
74         memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
75         memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
76
77         for (i = j = 0; i < max_shadow_read_only_fields; i++) {
78                 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
79                 u16 field = entry.encoding;
80
81                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
82                     (i + 1 == max_shadow_read_only_fields ||
83                      shadow_read_only_fields[i + 1].encoding != field + 1))
84                         pr_err("Missing field from shadow_read_only_field %x\n",
85                                field + 1);
86
87                 clear_bit(field, vmx_vmread_bitmap);
88                 if (field & 1)
89 #ifdef CONFIG_X86_64
90                         continue;
91 #else
92                         entry.offset += sizeof(u32);
93 #endif
94                 shadow_read_only_fields[j++] = entry;
95         }
96         max_shadow_read_only_fields = j;
97
98         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
99                 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
100                 u16 field = entry.encoding;
101
102                 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
103                     (i + 1 == max_shadow_read_write_fields ||
104                      shadow_read_write_fields[i + 1].encoding != field + 1))
105                         pr_err("Missing field from shadow_read_write_field %x\n",
106                                field + 1);
107
108                 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
109                           field <= GUEST_TR_AR_BYTES,
110                           "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
111
112                 /*
113                  * PML and the preemption timer can be emulated, but the
114                  * processor cannot vmwrite to fields that don't exist
115                  * on bare metal.
116                  */
117                 switch (field) {
118                 case GUEST_PML_INDEX:
119                         if (!cpu_has_vmx_pml())
120                                 continue;
121                         break;
122                 case VMX_PREEMPTION_TIMER_VALUE:
123                         if (!cpu_has_vmx_preemption_timer())
124                                 continue;
125                         break;
126                 case GUEST_INTR_STATUS:
127                         if (!cpu_has_vmx_apicv())
128                                 continue;
129                         break;
130                 default:
131                         break;
132                 }
133
134                 clear_bit(field, vmx_vmwrite_bitmap);
135                 clear_bit(field, vmx_vmread_bitmap);
136                 if (field & 1)
137 #ifdef CONFIG_X86_64
138                         continue;
139 #else
140                         entry.offset += sizeof(u32);
141 #endif
142                 shadow_read_write_fields[j++] = entry;
143         }
144         max_shadow_read_write_fields = j;
145 }
146
147 /*
148  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
149  * set the success or error code of an emulated VMX instruction (as specified
150  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
151  * instruction.
152  */
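/*
 * Per the SDM's conventions: VMsucceed clears CF, PF, AF, ZF, SF and OF;
 * VMfailInvalid sets CF and clears the other five; VMfailValid sets ZF,
 * clears the other five and writes the error number to the current VMCS's
 * VM-instruction error field.
 */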
153 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
154 {
155         vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
156                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
157                             X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
158         return kvm_skip_emulated_instruction(vcpu);
159 }
160
161 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
162 {
163         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
164                         & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
165                             X86_EFLAGS_SF | X86_EFLAGS_OF))
166                         | X86_EFLAGS_CF);
167         return kvm_skip_emulated_instruction(vcpu);
168 }
169
170 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
171                                 u32 vm_instruction_error)
172 {
173         vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
174                         & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
175                             X86_EFLAGS_SF | X86_EFLAGS_OF))
176                         | X86_EFLAGS_ZF);
177         get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
178         /*
179          * We don't need to force a sync to the shadow VMCS because
180          * VM_INSTRUCTION_ERROR is not shadowed.  The Enlightened VMCS, however,
181          * 'shadows' all fields and thus must be synced.
182          */
183         if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
184                 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
185
186         return kvm_skip_emulated_instruction(vcpu);
187 }
188
189 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
190 {
191         struct vcpu_vmx *vmx = to_vmx(vcpu);
192
193         /*
194          * failValid writes the error number to the current VMCS, which
195          * can't be done if there isn't a current VMCS.
196          */
197         if (vmx->nested.current_vmptr == INVALID_GPA &&
198             !nested_vmx_is_evmptr12_valid(vmx))
199                 return nested_vmx_failInvalid(vcpu);
200
201         return nested_vmx_failValid(vcpu, vm_instruction_error);
202 }
203
204 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
205 {
206         /* TODO: don't simply reset the guest here. */
207         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
208         pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
209 }
210
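/*
 * VMX capability MSRs for controls report the allowed-0 settings (i.e. the
 * must-be-1 bits) in bits 31:0 and the allowed-1 settings in bits 63:32.
 * vmx_control_verify() checks a control word against that layout and
 * vmx_control_msr() reassembles it into a single 64-bit value.
 */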
211 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
212 {
213         return fixed_bits_valid(control, low, high);
214 }
215
216 static inline u64 vmx_control_msr(u32 low, u32 high)
217 {
218         return low | ((u64)high << 32);
219 }
220
221 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
222 {
223         secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
224         vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
225         vmx->nested.need_vmcs12_to_shadow_sync = false;
226 }
227
228 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
229 {
230 #ifdef CONFIG_KVM_HYPERV
231         struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
232         struct vcpu_vmx *vmx = to_vmx(vcpu);
233
234         kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
235         vmx->nested.hv_evmcs = NULL;
236         vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
237
238         if (hv_vcpu) {
239                 hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
240                 hv_vcpu->nested.vm_id = 0;
241                 hv_vcpu->nested.vp_id = 0;
242         }
243 #endif
244 }
245
246 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
247 {
248 #ifdef CONFIG_KVM_HYPERV
249         struct vcpu_vmx *vmx = to_vmx(vcpu);
250         /*
251          * When Enlightened VMEntry is enabled on the calling CPU, we treat the
252          * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
253          * way to distinguish it from VMCS12) and we must not corrupt it by
254          * writing to the non-existent 'launch_state' field. The area doesn't
255          * have to be the currently active EVMCS on the calling CPU and there's
256          * nothing KVM has to do to transition it from 'active' to 'non-active'
257          * state. It is possible that the area will stay mapped as
258          * vmx->nested.hv_evmcs but this shouldn't be a problem.
259          */
260         if (!guest_cpuid_has_evmcs(vcpu) ||
261             !evmptr_is_valid(nested_get_evmptr(vcpu)))
262                 return false;
263
264         if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
265                 nested_release_evmcs(vcpu);
266
267         return true;
268 #else
269         return false;
270 #endif
271 }
272
273 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
274                                      struct loaded_vmcs *prev)
275 {
276         struct vmcs_host_state *dest, *src;
277
278         if (unlikely(!vmx->guest_state_loaded))
279                 return;
280
281         src = &prev->host_state;
282         dest = &vmx->loaded_vmcs->host_state;
283
284         vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
285         dest->ldt_sel = src->ldt_sel;
286 #ifdef CONFIG_X86_64
287         dest->ds_sel = src->ds_sel;
288         dest->es_sel = src->es_sel;
289 #endif
290 }
291
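/*
 * Make @vmcs the vCPU's current VMCS: load it on this pCPU, carry over the
 * cached host state from the previously loaded VMCS, and reset the
 * lazy-load/dirty register bookkeeping so state is reloaded from the new
 * VMCS as needed.
 */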
292 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
293 {
294         struct vcpu_vmx *vmx = to_vmx(vcpu);
295         struct loaded_vmcs *prev;
296         int cpu;
297
298         if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
299                 return;
300
301         cpu = get_cpu();
302         prev = vmx->loaded_vmcs;
303         vmx->loaded_vmcs = vmcs;
304         vmx_vcpu_load_vmcs(vcpu, cpu, prev);
305         vmx_sync_vmcs_host_state(vmx, prev);
306         put_cpu();
307
308         vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
309
310         /*
311          * All lazily updated registers will be reloaded from VMCS12 on both
312          * vmentry and vmexit.
313          */
314         vcpu->arch.regs_dirty = 0;
315 }
316
317 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
318 {
319         struct vcpu_vmx *vmx = to_vmx(vcpu);
320
321         kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
322         kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
323         kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
324         vmx->nested.pi_desc = NULL;
325 }
326
327 /*
328  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
329  * just stops using VMX.
330  */
331 static void free_nested(struct kvm_vcpu *vcpu)
332 {
333         struct vcpu_vmx *vmx = to_vmx(vcpu);
334
335         if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
336                 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
337
338         if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
339                 return;
340
341         kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
342
343         vmx->nested.vmxon = false;
344         vmx->nested.smm.vmxon = false;
345         vmx->nested.vmxon_ptr = INVALID_GPA;
346         free_vpid(vmx->nested.vpid02);
347         vmx->nested.posted_intr_nv = -1;
348         vmx->nested.current_vmptr = INVALID_GPA;
349         if (enable_shadow_vmcs) {
350                 vmx_disable_shadow_vmcs(vmx);
351                 vmcs_clear(vmx->vmcs01.shadow_vmcs);
352                 free_vmcs(vmx->vmcs01.shadow_vmcs);
353                 vmx->vmcs01.shadow_vmcs = NULL;
354         }
355         kfree(vmx->nested.cached_vmcs12);
356         vmx->nested.cached_vmcs12 = NULL;
357         kfree(vmx->nested.cached_shadow_vmcs12);
358         vmx->nested.cached_shadow_vmcs12 = NULL;
359
360         nested_put_vmcs12_pages(vcpu);
361
362         kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
363
364         nested_release_evmcs(vcpu);
365
366         free_loaded_vmcs(&vmx->nested.vmcs02);
367 }
368
369 /*
370  * Ensure that the current vmcs of the logical processor is the
371  * vmcs01 of the vcpu before calling free_nested().
372  */
373 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
374 {
375         vcpu_load(vcpu);
376         vmx_leave_nested(vcpu);
377         vcpu_put(vcpu);
378 }
379
380 #define EPTP_PA_MASK   GENMASK_ULL(51, 12)
381
382 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
383 {
384         return VALID_PAGE(root_hpa) &&
385                ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
386 }
387
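/*
 * Invalidate @addr in any cached prev_roots whose EPTP matches @eptp on the
 * EPT page-table address (EP4TA); the TLB tags guest-physical mappings by
 * EP4TA rather than the full EPTP, so stale translations may exist in any
 * such root.
 */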
388 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
389                                        gpa_t addr)
390 {
391         unsigned long roots = 0;
392         uint i;
393         struct kvm_mmu_root_info *cached_root;
394
395         WARN_ON_ONCE(!mmu_is_nested(vcpu));
396
397         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
398                 cached_root = &vcpu->arch.mmu->prev_roots[i];
399
400                 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
401                                             eptp))
402                         roots |= KVM_MMU_ROOT_PREVIOUS(i);
403         }
404         if (roots)
405                 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
406 }
407
408 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
409                 struct x86_exception *fault)
410 {
411         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
412         struct vcpu_vmx *vmx = to_vmx(vcpu);
413         unsigned long exit_qualification;
414         u32 vm_exit_reason;
415
416         if (vmx->nested.pml_full) {
417                 vm_exit_reason = EXIT_REASON_PML_FULL;
418                 vmx->nested.pml_full = false;
419
420                 /*
421                  * It should be impossible to trigger a nested PML Full VM-Exit
422                  * for anything other than an EPT Violation from L2.  KVM *can*
423                  * trigger nEPT page fault injection in response to an EPT
424                  * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
425                  * tables also changed, but KVM should not treat EPT Misconfig
426                  * VM-Exits as writes.
427                  */
428                 WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
429
430                 /*
431                  * PML Full and EPT Violation VM-Exits both use bit 12 to report
432                  * "NMI unblocking due to IRET", i.e. the bit can be propagated
433                  * as-is from the original EXIT_QUALIFICATION.
434                  */
435                 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
436         } else {
437                 if (fault->error_code & PFERR_RSVD_MASK) {
438                         vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
439                         exit_qualification = 0;
440                 } else {
441                         exit_qualification = fault->exit_qualification;
442                         exit_qualification |= vmx_get_exit_qual(vcpu) &
443                                               (EPT_VIOLATION_GVA_IS_VALID |
444                                                EPT_VIOLATION_GVA_TRANSLATED);
445                         vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
446                 }
447
448                 /*
449                  * Although the caller (kvm_inject_emulated_page_fault) would
450                  * have already synced the faulting address in the shadow EPT
451                  * tables for the current EPTP12, we also need to sync it for
452                  * any other cached EPTP02s based on the same EP4TA, since the
453                  * TLB associates mappings to the EP4TA rather than the full EPTP.
454                  */
455                 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
456                                            fault->address);
457         }
458
459         nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
460         vmcs12->guest_physical_address = fault->address;
461 }
462
463 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
464 {
465         struct vcpu_vmx *vmx = to_vmx(vcpu);
466         bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
467         int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
468
469         kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
470                                 nested_ept_ad_enabled(vcpu),
471                                 nested_ept_get_eptp(vcpu));
472 }
473
474 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
475 {
476         WARN_ON(mmu_is_nested(vcpu));
477
478         vcpu->arch.mmu = &vcpu->arch.guest_mmu;
479         nested_ept_new_eptp(vcpu);
480         vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
481         vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
482         vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
483
484         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
485 }
486
487 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
488 {
489         vcpu->arch.mmu = &vcpu->arch.root_mmu;
490         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
491 }
492
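/*
 * Per the SDM, a #PF in L2 causes a VM-exit to L1 iff the result of
 * (error_code & PFEC_MASK) == PFEC_MATCH agrees with bit 14 (#PF) of the
 * exception bitmap: exit on a match if the bit is set, exit on a mismatch
 * if the bit is clear.  Hence the "inequality ^ bit" below.
 */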
493 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
494                                             u16 error_code)
495 {
496         bool inequality, bit;
497
498         bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
499         inequality =
500                 (error_code & vmcs12->page_fault_error_code_mask) !=
501                  vmcs12->page_fault_error_code_match;
502         return inequality ^ bit;
503 }
504
505 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
506                                            u32 error_code)
507 {
508         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
509
510         /*
511          * Drop bits 31:16 of the error code when performing the #PF mask+match
512          * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
513          * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
514          * error code.  Including the to-be-dropped bits in the check might
515          * result in an "impossible" or missed exit from L1's perspective.
516          */
517         if (vector == PF_VECTOR)
518                 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
519
520         return (vmcs12->exception_bitmap & (1u << vector));
521 }
522
523 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
524                                                struct vmcs12 *vmcs12)
525 {
526         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
527                 return 0;
528
529         if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
530             CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
531                 return -EINVAL;
532
533         return 0;
534 }
535
536 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
537                                                 struct vmcs12 *vmcs12)
538 {
539         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
540                 return 0;
541
542         if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
543                 return -EINVAL;
544
545         return 0;
546 }
547
548 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
549                                                 struct vmcs12 *vmcs12)
550 {
551         if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
552                 return 0;
553
554         if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
555                 return -EINVAL;
556
557         return 0;
558 }
559
560 /*
561  * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
562  * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
563  * only the "disable intercept" case needs to be handled.
564  */
565 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
566                                                         unsigned long *msr_bitmap_l0,
567                                                         u32 msr, int type)
568 {
569         if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
570                 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
571
572         if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
573                 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
574 }
575
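/*
 * The MSR bitmap is a single 4K page: bytes 0x000-0x3ff hold the read bitmap
 * for MSRs 0x00000000-0x00001fff and bytes 0x800-0xbff hold the write bitmap
 * for the same range.  Setting both words below re-enables the read and
 * write intercepts for the whole x2APIC range (0x800-0x8ff).
 */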
576 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
577 {
578         int msr;
579
580         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
581                 unsigned word = msr / BITS_PER_LONG;
582
583                 msr_bitmap[word] = ~0;
584                 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
585         }
586 }
587
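/*
 * The vmcs02 bitmap must intercept an MSR if either KVM (vmcs01's bitmap,
 * which reflects userspace MSR filters and dynamic pass-through) or L1
 * (msr_bitmap_l1) wants the access intercepted; clear the intercept only
 * when both allow direct access.
 */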
588 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)                                     \
589 static inline                                                                   \
590 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,                  \
591                                          unsigned long *msr_bitmap_l1,          \
592                                          unsigned long *msr_bitmap_l0, u32 msr) \
593 {                                                                               \
594         if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||            \
595             vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))                       \
596                 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);                    \
597         else                                                                    \
598                 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);                  \
599 }
600 BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
601 BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
602
603 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
604                                                     unsigned long *msr_bitmap_l1,
605                                                     unsigned long *msr_bitmap_l0,
606                                                     u32 msr, int types)
607 {
608         if (types & MSR_TYPE_R)
609                 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
610                                                   msr_bitmap_l0, msr);
611         if (types & MSR_TYPE_W)
612                 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
613                                                    msr_bitmap_l0, msr);
614 }
615
616 /*
617  * Merge L0's and L1's MSR bitmaps; return false to indicate that
618  * we do not use the hardware.
619  */
620 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
621                                                  struct vmcs12 *vmcs12)
622 {
623         struct vcpu_vmx *vmx = to_vmx(vcpu);
624         int msr;
625         unsigned long *msr_bitmap_l1;
626         unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
627         struct kvm_host_map map;
628
629         /* Nothing to do if the MSR bitmap is not in use.  */
630         if (!cpu_has_vmx_msr_bitmap() ||
631             !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
632                 return false;
633
634         /*
635          * MSR bitmap update can be skipped when:
636          * - MSR bitmap for L1 hasn't changed.
637          * - Nested hypervisor (L1) is attempting to launch the same L2 as
638          *   before.
639          * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
640          *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
641          */
642         if (!vmx->nested.force_msr_bitmap_recalc) {
643                 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
644
645                 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
646                     evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
647                         return true;
648         }
649
650         if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
651                 return false;
652
653         msr_bitmap_l1 = (unsigned long *)map.hva;
654
655         /*
656          * To keep the control flow simple, pay eight 8-byte writes (sixteen
657          * 4-byte writes on 32-bit systems) up front to enable intercepts for
658          * the x2APIC MSR range and selectively toggle those relevant to L2.
659          */
660         enable_x2apic_msr_intercepts(msr_bitmap_l0);
661
662         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
663                 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
664                         /*
665                          * L0 need not intercept reads for MSRs between 0x800
666                          * and 0x8ff; it just lets the processor take the value
667                          * from the virtual-APIC page; take those 256 bits
668                          * directly from the L1 bitmap.
669                          */
670                         for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
671                                 unsigned word = msr / BITS_PER_LONG;
672
673                                 msr_bitmap_l0[word] = msr_bitmap_l1[word];
674                         }
675                 }
676
677                 nested_vmx_disable_intercept_for_x2apic_msr(
678                         msr_bitmap_l1, msr_bitmap_l0,
679                         X2APIC_MSR(APIC_TASKPRI),
680                         MSR_TYPE_R | MSR_TYPE_W);
681
682                 if (nested_cpu_has_vid(vmcs12)) {
683                         nested_vmx_disable_intercept_for_x2apic_msr(
684                                 msr_bitmap_l1, msr_bitmap_l0,
685                                 X2APIC_MSR(APIC_EOI),
686                                 MSR_TYPE_W);
687                         nested_vmx_disable_intercept_for_x2apic_msr(
688                                 msr_bitmap_l1, msr_bitmap_l0,
689                                 X2APIC_MSR(APIC_SELF_IPI),
690                                 MSR_TYPE_W);
691                 }
692         }
693
694         /*
695          * Always check vmcs01's bitmap to honor userspace MSR filters and any
696          * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
697          */
698 #ifdef CONFIG_X86_64
699         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
700                                          MSR_FS_BASE, MSR_TYPE_RW);
701
702         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
703                                          MSR_GS_BASE, MSR_TYPE_RW);
704
705         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
706                                          MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
707 #endif
708         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
709                                          MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
710
711         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
712                                          MSR_IA32_PRED_CMD, MSR_TYPE_W);
713
714         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
715                                          MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
716
717         kvm_vcpu_unmap(vcpu, &map);
718
719         vmx->nested.force_msr_bitmap_recalc = false;
720
721         return true;
722 }
723
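/*
 * Copy the shadow vmcs12 referenced by vmcs12->vmcs_link_pointer between
 * guest memory and KVM's cache: nested_cache_shadow_vmcs12() reads it in,
 * nested_flush_cached_shadow_vmcs12() writes it back out.
 */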
724 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
725                                        struct vmcs12 *vmcs12)
726 {
727         struct vcpu_vmx *vmx = to_vmx(vcpu);
728         struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
729
730         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
731             vmcs12->vmcs_link_pointer == INVALID_GPA)
732                 return;
733
734         if (ghc->gpa != vmcs12->vmcs_link_pointer &&
735             kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
736                                       vmcs12->vmcs_link_pointer, VMCS12_SIZE))
737                 return;
738
739         kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
740                               VMCS12_SIZE);
741 }
742
743 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
744                                               struct vmcs12 *vmcs12)
745 {
746         struct vcpu_vmx *vmx = to_vmx(vcpu);
747         struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
748
749         if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
750             vmcs12->vmcs_link_pointer == INVALID_GPA)
751                 return;
752
753         if (ghc->gpa != vmcs12->vmcs_link_pointer &&
754             kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
755                                       vmcs12->vmcs_link_pointer, VMCS12_SIZE))
756                 return;
757
758         kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
759                                VMCS12_SIZE);
760 }
761
762 /*
763  * In nested virtualization, check if L1 has set
764  * VM_EXIT_ACK_INTR_ON_EXIT
765  */
766 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
767 {
768         return get_vmcs12(vcpu)->vm_exit_controls &
769                 VM_EXIT_ACK_INTR_ON_EXIT;
770 }
771
772 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
773                                           struct vmcs12 *vmcs12)
774 {
775         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
776             CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
777                 return -EINVAL;
778         else
779                 return 0;
780 }
781
782 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
783                                            struct vmcs12 *vmcs12)
784 {
785         if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
786             !nested_cpu_has_apic_reg_virt(vmcs12) &&
787             !nested_cpu_has_vid(vmcs12) &&
788             !nested_cpu_has_posted_intr(vmcs12))
789                 return 0;
790
791         /*
792          * If virtualize x2apic mode is enabled,
793          * virtualize apic access must be disabled.
794          */
795         if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
796                nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
797                 return -EINVAL;
798
799         /*
800          * If virtual interrupt delivery is enabled,
801          * we must exit on external interrupts.
802          */
803         if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
804                 return -EINVAL;
805
806         /*
807          * bits 15:8 should be zero in posted_intr_nv,
808          * the descriptor address has already been checked
809          * in nested_get_vmcs12_pages.
810          *
811          * bits 5:0 of posted_intr_desc_addr should be zero.
812          */
813         if (nested_cpu_has_posted_intr(vmcs12) &&
814            (CC(!nested_cpu_has_vid(vmcs12)) ||
815             CC(!nested_exit_intr_ack_set(vcpu)) ||
816             CC((vmcs12->posted_intr_nv & 0xff00)) ||
817             CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
818                 return -EINVAL;
819
820         /* tpr shadow is needed by all apicv features. */
821         if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
822                 return -EINVAL;
823
824         return 0;
825 }
826
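/*
 * Per the SDM, an MSR load/store list must be 16-byte aligned and, at
 * 16 bytes per vmx_msr_entry, the entire list must fit within the guest's
 * legal physical address space.
 */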
827 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
828                                        u32 count, u64 addr)
829 {
830         if (count == 0)
831                 return 0;
832
833         if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
834             !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
835                 return -EINVAL;
836
837         return 0;
838 }
839
840 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
841                                                      struct vmcs12 *vmcs12)
842 {
843         if (CC(nested_vmx_check_msr_switch(vcpu,
844                                            vmcs12->vm_exit_msr_load_count,
845                                            vmcs12->vm_exit_msr_load_addr)) ||
846             CC(nested_vmx_check_msr_switch(vcpu,
847                                            vmcs12->vm_exit_msr_store_count,
848                                            vmcs12->vm_exit_msr_store_addr)))
849                 return -EINVAL;
850
851         return 0;
852 }
853
854 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
855                                                       struct vmcs12 *vmcs12)
856 {
857         if (CC(nested_vmx_check_msr_switch(vcpu,
858                                            vmcs12->vm_entry_msr_load_count,
859                                            vmcs12->vm_entry_msr_load_addr)))
860                 return -EINVAL;
861
862         return 0;
863 }
864
865 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
866                                          struct vmcs12 *vmcs12)
867 {
868         if (!nested_cpu_has_pml(vmcs12))
869                 return 0;
870
871         if (CC(!nested_cpu_has_ept(vmcs12)) ||
872             CC(!page_address_valid(vcpu, vmcs12->pml_address)))
873                 return -EINVAL;
874
875         return 0;
876 }
877
878 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
879                                                         struct vmcs12 *vmcs12)
880 {
881         if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
882                !nested_cpu_has_ept(vmcs12)))
883                 return -EINVAL;
884         return 0;
885 }
886
887 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
888                                                          struct vmcs12 *vmcs12)
889 {
890         if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
891                !nested_cpu_has_ept(vmcs12)))
892                 return -EINVAL;
893         return 0;
894 }
895
896 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
897                                                  struct vmcs12 *vmcs12)
898 {
899         if (!nested_cpu_has_shadow_vmcs(vmcs12))
900                 return 0;
901
902         if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
903             CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
904                 return -EINVAL;
905
906         return 0;
907 }
908
909 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
910                                        struct vmx_msr_entry *e)
911 {
912         /* x2APIC MSR accesses are not allowed */
913         if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
914                 return -EINVAL;
915         if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
916             CC(e->index == MSR_IA32_UCODE_REV))
917                 return -EINVAL;
918         if (CC(e->reserved != 0))
919                 return -EINVAL;
920         return 0;
921 }
922
923 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
924                                      struct vmx_msr_entry *e)
925 {
926         if (CC(e->index == MSR_FS_BASE) ||
927             CC(e->index == MSR_GS_BASE) ||
928             CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
929             nested_vmx_msr_check_common(vcpu, e))
930                 return -EINVAL;
931         return 0;
932 }
933
934 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
935                                       struct vmx_msr_entry *e)
936 {
937         if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
938             nested_vmx_msr_check_common(vcpu, e))
939                 return -EINVAL;
940         return 0;
941 }
942
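/*
 * IA32_VMX_MISC bits 27:25 encode the recommended maximum number of MSR
 * list entries as 512 * (N + 1); use the same formula to size the emulated
 * MSR load/store lists.
 */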
943 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
944 {
945         struct vcpu_vmx *vmx = to_vmx(vcpu);
946         u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
947                                        vmx->nested.msrs.misc_high);
948
949         return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
950 }
951
952 /*
953  * Load the guest's/host's MSRs at nested entry/exit.
954  * Returns 0 on success, and the (1-based) index of the failing entry on failure.
955  *
956  * One of the failure modes for MSR load/store is when a list exceeds the
957  * virtual hardware's capacity. To maintain compatibility with hardware as much
958  * as possible, process all valid entries before failing rather than prechecking
959  * for a capacity violation.
960  */
961 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
962 {
963         u32 i;
964         struct vmx_msr_entry e;
965         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
966
967         for (i = 0; i < count; i++) {
968                 if (unlikely(i >= max_msr_list_size))
969                         goto fail;
970
971                 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
972                                         &e, sizeof(e))) {
973                         pr_debug_ratelimited(
974                                 "%s cannot read MSR entry (%u, 0x%08llx)\n",
975                                 __func__, i, gpa + i * sizeof(e));
976                         goto fail;
977                 }
978                 if (nested_vmx_load_msr_check(vcpu, &e)) {
979                         pr_debug_ratelimited(
980                                 "%s check failed (%u, 0x%x, 0x%x)\n",
981                                 __func__, i, e.index, e.reserved);
982                         goto fail;
983                 }
984                 if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
985                         pr_debug_ratelimited(
986                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
987                                 __func__, i, e.index, e.value);
988                         goto fail;
989                 }
990         }
991         return 0;
992 fail:
993         /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
994         return i + 1;
995 }
996
997 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
998                                             u32 msr_index,
999                                             u64 *data)
1000 {
1001         struct vcpu_vmx *vmx = to_vmx(vcpu);
1002
1003         /*
1004          * If the L0 hypervisor stored a more accurate value for the TSC that
1005          * does not include the time taken for emulation of the L2->L1
1006          * VM-exit in L0, use the more accurate value.
1007          */
1008         if (msr_index == MSR_IA32_TSC) {
1009                 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
1010                                                     MSR_IA32_TSC);
1011
1012                 if (i >= 0) {
1013                         u64 val = vmx->msr_autostore.guest.val[i].value;
1014
1015                         *data = kvm_read_l1_tsc(vcpu, val);
1016                         return true;
1017                 }
1018         }
1019
1020         if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
1021                 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
1022                         msr_index);
1023                 return false;
1024         }
1025         return true;
1026 }
1027
1028 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
1029                                      struct vmx_msr_entry *e)
1030 {
1031         if (kvm_vcpu_read_guest(vcpu,
1032                                 gpa + i * sizeof(*e),
1033                                 e, 2 * sizeof(u32))) {
1034                 pr_debug_ratelimited(
1035                         "%s cannot read MSR entry (%u, 0x%08llx)\n",
1036                         __func__, i, gpa + i * sizeof(*e));
1037                 return false;
1038         }
1039         if (nested_vmx_store_msr_check(vcpu, e)) {
1040                 pr_debug_ratelimited(
1041                         "%s check failed (%u, 0x%x, 0x%x)\n",
1042                         __func__, i, e->index, e->reserved);
1043                 return false;
1044         }
1045         return true;
1046 }
1047
1048 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1049 {
1050         u64 data;
1051         u32 i;
1052         struct vmx_msr_entry e;
1053         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1054
1055         for (i = 0; i < count; i++) {
1056                 if (unlikely(i >= max_msr_list_size))
1057                         return -EINVAL;
1058
1059                 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1060                         return -EINVAL;
1061
1062                 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1063                         return -EINVAL;
1064
1065                 if (kvm_vcpu_write_guest(vcpu,
1066                                          gpa + i * sizeof(e) +
1067                                              offsetof(struct vmx_msr_entry, value),
1068                                          &data, sizeof(data))) {
1069                         pr_debug_ratelimited(
1070                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1071                                 __func__, i, e.index, data);
1072                         return -EINVAL;
1073                 }
1074         }
1075         return 0;
1076 }
1077
1078 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1079 {
1080         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1081         u32 count = vmcs12->vm_exit_msr_store_count;
1082         u64 gpa = vmcs12->vm_exit_msr_store_addr;
1083         struct vmx_msr_entry e;
1084         u32 i;
1085
1086         for (i = 0; i < count; i++) {
1087                 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1088                         return false;
1089
1090                 if (e.index == msr_index)
1091                         return true;
1092         }
1093         return false;
1094 }
1095
1096 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1097                                            u32 msr_index)
1098 {
1099         struct vcpu_vmx *vmx = to_vmx(vcpu);
1100         struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1101         bool in_vmcs12_store_list;
1102         int msr_autostore_slot;
1103         bool in_autostore_list;
1104         int last;
1105
1106         msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1107         in_autostore_list = msr_autostore_slot >= 0;
1108         in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1109
1110         if (in_vmcs12_store_list && !in_autostore_list) {
1111                 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1112                         /*
1113                          * Emulated VMEntry does not fail here.  Instead a less
1114                          * accurate value will be returned by
1115                          * nested_vmx_get_vmexit_msr_value() by reading KVM's
1116                          * internal MSR state instead of reading the value from
1117                          * the vmcs02 VMExit MSR-store area.
1118                          */
1119                         pr_warn_ratelimited(
1120                                 "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
1121                                 msr_index);
1122                         return;
1123                 }
1124                 last = autostore->nr++;
1125                 autostore->val[last].index = msr_index;
1126         } else if (!in_vmcs12_store_list && in_autostore_list) {
1127                 last = --autostore->nr;
1128                 autostore->val[msr_autostore_slot] = autostore->val[last];
1129         }
1130 }
1131
1132 /*
1133  * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
1134  * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
1135  * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1136  * @entry_failure_code.
1137  */
1138 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1139                                bool nested_ept, bool reload_pdptrs,
1140                                enum vm_entry_failure_code *entry_failure_code)
1141 {
1142         if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
1143                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
1144                 return -EINVAL;
1145         }
1146
1147         /*
1148          * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1149          * must not be dereferenced.
1150          */
1151         if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1152             CC(!load_pdptrs(vcpu, cr3))) {
1153                 *entry_failure_code = ENTRY_FAIL_PDPTE;
1154                 return -EINVAL;
1155         }
1156
1157         vcpu->arch.cr3 = cr3;
1158         kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1159
1160         /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
1161         kvm_init_mmu(vcpu);
1162
1163         if (!nested_ept)
1164                 kvm_mmu_new_pgd(vcpu, cr3);
1165
1166         return 0;
1167 }
1168
1169 /*
1170  * Returns true if KVM is able to configure the CPU to tag TLB entries
1171  * populated by L2 differently than TLB entries populated
1172  * by L1.
1173  *
1174  * If L0 uses EPT, L1 and L2 run with different EPTP because
1175  * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1176  * are tagged with different EPTP.
1177  *
1178  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1179  * with different VPID (L1 entries are tagged with vmx->vpid
1180  * while L2 entries are tagged with vmx->nested.vpid02).
1181  */
1182 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1183 {
1184         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1185
1186         return enable_ept ||
1187                (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1188 }
1189
1190 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1191                                             struct vmcs12 *vmcs12,
1192                                             bool is_vmenter)
1193 {
1194         struct vcpu_vmx *vmx = to_vmx(vcpu);
1195
1196         /* Handle pending Hyper-V TLB flush requests */
1197         kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
1198
1199         /*
1200          * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
1201          * same VPID as the host, and so architecturally, linear and combined
1202          * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit.  KVM
1203          * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
1204          * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01.  This
1205          * is required if VPID is disabled in KVM, as a TLB flush (there are no
1206          * VPIDs) still occurs from L1's perspective, and KVM may need to
1207          * synchronize the MMU in response to the guest TLB flush.
1208          *
1209          * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1210          * EPT is a special snowflake, as guest-physical mappings aren't
1211          * flushed on VPID invalidations, including VM-Enter or VM-Exit with
1212          * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
1213          * entries on VM-Enter because L1 can't rely on VM-Enter to flush
1214          * those mappings.
1215          */
1216         if (!nested_cpu_has_vpid(vmcs12)) {
1217                 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1218                 return;
1219         }
1220
1221         /* L2 should never have a VPID if VPID is disabled. */
1222         WARN_ON(!enable_vpid);
1223
1224         /*
1225          * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
1226          * emulate a guest TLB flush as KVM does not track vpid12 history nor
1227          * is the VPID incorporated into the MMU context.  I.e. KVM must assume
1228          * that the new vpid12 has never been used and thus represents a new
1229          * guest ASID that cannot have entries in the TLB.
1230          */
1231         if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1232                 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1233                 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1234                 return;
1235         }
1236
1237         /*
1238          * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
1239          * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
1240          * KVM was unable to allocate a VPID for L2, flush the current context
1241          * as the effective ASID is common to both L1 and L2.
1242          */
1243         if (!nested_has_guest_tlb_tag(vcpu))
1244                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1245 }
1246
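/* Return true if, within @mask, every bit set in @subset is also set in @superset. */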
1247 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1248 {
1249         superset &= mask;
1250         subset &= mask;
1251
1252         return (superset | subset) == superset;
1253 }
1254
1255 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1256 {
1257         const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
1258                                  VMX_BASIC_INOUT |
1259                                  VMX_BASIC_TRUE_CTLS;
1260
1261         const u64 reserved_bits = GENMASK_ULL(63, 56) |
1262                                   GENMASK_ULL(47, 45) |
1263                                   BIT_ULL(31);
1264
1265         u64 vmx_basic = vmcs_config.nested.basic;
1266
1267         BUILD_BUG_ON(feature_bits & reserved_bits);
1268
1269         /*
1270          * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
1271          * inverted polarity), the incoming value must not set feature bits or
1272          * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
1273          * multi-bit values, are explicitly checked below.
1274          */
1275         if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
1276                 return -EINVAL;
1277
1278         /*
1279          * KVM does not emulate a version of VMX that constrains physical
1280          * addresses of VMX structures (e.g. VMCS) to 32-bits.
1281          */
1282         if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
1283                 return -EINVAL;
1284
1285         if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1286             vmx_basic_vmcs_revision_id(data))
1287                 return -EINVAL;
1288
1289         if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1290                 return -EINVAL;
1291
1292         vmx->nested.msrs.basic = data;
1293         return 0;
1294 }
1295
1296 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
1297                                 u32 **low, u32 **high)
1298 {
1299         switch (msr_index) {
1300         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1301                 *low = &msrs->pinbased_ctls_low;
1302                 *high = &msrs->pinbased_ctls_high;
1303                 break;
1304         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1305                 *low = &msrs->procbased_ctls_low;
1306                 *high = &msrs->procbased_ctls_high;
1307                 break;
1308         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1309                 *low = &msrs->exit_ctls_low;
1310                 *high = &msrs->exit_ctls_high;
1311                 break;
1312         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1313                 *low = &msrs->entry_ctls_low;
1314                 *high = &msrs->entry_ctls_high;
1315                 break;
1316         case MSR_IA32_VMX_PROCBASED_CTLS2:
1317                 *low = &msrs->secondary_ctls_low;
1318                 *high = &msrs->secondary_ctls_high;
1319                 break;
1320         default:
1321                 BUG();
1322         }
1323 }
1324
1325 static int
1326 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1327 {
1328         u32 *lowp, *highp;
1329         u64 supported;
1330
1331         vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
1332
1333         supported = vmx_control_msr(*lowp, *highp);
1334
1335         /* Check must-be-1 bits are still 1. */
1336         if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1337                 return -EINVAL;
1338
1339         /* Check must-be-0 bits are still 0. */
1340         if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1341                 return -EINVAL;
1342
1343         vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1344         *lowp = data;
1345         *highp = data >> 32;
1346         return 0;
1347 }
1348
1349 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1350 {
1351         const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
1352                                  VMX_MISC_ACTIVITY_HLT |
1353                                  VMX_MISC_ACTIVITY_SHUTDOWN |
1354                                  VMX_MISC_ACTIVITY_WAIT_SIPI |
1355                                  VMX_MISC_INTEL_PT |
1356                                  VMX_MISC_RDMSR_IN_SMM |
1357                                  VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
1358                                  VMX_MISC_VMXOFF_BLOCK_SMI |
1359                                  VMX_MISC_ZERO_LEN_INS;
1360
1361         const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
1362
1363         u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
1364                                        vmcs_config.nested.misc_high);
1365
1366         BUILD_BUG_ON(feature_bits & reserved_bits);
1367
1368         /*
1369          * The incoming value must not set feature bits or reserved bits that
1370          * aren't allowed/supported by KVM.  Fields, i.e. multi-bit values, are
1371          * explicitly checked below.
1372          */
1373         if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
1374                 return -EINVAL;
1375
1376         if ((vmx->nested.msrs.pinbased_ctls_high &
1377              PIN_BASED_VMX_PREEMPTION_TIMER) &&
1378             vmx_misc_preemption_timer_rate(data) !=
1379             vmx_misc_preemption_timer_rate(vmx_misc))
1380                 return -EINVAL;
1381
1382         if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1383                 return -EINVAL;
1384
1385         if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1386                 return -EINVAL;
1387
1388         if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1389                 return -EINVAL;
1390
1391         vmx->nested.msrs.misc_low = data;
1392         vmx->nested.msrs.misc_high = data >> 32;
1393
1394         return 0;
1395 }
1396
1397 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1398 {
1399         u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
1400                                                vmcs_config.nested.vpid_caps);
1401
1402         /* Every bit is either reserved or a feature bit. */
1403         if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1404                 return -EINVAL;
1405
1406         vmx->nested.msrs.ept_caps = data;
1407         vmx->nested.msrs.vpid_caps = data >> 32;
1408         return 0;
1409 }
1410
1411 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
1412 {
1413         switch (msr_index) {
1414         case MSR_IA32_VMX_CR0_FIXED0:
1415                 return &msrs->cr0_fixed0;
1416         case MSR_IA32_VMX_CR4_FIXED0:
1417                 return &msrs->cr4_fixed0;
1418         default:
1419                 BUG();
1420         }
1421 }
1422
1423 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1424 {
1425         const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
1426
1427         /*
1428          * Bits that are 1 here (i.e. bits that "must be 1" during VMX operation)
1429          * must also be 1 in the restored value.
1430          */
1431         if (!is_bitwise_subset(data, *msr, -1ULL))
1432                 return -EINVAL;
1433
1434         *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1435         return 0;
1436 }
1437
1438 /*
1439  * Called when userspace is restoring VMX MSRs.
1440  *
1441  * Returns 0 on success, non-0 otherwise.
1442  */
1443 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1444 {
1445         struct vcpu_vmx *vmx = to_vmx(vcpu);
1446
1447         /*
1448          * Don't allow changes to the VMX capability MSRs while the vCPU
1449          * is in VMX operation.
1450          */
1451         if (vmx->nested.vmxon)
1452                 return -EBUSY;
1453
1454         switch (msr_index) {
1455         case MSR_IA32_VMX_BASIC:
1456                 return vmx_restore_vmx_basic(vmx, data);
1457         case MSR_IA32_VMX_PINBASED_CTLS:
1458         case MSR_IA32_VMX_PROCBASED_CTLS:
1459         case MSR_IA32_VMX_EXIT_CTLS:
1460         case MSR_IA32_VMX_ENTRY_CTLS:
1461                 /*
1462                  * The "non-true" VMX capability MSRs are generated from the
1463                  * "true" MSRs, so we do not support restoring them directly.
1464                  *
1465                  * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1466                  * should restore the "true" MSRs with the must-be-1 bits
1467                  * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1468                  * DEFAULT SETTINGS".
1469                  */
1470                 return -EINVAL;
1471         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1472         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1473         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1474         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1475         case MSR_IA32_VMX_PROCBASED_CTLS2:
1476                 return vmx_restore_control_msr(vmx, msr_index, data);
1477         case MSR_IA32_VMX_MISC:
1478                 return vmx_restore_vmx_misc(vmx, data);
1479         case MSR_IA32_VMX_CR0_FIXED0:
1480         case MSR_IA32_VMX_CR4_FIXED0:
1481                 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1482         case MSR_IA32_VMX_CR0_FIXED1:
1483         case MSR_IA32_VMX_CR4_FIXED1:
1484                 /*
1485                  * These MSRs are generated based on the vCPU's CPUID, so we
1486                  * do not support restoring them directly.
1487                  */
1488                 return -EINVAL;
1489         case MSR_IA32_VMX_EPT_VPID_CAP:
1490                 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1491         case MSR_IA32_VMX_VMCS_ENUM:
1492                 vmx->nested.msrs.vmcs_enum = data;
1493                 return 0;
1494         case MSR_IA32_VMX_VMFUNC:
1495                 if (data & ~vmcs_config.nested.vmfunc_controls)
1496                         return -EINVAL;
1497                 vmx->nested.msrs.vmfunc_controls = data;
1498                 return 0;
1499         default:
1500                 /*
1501                  * The rest of the VMX capability MSRs do not support restore.
1502                  */
1503                 return -EINVAL;
1504         }
1505 }
1506
1507 /* Returns 0 on success, non-0 otherwise. */
1508 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1509 {
1510         switch (msr_index) {
1511         case MSR_IA32_VMX_BASIC:
1512                 *pdata = msrs->basic;
1513                 break;
1514         case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1515         case MSR_IA32_VMX_PINBASED_CTLS:
1516                 *pdata = vmx_control_msr(
1517                         msrs->pinbased_ctls_low,
1518                         msrs->pinbased_ctls_high);
1519                 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1520                         *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1521                 break;
1522         case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1523         case MSR_IA32_VMX_PROCBASED_CTLS:
1524                 *pdata = vmx_control_msr(
1525                         msrs->procbased_ctls_low,
1526                         msrs->procbased_ctls_high);
1527                 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1528                         *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1529                 break;
1530         case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1531         case MSR_IA32_VMX_EXIT_CTLS:
1532                 *pdata = vmx_control_msr(
1533                         msrs->exit_ctls_low,
1534                         msrs->exit_ctls_high);
1535                 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1536                         *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1537                 break;
1538         case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1539         case MSR_IA32_VMX_ENTRY_CTLS:
1540                 *pdata = vmx_control_msr(
1541                         msrs->entry_ctls_low,
1542                         msrs->entry_ctls_high);
1543                 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1544                         *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1545                 break;
1546         case MSR_IA32_VMX_MISC:
1547                 *pdata = vmx_control_msr(
1548                         msrs->misc_low,
1549                         msrs->misc_high);
1550                 break;
1551         case MSR_IA32_VMX_CR0_FIXED0:
1552                 *pdata = msrs->cr0_fixed0;
1553                 break;
1554         case MSR_IA32_VMX_CR0_FIXED1:
1555                 *pdata = msrs->cr0_fixed1;
1556                 break;
1557         case MSR_IA32_VMX_CR4_FIXED0:
1558                 *pdata = msrs->cr4_fixed0;
1559                 break;
1560         case MSR_IA32_VMX_CR4_FIXED1:
1561                 *pdata = msrs->cr4_fixed1;
1562                 break;
1563         case MSR_IA32_VMX_VMCS_ENUM:
1564                 *pdata = msrs->vmcs_enum;
1565                 break;
1566         case MSR_IA32_VMX_PROCBASED_CTLS2:
1567                 *pdata = vmx_control_msr(
1568                         msrs->secondary_ctls_low,
1569                         msrs->secondary_ctls_high);
1570                 break;
1571         case MSR_IA32_VMX_EPT_VPID_CAP:
1572                 *pdata = msrs->ept_caps |
1573                         ((u64)msrs->vpid_caps << 32);
1574                 break;
1575         case MSR_IA32_VMX_VMFUNC:
1576                 *pdata = msrs->vmfunc_controls;
1577                 break;
1578         default:
1579                 return 1;
1580         }
1581
1582         return 0;
1583 }
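
/*
 * A small sketch of how the "non-true" control MSR reads above differ from
 * the "true" variants: the low (must-be-1) half has the architectural
 * default1 class of bits ORed back in.  DEFAULT1_BITS and the MSR halves
 * below are made-up example values, not the kernel's
 * *_ALWAYSON_WITHOUT_TRUE_MSR definitions.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define DEFAULT1_BITS	0x00000016ULL	/* example "always-on without true MSR" bits */

static uint64_t control_msr(uint32_t low, uint32_t high)
{
	return low | ((uint64_t)high << 32);
}

int main(void)
{
	uint32_t low = 0x00000001, high = 0xfff9fffe;	/* example "true" MSR halves */
	uint64_t true_msr = control_msr(low, high);
	uint64_t non_true_msr = true_msr | DEFAULT1_BITS;

	printf("true:     0x%016llx\n", (unsigned long long)true_msr);
	printf("non-true: 0x%016llx\n", (unsigned long long)non_true_msr);
	return 0;
}
#endif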
1584
1585 /*
1586  * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1587  * been modified by the L1 guest.  Note, "writable" in this context means
1588  * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1589  * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1590  * VM-exit information fields (which are actually writable if the vCPU is
1591  * configured to support "VMWRITE to any supported field in the VMCS").
1592  */
1593 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1594 {
1595         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1596         struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1597         struct shadow_vmcs_field field;
1598         unsigned long val;
1599         int i;
1600
1601         if (WARN_ON(!shadow_vmcs))
1602                 return;
1603
1604         preempt_disable();
1605
1606         vmcs_load(shadow_vmcs);
1607
1608         for (i = 0; i < max_shadow_read_write_fields; i++) {
1609                 field = shadow_read_write_fields[i];
1610                 val = __vmcs_readl(field.encoding);
1611                 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1612         }
1613
1614         vmcs_clear(shadow_vmcs);
1615         vmcs_load(vmx->loaded_vmcs->vmcs);
1616
1617         preempt_enable();
1618 }
1619
1620 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1621 {
1622         const struct shadow_vmcs_field *fields[] = {
1623                 shadow_read_write_fields,
1624                 shadow_read_only_fields
1625         };
1626         const int max_fields[] = {
1627                 max_shadow_read_write_fields,
1628                 max_shadow_read_only_fields
1629         };
1630         struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1631         struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1632         struct shadow_vmcs_field field;
1633         unsigned long val;
1634         int i, q;
1635
1636         if (WARN_ON(!shadow_vmcs))
1637                 return;
1638
1639         vmcs_load(shadow_vmcs);
1640
1641         for (q = 0; q < ARRAY_SIZE(fields); q++) {
1642                 for (i = 0; i < max_fields[q]; i++) {
1643                         field = fields[q][i];
1644                         val = vmcs12_read_any(vmcs12, field.encoding,
1645                                               field.offset);
1646                         __vmcs_writel(field.encoding, val);
1647                 }
1648         }
1649
1650         vmcs_clear(shadow_vmcs);
1651         vmcs_load(vmx->loaded_vmcs->vmcs);
1652 }
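
/*
 * The shadow-field tables used by the two copy routines above pair a VMCS
 * field encoding with a byte offset into struct vmcs12, so the loops can
 * move data without a per-field switch.  A tiny sketch of that offset-based
 * access; the struct, table and fixed 32-bit read below are simplifications
 * invented for illustration (real code selects the access width per field).
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_vmcs12 {
	uint64_t guest_rip;
	uint32_t guest_interruptibility_info;
};

struct toy_field {
	uint16_t encoding;
	uint16_t offset;
};

static const struct toy_field fields[] = {
	{ 0x681e, offsetof(struct toy_vmcs12, guest_rip) },
	{ 0x4824, offsetof(struct toy_vmcs12, guest_interruptibility_info) },
};

int main(void)
{
	struct toy_vmcs12 vmcs12 = { .guest_rip = 0xfff0, .guest_interruptibility_info = 0x8 };
	size_t i;

	for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
		uint32_t val;

		/* Assumes a little-endian host and <= 32-bit values for brevity. */
		memcpy(&val, (char *)&vmcs12 + fields[i].offset, sizeof(val));
		printf("field 0x%04x -> 0x%x\n", fields[i].encoding, val);
	}
	return 0;
}
#endif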
1653
1654 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1655 {
1656 #ifdef CONFIG_KVM_HYPERV
1657         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1658         struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1659         struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
1660
1661         /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1662         vmcs12->tpr_threshold = evmcs->tpr_threshold;
1663         vmcs12->guest_rip = evmcs->guest_rip;
1664
1665         if (unlikely(!(hv_clean_fields &
1666                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
1667                 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
1668                 hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
1669                 hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
1670         }
1671
1672         if (unlikely(!(hv_clean_fields &
1673                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1674                 vmcs12->guest_rsp = evmcs->guest_rsp;
1675                 vmcs12->guest_rflags = evmcs->guest_rflags;
1676                 vmcs12->guest_interruptibility_info =
1677                         evmcs->guest_interruptibility_info;
1678                 /*
1679                  * Not present in struct vmcs12:
1680                  * vmcs12->guest_ssp = evmcs->guest_ssp;
1681                  */
1682         }
1683
1684         if (unlikely(!(hv_clean_fields &
1685                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1686                 vmcs12->cpu_based_vm_exec_control =
1687                         evmcs->cpu_based_vm_exec_control;
1688         }
1689
1690         if (unlikely(!(hv_clean_fields &
1691                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1692                 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1693         }
1694
1695         if (unlikely(!(hv_clean_fields &
1696                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1697                 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1698         }
1699
1700         if (unlikely(!(hv_clean_fields &
1701                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1702                 vmcs12->vm_entry_intr_info_field =
1703                         evmcs->vm_entry_intr_info_field;
1704                 vmcs12->vm_entry_exception_error_code =
1705                         evmcs->vm_entry_exception_error_code;
1706                 vmcs12->vm_entry_instruction_len =
1707                         evmcs->vm_entry_instruction_len;
1708         }
1709
1710         if (unlikely(!(hv_clean_fields &
1711                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1712                 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1713                 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1714                 vmcs12->host_cr0 = evmcs->host_cr0;
1715                 vmcs12->host_cr3 = evmcs->host_cr3;
1716                 vmcs12->host_cr4 = evmcs->host_cr4;
1717                 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1718                 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1719                 vmcs12->host_rip = evmcs->host_rip;
1720                 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1721                 vmcs12->host_es_selector = evmcs->host_es_selector;
1722                 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1723                 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1724                 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1725                 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1726                 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1727                 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1728                 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
1729                 /*
1730                  * Not present in struct vmcs12:
1731                  * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
1732                  * vmcs12->host_ssp = evmcs->host_ssp;
1733                  * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
1734                  */
1735         }
1736
1737         if (unlikely(!(hv_clean_fields &
1738                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1739                 vmcs12->pin_based_vm_exec_control =
1740                         evmcs->pin_based_vm_exec_control;
1741                 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1742                 vmcs12->secondary_vm_exec_control =
1743                         evmcs->secondary_vm_exec_control;
1744         }
1745
1746         if (unlikely(!(hv_clean_fields &
1747                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1748                 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1749                 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1750         }
1751
1752         if (unlikely(!(hv_clean_fields &
1753                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1754                 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1755         }
1756
1757         if (unlikely(!(hv_clean_fields &
1758                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1759                 vmcs12->guest_es_base = evmcs->guest_es_base;
1760                 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1761                 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1762                 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1763                 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1764                 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1765                 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1766                 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1767                 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1768                 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1769                 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1770                 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1771                 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1772                 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1773                 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1774                 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1775                 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1776                 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1777                 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1778                 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1779                 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1780                 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1781                 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1782                 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1783                 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1784                 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1785                 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1786                 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1787                 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1788                 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1789                 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1790                 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1791                 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1792                 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1793                 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1794                 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1795         }
1796
1797         if (unlikely(!(hv_clean_fields &
1798                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1799                 vmcs12->tsc_offset = evmcs->tsc_offset;
1800                 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1801                 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1802                 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
1803                 vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
1804         }
1805
1806         if (unlikely(!(hv_clean_fields &
1807                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1808                 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1809                 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1810                 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1811                 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1812                 vmcs12->guest_cr0 = evmcs->guest_cr0;
1813                 vmcs12->guest_cr3 = evmcs->guest_cr3;
1814                 vmcs12->guest_cr4 = evmcs->guest_cr4;
1815                 vmcs12->guest_dr7 = evmcs->guest_dr7;
1816         }
1817
1818         if (unlikely(!(hv_clean_fields &
1819                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1820                 vmcs12->host_fs_base = evmcs->host_fs_base;
1821                 vmcs12->host_gs_base = evmcs->host_gs_base;
1822                 vmcs12->host_tr_base = evmcs->host_tr_base;
1823                 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1824                 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1825                 vmcs12->host_rsp = evmcs->host_rsp;
1826         }
1827
1828         if (unlikely(!(hv_clean_fields &
1829                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1830                 vmcs12->ept_pointer = evmcs->ept_pointer;
1831                 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1832         }
1833
1834         if (unlikely(!(hv_clean_fields &
1835                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1836                 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1837                 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1838                 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1839                 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1840                 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1841                 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1842                 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1843                 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1844                 vmcs12->guest_pending_dbg_exceptions =
1845                         evmcs->guest_pending_dbg_exceptions;
1846                 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1847                 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1848                 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1849                 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1850                 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1851                 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
1852                 /*
1853                  * Not present in struct vmcs12:
1854                  * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
1855                  * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
1856                  * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
1857                  */
1858         }
1859
1860         /*
1861          * Not used?
1862          * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1863          * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1864          * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1865          * vmcs12->page_fault_error_code_mask =
1866          *              evmcs->page_fault_error_code_mask;
1867          * vmcs12->page_fault_error_code_match =
1868          *              evmcs->page_fault_error_code_match;
1869          * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1870          * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1871          * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1872          * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1873          */
1874
1875         /*
1876          * Read only fields:
1877          * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1878          * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1879          * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1880          * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1881          * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1882          * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1883          * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1884          * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1885          * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1886          * vmcs12->exit_qualification = evmcs->exit_qualification;
1887          * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1888          *
1889          * Not present in struct vmcs12:
1890          * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1891          * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1892          * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1893          * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1894          */
1895
1896         return;
1897 #else /* CONFIG_KVM_HYPERV */
1898         KVM_BUG_ON(1, vmx->vcpu.kvm);
1899 #endif /* CONFIG_KVM_HYPERV */
1900 }
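
/*
 * The function above copies a group of eVMCS fields only when the
 * corresponding "clean" bit is NOT set, i.e. when the guest has marked the
 * group as potentially dirty.  A minimal sketch of that gating; the bit
 * values stand in for the HV_VMX_ENLIGHTENED_CLEAN_FIELD_* constants and
 * are not the real ones.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define CLEAN_CONTROL_PROC	(1u << 0)	/* placeholder bit values */
#define CLEAN_GUEST_BASIC	(1u << 1)

int main(void)
{
	/* The guest claims the processor-based controls are unchanged. */
	uint32_t clean_fields = CLEAN_CONTROL_PROC;

	if (!(clean_fields & CLEAN_CONTROL_PROC))
		printf("copy cpu_based_vm_exec_control\n");		/* skipped: clean */
	if (!(clean_fields & CLEAN_GUEST_BASIC))
		printf("copy guest rsp/rflags/interruptibility\n");	/* copied: dirty */
	return 0;
}
#endif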
1901
1902 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1903 {
1904 #ifdef CONFIG_KVM_HYPERV
1905         struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1906         struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
1907
1908         /*
1909          * Should not be changed by KVM:
1910          *
1911          * evmcs->host_es_selector = vmcs12->host_es_selector;
1912          * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1913          * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1914          * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1915          * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1916          * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1917          * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1918          * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1919          * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1920          * evmcs->host_cr0 = vmcs12->host_cr0;
1921          * evmcs->host_cr3 = vmcs12->host_cr3;
1922          * evmcs->host_cr4 = vmcs12->host_cr4;
1923          * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1924          * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1925          * evmcs->host_rip = vmcs12->host_rip;
1926          * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1927          * evmcs->host_fs_base = vmcs12->host_fs_base;
1928          * evmcs->host_gs_base = vmcs12->host_gs_base;
1929          * evmcs->host_tr_base = vmcs12->host_tr_base;
1930          * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1931          * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1932          * evmcs->host_rsp = vmcs12->host_rsp;
1933          * sync_vmcs02_to_vmcs12() doesn't read these:
1934          * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1935          * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1936          * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1937          * evmcs->ept_pointer = vmcs12->ept_pointer;
1938          * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1939          * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1940          * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1941          * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1942          * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1943          * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1944          * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1945          * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1946          * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1947          * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1948          * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1949          * evmcs->page_fault_error_code_mask =
1950          *              vmcs12->page_fault_error_code_mask;
1951          * evmcs->page_fault_error_code_match =
1952          *              vmcs12->page_fault_error_code_match;
1953          * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1954          * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1955          * evmcs->tsc_offset = vmcs12->tsc_offset;
1956          * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1957          * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1958          * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1959          * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1960          * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1961          * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1962          * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1963          * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1964          * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
1965          * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
1966          * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
1967          * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
1968          *
1969          * Not present in struct vmcs12:
1970          * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1971          * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1972          * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1973          * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1974          * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
1975          * evmcs->host_ssp = vmcs12->host_ssp;
1976          * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
1977          * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
1978          * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
1979          * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
1980          * evmcs->guest_ssp = vmcs12->guest_ssp;
1981          */
1982
1983         evmcs->guest_es_selector = vmcs12->guest_es_selector;
1984         evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1985         evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1986         evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1987         evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1988         evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1989         evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1990         evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1991
1992         evmcs->guest_es_limit = vmcs12->guest_es_limit;
1993         evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1994         evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1995         evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1996         evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1997         evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1998         evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1999         evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
2000         evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
2001         evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
2002
2003         evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
2004         evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
2005         evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
2006         evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
2007         evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
2008         evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
2009         evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
2010         evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
2011
2012         evmcs->guest_es_base = vmcs12->guest_es_base;
2013         evmcs->guest_cs_base = vmcs12->guest_cs_base;
2014         evmcs->guest_ss_base = vmcs12->guest_ss_base;
2015         evmcs->guest_ds_base = vmcs12->guest_ds_base;
2016         evmcs->guest_fs_base = vmcs12->guest_fs_base;
2017         evmcs->guest_gs_base = vmcs12->guest_gs_base;
2018         evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
2019         evmcs->guest_tr_base = vmcs12->guest_tr_base;
2020         evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
2021         evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
2022
2023         evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
2024         evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
2025
2026         evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
2027         evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
2028         evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
2029         evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
2030
2031         evmcs->guest_pending_dbg_exceptions =
2032                 vmcs12->guest_pending_dbg_exceptions;
2033         evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
2034         evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
2035
2036         evmcs->guest_activity_state = vmcs12->guest_activity_state;
2037         evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
2038
2039         evmcs->guest_cr0 = vmcs12->guest_cr0;
2040         evmcs->guest_cr3 = vmcs12->guest_cr3;
2041         evmcs->guest_cr4 = vmcs12->guest_cr4;
2042         evmcs->guest_dr7 = vmcs12->guest_dr7;
2043
2044         evmcs->guest_physical_address = vmcs12->guest_physical_address;
2045
2046         evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
2047         evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
2048         evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
2049         evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
2050         evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
2051         evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
2052         evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
2053         evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
2054
2055         evmcs->exit_qualification = vmcs12->exit_qualification;
2056
2057         evmcs->guest_linear_address = vmcs12->guest_linear_address;
2058         evmcs->guest_rsp = vmcs12->guest_rsp;
2059         evmcs->guest_rflags = vmcs12->guest_rflags;
2060
2061         evmcs->guest_interruptibility_info =
2062                 vmcs12->guest_interruptibility_info;
2063         evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
2064         evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
2065         evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
2066         evmcs->vm_entry_exception_error_code =
2067                 vmcs12->vm_entry_exception_error_code;
2068         evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
2069
2070         evmcs->guest_rip = vmcs12->guest_rip;
2071
2072         evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
2073
2074         return;
2075 #else /* CONFIG_KVM_HYPERV */
2076         KVM_BUG_ON(1, vmx->vcpu.kvm);
2077 #endif /* CONFIG_KVM_HYPERV */
2078 }
2079
2080 /*
2081  * This is the equivalent of the nested hypervisor executing the vmptrld
2082  * instruction.
2083  */
2084 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2085         struct kvm_vcpu *vcpu, bool from_launch)
2086 {
2087 #ifdef CONFIG_KVM_HYPERV
2088         struct vcpu_vmx *vmx = to_vmx(vcpu);
2089         bool evmcs_gpa_changed = false;
2090         u64 evmcs_gpa;
2091
2092         if (likely(!guest_cpuid_has_evmcs(vcpu)))
2093                 return EVMPTRLD_DISABLED;
2094
2095         evmcs_gpa = nested_get_evmptr(vcpu);
2096         if (!evmptr_is_valid(evmcs_gpa)) {
2097                 nested_release_evmcs(vcpu);
2098                 return EVMPTRLD_DISABLED;
2099         }
2100
2101         if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2102                 vmx->nested.current_vmptr = INVALID_GPA;
2103
2104                 nested_release_evmcs(vcpu);
2105
2106                 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2107                                  &vmx->nested.hv_evmcs_map))
2108                         return EVMPTRLD_ERROR;
2109
2110                 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2111
2112                 /*
2113                  * Currently, KVM only supports eVMCS version 1
2114                  * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
2115                  * first u32 field of the eVMCS, which specifies the eVMCS
2116                  * VersionNumber, accordingly.
2117                  *
2118                  * The guest learns the eVMCS versions supported by the host by
2119                  * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
2120                  * expected to set this CPUID leaf according to the value
2121                  * returned in vmcs_version from nested_enable_evmcs().
2122                  *
2123                  * However, it turns out that Microsoft Hyper-V fails to comply
2124                  * with its own invented interface: when Hyper-V uses eVMCS, it
2125                  * simply sets the first u32 field of the eVMCS to the revision_id
2126                  * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version
2127                  * number, i.e. one of the supported versions specified in
2128                  * CPUID.0x4000000A.EAX[0:15].
2129                  *
2130                  * To work around this Hyper-V bug, accept either a supported
2131                  * eVMCS version or the VMCS12 revision_id as valid values for
2132                  * the first u32 field of the eVMCS.
2133                  */
2134                 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2135                     (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2136                         nested_release_evmcs(vcpu);
2137                         return EVMPTRLD_VMFAIL;
2138                 }
2139
2140                 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2141
2142                 evmcs_gpa_changed = true;
2143                 /*
2144                  * Unlike normal vmcs12, enlightened vmcs12 is not fully
2145                  * reloaded from the guest's memory (read-only fields, fields not
2146                  * present in struct hv_enlightened_vmcs, ...). Make sure there
2147                  * are no leftovers.
2148                  */
2149                 if (from_launch) {
2150                         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2151                         memset(vmcs12, 0, sizeof(*vmcs12));
2152                         vmcs12->hdr.revision_id = VMCS12_REVISION;
2153                 }
2154
2155         }
2156
2157         /*
2158          * Clean fields data can't be used on VMLAUNCH, or when switching
2159          * between different L2 guests, as KVM keeps a single VMCS12 per L1.
2160          */
2161         if (from_launch || evmcs_gpa_changed) {
2162                 vmx->nested.hv_evmcs->hv_clean_fields &=
2163                         ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2164
2165                 vmx->nested.force_msr_bitmap_recalc = true;
2166         }
2167
2168         return EVMPTRLD_SUCCEEDED;
2169 #else
2170         return EVMPTRLD_DISABLED;
2171 #endif
2172 }
2173
2174 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2175 {
2176         struct vcpu_vmx *vmx = to_vmx(vcpu);
2177
2178         if (nested_vmx_is_evmptr12_valid(vmx))
2179                 copy_vmcs12_to_enlightened(vmx);
2180         else
2181                 copy_vmcs12_to_shadow(vmx);
2182
2183         vmx->nested.need_vmcs12_to_shadow_sync = false;
2184 }
2185
2186 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2187 {
2188         struct vcpu_vmx *vmx =
2189                 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2190
2191         vmx->nested.preemption_timer_expired = true;
2192         kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2193         kvm_vcpu_kick(&vmx->vcpu);
2194
2195         return HRTIMER_NORESTART;
2196 }
2197
2198 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2199 {
2200         struct vcpu_vmx *vmx = to_vmx(vcpu);
2201         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2202
2203         u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2204                             VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2205
2206         if (!vmx->nested.has_preemption_timer_deadline) {
2207                 vmx->nested.preemption_timer_deadline =
2208                         vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
2209                 vmx->nested.has_preemption_timer_deadline = true;
2210         }
2211         return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
2212 }
2213
2214 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2215                                         u64 preemption_timeout)
2216 {
2217         struct vcpu_vmx *vmx = to_vmx(vcpu);
2218
2219         /*
2220          * A timer value of zero is architecturally guaranteed to cause
2221          * a VMExit prior to executing any instructions in the guest.
2222          */
2223         if (preemption_timeout == 0) {
2224                 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2225                 return;
2226         }
2227
2228         if (vcpu->arch.virtual_tsc_khz == 0)
2229                 return;
2230
2231         preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2232         preemption_timeout *= 1000000;
2233         do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2234         hrtimer_start(&vmx->nested.preemption_timer,
2235                       ktime_add_ns(ktime_get(), preemption_timeout),
2236                       HRTIMER_MODE_ABS_PINNED);
2237 }
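
/*
 * A standalone sketch of the unit conversion performed just above:
 * preemption-timer ticks -> TSC cycles -> nanoseconds.  TIMER_RATE stands
 * in for VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; the rate of 5 and the
 * 2 GHz TSC are assumed example values.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define TIMER_RATE	5	/* one timer tick per 2^TIMER_RATE TSC cycles (assumed) */

static uint64_t timer_ticks_to_ns(uint64_t ticks, uint64_t tsc_khz)
{
	uint64_t cycles = ticks << TIMER_RATE;

	/* cycles / (tsc_khz * 1000) seconds == cycles * 1000000 / tsc_khz ns */
	return cycles * 1000000ULL / tsc_khz;
}

int main(void)
{
	/* 1000 ticks on a 2 GHz TSC: 32000 cycles -> 16000 ns. */
	printf("%llu ns\n", (unsigned long long)timer_ticks_to_ns(1000, 2000000));
	return 0;
}
#endif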
2238
2239 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2240 {
2241         if (vmx->nested.nested_run_pending &&
2242             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2243                 return vmcs12->guest_ia32_efer;
2244         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2245                 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2246         else
2247                 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2248 }
2249
2250 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2251 {
2252         struct kvm *kvm = vmx->vcpu.kvm;
2253
2254         /*
2255          * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2256          * according to L0's settings (vmcs12 is irrelevant here).  Host
2257          * fields that come from L0 and are not constant, e.g. HOST_CR3,
2258          * will be set as needed prior to VMLAUNCH/VMRESUME.
2259          */
2260         if (vmx->nested.vmcs02_initialized)
2261                 return;
2262         vmx->nested.vmcs02_initialized = true;
2263
2264         /*
2265          * We don't care what the EPTP value is; we just need to guarantee
2266          * it's valid so we don't get a false positive when doing early
2267          * consistency checks.
2268          */
2269         if (enable_ept && nested_early_check)
2270                 vmcs_write64(EPT_POINTER,
2271                              construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2272
2273         if (vmx->ve_info)
2274                 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
2275
2276         /* All VMFUNCs are currently emulated through L0 vmexits.  */
2277         if (cpu_has_vmx_vmfunc())
2278                 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2279
2280         if (cpu_has_vmx_posted_intr())
2281                 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2282
2283         if (cpu_has_vmx_msr_bitmap())
2284                 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2285
2286         /*
2287          * PML is emulated for L2, but never enabled in hardware as the MMU
2288          * handles A/D emulation.  Disabling PML for L2 also avoids having to
2289          * deal with filtering out L2 GPAs from the buffer.
2290          */
2291         if (enable_pml) {
2292                 vmcs_write64(PML_ADDRESS, 0);
2293                 vmcs_write16(GUEST_PML_INDEX, -1);
2294         }
2295
2296         if (cpu_has_vmx_encls_vmexit())
2297                 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
2298
2299         if (kvm_notify_vmexit_enabled(kvm))
2300                 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
2301
2302         /*
2303          * Set the MSR load/store lists to match L0's settings.  Only the
2304          * addresses are constant (for vmcs02), the counts can change based
2305          * on L2's behavior, e.g. switching to/from long mode.
2306          */
2307         vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2308         vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2309         vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2310
2311         vmx_set_constant_host_state(vmx);
2312 }
2313
2314 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2315                                       struct vmcs12 *vmcs12)
2316 {
2317         prepare_vmcs02_constant_state(vmx);
2318
2319         vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
2320
2321         /*
2322          * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
2323          * same VPID as the host.  Emulate this behavior by using vpid01 for L2
2324          * if VPID is disabled in vmcs12.  Note, if VPID is disabled, VM-Enter
2325          * and VM-Exit are architecturally required to flush VPID=0, but *only*
2326          * VPID=0.  I.e. using vpid02 would be ok (so long as KVM emulates the
2327          * required flushes), but doing so would cause KVM to over-flush.  E.g.
2328          * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
2329          * and then runs L2 X again, then KVM can and should retain TLB entries
2330          * for VPID12=1.
2331          */
2332         if (enable_vpid) {
2333                 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2334                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2335                 else
2336                         vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2337         }
2338 }
2339
2340 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2341                                  struct vmcs12 *vmcs12)
2342 {
2343         u32 exec_control;
2344         u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2345
2346         if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
2347                 prepare_vmcs02_early_rare(vmx, vmcs12);
2348
2349         /*
2350          * PIN CONTROLS
2351          */
2352         exec_control = __pin_controls_get(vmcs01);
2353         exec_control |= (vmcs12->pin_based_vm_exec_control &
2354                          ~PIN_BASED_VMX_PREEMPTION_TIMER);
2355
2356         /* Posted interrupts setting is only taken from vmcs12.  */
2357         vmx->nested.pi_pending = false;
2358         if (nested_cpu_has_posted_intr(vmcs12)) {
2359                 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2360         } else {
2361                 vmx->nested.posted_intr_nv = -1;
2362                 exec_control &= ~PIN_BASED_POSTED_INTR;
2363         }
2364         pin_controls_set(vmx, exec_control);
2365
2366         /*
2367          * EXEC CONTROLS
2368          */
2369         exec_control = __exec_controls_get(vmcs01); /* L0's desires */
2370         exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2371         exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2372         exec_control &= ~CPU_BASED_TPR_SHADOW;
2373         exec_control |= vmcs12->cpu_based_vm_exec_control;
2374
2375         vmx->nested.l1_tpr_threshold = -1;
2376         if (exec_control & CPU_BASED_TPR_SHADOW)
2377                 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2378 #ifdef CONFIG_X86_64
2379         else
2380                 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2381                                 CPU_BASED_CR8_STORE_EXITING;
2382 #endif
2383
2384         /*
2385          * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2386          * A vmexit (to either the L1 hypervisor or L0 userspace) is always needed
2387          */
2388         exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2389         exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2390
2391         /*
2392          * This bit will be computed in nested_get_vmcs12_pages, because
2393          * we do not have access to L1's MSR bitmap yet.  For now, keep
2394          * the same bit as before, hoping to avoid multiple VMWRITEs that
2395          * only set/clear this bit.
2396          */
2397         exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2398         exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2399
2400         exec_controls_set(vmx, exec_control);
2401
2402         /*
2403          * SECONDARY EXEC CONTROLS
2404          */
2405         if (cpu_has_secondary_exec_ctrls()) {
2406                 exec_control = __secondary_exec_controls_get(vmcs01);
2407
2408                 /* Take the following fields only from vmcs12 */
2409                 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2410                                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2411                                   SECONDARY_EXEC_ENABLE_INVPCID |
2412                                   SECONDARY_EXEC_ENABLE_RDTSCP |
2413                                   SECONDARY_EXEC_ENABLE_XSAVES |
2414                                   SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2415                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2416                                   SECONDARY_EXEC_APIC_REGISTER_VIRT |
2417                                   SECONDARY_EXEC_ENABLE_VMFUNC |
2418                                   SECONDARY_EXEC_DESC);
2419
2420                 if (nested_cpu_has(vmcs12,
2421                                    CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2422                         exec_control |= vmcs12->secondary_vm_exec_control;
2423
2424                 /* PML is emulated and never enabled in hardware for L2. */
2425                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
2426
2427                 /* VMCS shadowing for L2 is emulated for now */
2428                 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2429
2430                 /*
2431                  * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2432                  * will not have to rewrite the controls just for this bit.
2433                  */
2434                 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
2435                         exec_control |= SECONDARY_EXEC_DESC;
2436
2437                 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2438                         vmcs_write16(GUEST_INTR_STATUS,
2439                                 vmcs12->guest_intr_status);
2440
2441                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2442                         exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2443
2444                 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2445                         vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2446
2447                 secondary_exec_controls_set(vmx, exec_control);
2448         }
2449
2450         /*
2451          * ENTRY CONTROLS
2452          *
2453          * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2454          * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2455          * on the related bits (if supported by the CPU) in the hope that
2456          * we can avoid VMWrites during vmx_set_efer().
2457          *
2458          * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
2459          * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
2460          * do the same for L2.
2461          */
2462         exec_control = __vm_entry_controls_get(vmcs01);
2463         exec_control |= (vmcs12->vm_entry_controls &
2464                          ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
2465         exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
2466         if (cpu_has_load_ia32_efer()) {
2467                 if (guest_efer & EFER_LMA)
2468                         exec_control |= VM_ENTRY_IA32E_MODE;
2469                 if (guest_efer != kvm_host.efer)
2470                         exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2471         }
2472         vm_entry_controls_set(vmx, exec_control);
2473
2474         /*
2475          * EXIT CONTROLS
2476          *
2477          * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2478          * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2479          * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2480          */
2481         exec_control = __vm_exit_controls_get(vmcs01);
2482         if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
2483                 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2484         else
2485                 exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
2486         vm_exit_controls_set(vmx, exec_control);
2487
2488         /*
2489          * Interrupt/Exception Fields
2490          */
2491         if (vmx->nested.nested_run_pending) {
2492                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2493                              vmcs12->vm_entry_intr_info_field);
2494                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2495                              vmcs12->vm_entry_exception_error_code);
2496                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2497                              vmcs12->vm_entry_instruction_len);
2498                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2499                              vmcs12->guest_interruptibility_info);
2500                 vmx->loaded_vmcs->nmi_known_unmasked =
2501                         !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2502         } else {
2503                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2504         }
2505 }
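
/*
 * A condensed sketch of the control-merging pattern used throughout the
 * function above: start from L0's vmcs01 value, strip the bits that must
 * come from L1, OR in vmcs12's request, then force the bits KVM always
 * wants for L2.  The bit names below are placeholders, not the kernel's
 * definitions.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define CTRL_TPR_SHADOW		(1u << 21)	/* placeholder bit positions */
#define CTRL_UNCOND_IO_EXIT	(1u << 24)

int main(void)
{
	uint32_t vmcs01_ctls = CTRL_TPR_SHADOW;		/* L0's configuration */
	uint32_t vmcs12_ctls = 0;			/* L1's request for L2 */
	uint32_t vmcs02_ctls = vmcs01_ctls;

	vmcs02_ctls &= ~CTRL_TPR_SHADOW;	/* taken only from vmcs12 */
	vmcs02_ctls |= vmcs12_ctls;		/* layer L1's desires on top */
	vmcs02_ctls |= CTRL_UNCOND_IO_EXIT;	/* always required for L2 */

	printf("vmcs02 controls: 0x%08x\n", vmcs02_ctls);
	return 0;
}
#endif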
2506
2507 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2508 {
2509         struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);
2510
2511         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2512                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2513
2514                 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2515                 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2516                 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2517                 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2518                 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2519                 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2520                 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2521                 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2522                 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2523                 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2524                 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2525                 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2526                 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2527                 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2528                 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2529                 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2530                 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2531                 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2532                 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2533                 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2534                 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2535                 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2536                 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2537                 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2538                 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2539                 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2540                 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2541                 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2542                 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2543                 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2544                 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2545                 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2546                 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2547                 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2548                 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2549                 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2550
2551                 vmx_segment_cache_clear(vmx);
2552         }
2553
2554         if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2555                            HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2556                 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2557                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2558                             vmcs12->guest_pending_dbg_exceptions);
2559                 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2560                 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2561
2562                 /*
2563                  * L1 may access L2's PDPTRs, so save them in order to
2564                  * construct vmcs12.
2565                  */
2566                 if (enable_ept) {
2567                         vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2568                         vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2569                         vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2570                         vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2571                 }
2572
2573                 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2574                     (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2575                         vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2576         }
2577
2578         if (nested_cpu_has_xsaves(vmcs12))
2579                 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2580
2581         /*
2582          * Whether page-faults are trapped is determined by a combination of
2583          * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
2584          * doesn't care about page faults then we should set all of these to
2585          * L1's desires. However, if L0 does care about (some) page faults, it
2586          * is not easy (if at all possible?) to merge L0's and L1's desires, so
2587          * we simply ask to exit on each and every L2 page fault. This is done by
2588          * setting MASK=MATCH=0 and (see below) EB.PF=1.
2589          * Note that below we don't need special code to set EB.PF beyond the
2590          * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2591          * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2592          * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2593          */
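        /*
         * Note that MASK=MATCH=0 makes the PFEC test "(PFEC & MASK) == MATCH"
         * true for every fault, so whether an L2 #PF exits is then decided
         * solely by EB.PF.
         */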
2594         if (vmx_need_pf_intercept(&vmx->vcpu)) {
2595                 /*
2596                  * TODO: if both L0 and L1 need the same MASK and MATCH,
2597                  * go ahead and use it?
2598                  */
2599                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2600                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2601         } else {
2602                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2603                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2604         }
2605
2606         if (cpu_has_vmx_apicv()) {
2607                 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2608                 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2609                 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2610                 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2611         }
2612
2613         /*
2614          * Make sure the msr_autostore list is up to date before we set the
2615          * count in the vmcs02.
2616          */
2617         prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2618
2619         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2620         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2621         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2622
2623         set_cr4_guest_host_mask(vmx);
2624 }
2625
2626 /*
2627  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2628  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2629  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2630  * guest in a way that satisfies both L1's requests and our own needs. In
2631  * addition to modifying the active vmcs (which is vmcs02), this function
2632  * also has necessary side effects, such as setting various
2633  * vcpu->arch fields.
2634  * Returns 0 on success, -EINVAL on failure; on failure, the invalid-state
2635  * exit qualification code is assigned to entry_failure_code.
2636  */
2637 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2638                           bool from_vmentry,
2639                           enum vm_entry_failure_code *entry_failure_code)
2640 {
2641         struct vcpu_vmx *vmx = to_vmx(vcpu);
2642         struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
2643         bool load_guest_pdptrs_vmcs12 = false;
2644
2645         if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
2646                 prepare_vmcs02_rare(vmx, vmcs12);
2647                 vmx->nested.dirty_vmcs12 = false;
2648
2649                 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
2650                         !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2651         }
2652
2653         if (vmx->nested.nested_run_pending &&
2654             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2655                 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2656                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2657         } else {
2658                 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2659                 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
2660         }
2661         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2662             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2663                 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
2664         vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2665
2666         /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2667          * bitwise-or of what L1 wants to trap for L2, and what we want to
2668          * trap. Note that CR0.TS also needs updating - we do this later.
2669          */
2670         vmx_update_exception_bitmap(vcpu);
2671         vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2672         vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2673
2674         if (vmx->nested.nested_run_pending &&
2675             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2676                 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2677                 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2678         } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2679                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2680         }
2681
2682         vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2683                         vcpu->arch.l1_tsc_offset,
2684                         vmx_get_l2_tsc_offset(vcpu),
2685                         vmx_get_l2_tsc_multiplier(vcpu));
2686
2687         vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2688                         vcpu->arch.l1_tsc_scaling_ratio,
2689                         vmx_get_l2_tsc_multiplier(vcpu));
2690
2691         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2692         if (kvm_caps.has_tsc_control)
2693                 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2694
2695         nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2696
2697         if (nested_cpu_has_ept(vmcs12))
2698                 nested_ept_init_mmu_context(vcpu);
2699
2700         /*
2701          * Override the CR0/CR4 read shadows after setting the effective guest
2702          * CR0/CR4.  The common helpers also set the shadows, but they don't
2703          * account for vmcs12's cr0/4_guest_host_mask.
2704          */
2705         vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2706         vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2707
2708         vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2709         vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2710
2711         vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2712         /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2713         vmx_set_efer(vcpu, vcpu->arch.efer);
2714
2715         /*
2716          * If guest state is invalid and unrestricted guest is disabled,
2717          * it means L1 attempted a VMEntry to L2 with invalid state.
2718          * Fail the VMEntry.
2719          *
2720          * However, when force loading the guest state (on SMM exit or when
2721          * loading nested state after migration), it is possible to have
2722          * invalid guest state at this point; it will be fixed up later by
2723          * restoring L2 register state.
2724          */
2725         if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2726                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2727                 return -EINVAL;
2728         }
2729
2730         /* Load vmcs12->guest_cr3, for either EPT or shadow paging. */
2731         if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2732                                 from_vmentry, entry_failure_code))
2733                 return -EINVAL;
2734
2735         /*
2736          * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2737          * on nested VM-Exit, which can occur without actually running L2 and
2738          * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2739          * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the
2740          * transition to HLT instead of running L2.
2741          */
2742         if (enable_ept)
2743                 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2744
2745         /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2746         if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2747             is_pae_paging(vcpu)) {
2748                 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2749                 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2750                 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2751                 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2752         }
2753
2754         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2755             kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
2756             WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2757                                      vmcs12->guest_ia32_perf_global_ctrl))) {
2758                 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2759                 return -EINVAL;
2760         }
2761
2762         kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2763         kvm_rip_write(vcpu, vmcs12->guest_rip);
2764
2765         /*
2766          * It was observed that genuine Hyper-V running in L1 doesn't reset
2767          * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2768          * bits when it changes a field in eVMCS. Mark all fields as clean
2769          * here.
2770          */
2771         if (nested_vmx_is_evmptr12_valid(vmx))
2772                 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2773
2774         return 0;
2775 }
2776
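/*
 * NMI control consistency checks: "virtual NMIs" requires "NMI exiting",
 * and "NMI-window exiting" in turn requires "virtual NMIs".
 */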
2777 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2778 {
2779         if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2780                nested_cpu_has_virtual_nmis(vmcs12)))
2781                 return -EINVAL;
2782
2783         if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2784                nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2785                 return -EINVAL;
2786
2787         return 0;
2788 }
2789
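/*
 * Validate the EPTP L1 wants to use: the memory type and page-walk length
 * must be advertised in the EPT capabilities exposed to L1, reserved bits
 * must be clear, and the A/D-enable bit may only be set if supported.
 */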
2790 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2791 {
2792         struct vcpu_vmx *vmx = to_vmx(vcpu);
2793
2794         /* Check for memory type validity */
2795         switch (new_eptp & VMX_EPTP_MT_MASK) {
2796         case VMX_EPTP_MT_UC:
2797                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2798                         return false;
2799                 break;
2800         case VMX_EPTP_MT_WB:
2801                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2802                         return false;
2803                 break;
2804         default:
2805                 return false;
2806         }
2807
2808         /* Page-walk levels validity. */
2809         switch (new_eptp & VMX_EPTP_PWL_MASK) {
2810         case VMX_EPTP_PWL_5:
2811                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2812                         return false;
2813                 break;
2814         case VMX_EPTP_PWL_4:
2815                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2816                         return false;
2817                 break;
2818         default:
2819                 return false;
2820         }
2821
2822         /* Reserved bits should not be set */
2823         if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
2824                 return false;
2825
2826         /* AD, if set, should be supported */
2827         if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
2828                 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2829                         return false;
2830         }
2831
2832         return true;
2833 }
2834
2835 /*
2836  * Checks related to VM-Execution Control Fields
2837  */
2838 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2839                                               struct vmcs12 *vmcs12)
2840 {
2841         struct vcpu_vmx *vmx = to_vmx(vcpu);
2842
2843         if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2844                                    vmx->nested.msrs.pinbased_ctls_low,
2845                                    vmx->nested.msrs.pinbased_ctls_high)) ||
2846             CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2847                                    vmx->nested.msrs.procbased_ctls_low,
2848                                    vmx->nested.msrs.procbased_ctls_high)))
2849                 return -EINVAL;
2850
2851         if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2852             CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2853                                    vmx->nested.msrs.secondary_ctls_low,
2854                                    vmx->nested.msrs.secondary_ctls_high)))
2855                 return -EINVAL;
2856
2857         if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2858             nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2859             nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2860             nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2861             nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2862             nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2863             nested_vmx_check_nmi_controls(vmcs12) ||
2864             nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2865             nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2866             nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2867             nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2868             CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2869                 return -EINVAL;
2870
2871         if (!nested_cpu_has_preemption_timer(vmcs12) &&
2872             nested_cpu_has_save_preemption_timer(vmcs12))
2873                 return -EINVAL;
2874
2875         if (nested_cpu_has_ept(vmcs12) &&
2876             CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2877                 return -EINVAL;
2878
2879         if (nested_cpu_has_vmfunc(vmcs12)) {
2880                 if (CC(vmcs12->vm_function_control &
2881                        ~vmx->nested.msrs.vmfunc_controls))
2882                         return -EINVAL;
2883
2884                 if (nested_cpu_has_eptp_switching(vmcs12)) {
2885                         if (CC(!nested_cpu_has_ept(vmcs12)) ||
2886                             CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2887                                 return -EINVAL;
2888                 }
2889         }
2890
2891         return 0;
2892 }
2893
2894 /*
2895  * Checks related to VM-Exit Control Fields
2896  */
2897 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2898                                          struct vmcs12 *vmcs12)
2899 {
2900         struct vcpu_vmx *vmx = to_vmx(vcpu);
2901
2902         if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2903                                     vmx->nested.msrs.exit_ctls_low,
2904                                     vmx->nested.msrs.exit_ctls_high)) ||
2905             CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2906                 return -EINVAL;
2907
2908         return 0;
2909 }
2910
2911 /*
2912  * Checks related to VM-Entry Control Fields
2913  */
2914 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2915                                           struct vmcs12 *vmcs12)
2916 {
2917         struct vcpu_vmx *vmx = to_vmx(vcpu);
2918
2919         if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2920                                     vmx->nested.msrs.entry_ctls_low,
2921                                     vmx->nested.msrs.entry_ctls_high)))
2922                 return -EINVAL;
2923
2924         /*
2925          * From the Intel SDM, volume 3:
2926          * Fields relevant to VM-entry event injection must be set properly.
2927          * These fields are the VM-entry interruption-information field, the
2928          * VM-entry exception error code, and the VM-entry instruction length.
2929          */
2930         if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2931                 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2932                 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2933                 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2934                 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2935                 bool should_have_error_code;
2936                 bool urg = nested_cpu_has2(vmcs12,
2937                                            SECONDARY_EXEC_UNRESTRICTED_GUEST);
2938                 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2939
2940                 /* VM-entry interruption-info field: interruption type */
2941                 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2942                     CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2943                        !nested_cpu_supports_monitor_trap_flag(vcpu)))
2944                         return -EINVAL;
2945
2946                 /* VM-entry interruption-info field: vector */
2947                 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2948                     CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2949                     CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2950                         return -EINVAL;
2951
2952                 /* VM-entry interruption-info field: deliver error code */
2953                 should_have_error_code =
2954                         intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2955                         x86_exception_has_error_code(vector);
2956                 if (CC(has_error_code != should_have_error_code))
2957                         return -EINVAL;
2958
2959                 /* VM-entry exception error code */
2960                 if (CC(has_error_code &&
2961                        vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2962                         return -EINVAL;
2963
2964                 /* VM-entry interruption-info field: reserved bits */
2965                 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2966                         return -EINVAL;
2967
2968                 /* VM-entry instruction length */
2969                 switch (intr_type) {
2970                 case INTR_TYPE_SOFT_EXCEPTION:
2971                 case INTR_TYPE_SOFT_INTR:
2972                 case INTR_TYPE_PRIV_SW_EXCEPTION:
2973                         if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2974                             CC(vmcs12->vm_entry_instruction_len == 0 &&
2975                             CC(!nested_cpu_has_zero_length_injection(vcpu))))
2976                                 return -EINVAL;
2977                 }
2978         }
2979
2980         if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2981                 return -EINVAL;
2982
2983         return 0;
2984 }
2985
2986 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2987                                      struct vmcs12 *vmcs12)
2988 {
2989         if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2990             nested_check_vm_exit_controls(vcpu, vmcs12) ||
2991             nested_check_vm_entry_controls(vcpu, vmcs12))
2992                 return -EINVAL;
2993
2994 #ifdef CONFIG_KVM_HYPERV
2995         if (guest_cpuid_has_evmcs(vcpu))
2996                 return nested_evmcs_check_controls(vmcs12);
2997 #endif
2998
2999         return 0;
3000 }
3001
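/*
 * Under CONFIG_X86_64, the "host address-space size" VM-exit control must
 * match L1's EFER.LMA; otherwise there is nothing to check.
 */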
3002 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
3003                                        struct vmcs12 *vmcs12)
3004 {
3005 #ifdef CONFIG_X86_64
3006         if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
3007                 !!(vcpu->arch.efer & EFER_LMA)))
3008                 return -EINVAL;
3009 #endif
3010         return 0;
3011 }
3012
3013 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
3014 {
3015         /*
3016          * Check that the given linear address is canonical after a VM exit
3017          * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
3018          */
3019         u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;
3020
3021         return !__is_canonical_address(la, l1_address_bits_on_exit);
3022 }
3023
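/*
 * Consistency checks on the vmcs12 host-state area, i.e. the L1 state that
 * will be loaded on a nested VM-exit, mirroring the SDM's checks on host
 * control registers, MSRs and segment fields.
 */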
3024 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
3025                                        struct vmcs12 *vmcs12)
3026 {
3027         bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
3028
3029         if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
3030             CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
3031             CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
3032                 return -EINVAL;
3033
3034         if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
3035             CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
3036                 return -EINVAL;
3037
3038         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
3039             CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
3040                 return -EINVAL;
3041
3042         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3043             CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3044                                            vmcs12->host_ia32_perf_global_ctrl)))
3045                 return -EINVAL;
3046
3047         if (ia32e) {
3048                 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
3049                         return -EINVAL;
3050         } else {
3051                 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
3052                     CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
3053                     CC((vmcs12->host_rip) >> 32))
3054                         return -EINVAL;
3055         }
3056
3057         if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3058             CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3059             CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3060             CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3061             CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3062             CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3063             CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
3064             CC(vmcs12->host_cs_selector == 0) ||
3065             CC(vmcs12->host_tr_selector == 0) ||
3066             CC(vmcs12->host_ss_selector == 0 && !ia32e))
3067                 return -EINVAL;
3068
3069         if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
3070             CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
3071             CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
3072             CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
3073             CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
3074             CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
3075                 return -EINVAL;
3076
3077         /*
3078          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
3079          * IA32_EFER MSR must be 0 in the field for that register. In addition,
3080          * the values of the LMA and LME bits in the field must each be that of
3081          * the host address-space size VM-exit control.
3082          */
3083         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
3084                 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
3085                     CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
3086                     CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
3087                         return -EINVAL;
3088         }
3089
3090         return 0;
3091 }
3092
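/*
 * If vmcs12 has a valid VMCS link pointer, it must be a legal, page-aligned
 * GPA whose header carries the expected revision ID and whose shadow-VMCS
 * indicator matches vmcs12's "VMCS shadowing" control.
 */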
3093 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
3094                                           struct vmcs12 *vmcs12)
3095 {
3096         struct vcpu_vmx *vmx = to_vmx(vcpu);
3097         struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
3098         struct vmcs_hdr hdr;
3099
3100         if (vmcs12->vmcs_link_pointer == INVALID_GPA)
3101                 return 0;
3102
3103         if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
3104                 return -EINVAL;
3105
3106         if (ghc->gpa != vmcs12->vmcs_link_pointer &&
3107             CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
3108                                          vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
3109                 return -EINVAL;
3110
3111         if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
3112                                             offsetof(struct vmcs12, hdr),
3113                                             sizeof(hdr))))
3114                 return -EINVAL;
3115
3116         if (CC(hdr.revision_id != VMCS12_REVISION) ||
3117             CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
3118                 return -EINVAL;
3119
3120         return 0;
3121 }
3122
3123 /*
3124  * Checks related to Guest Non-register State
3125  */
3126 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
3127 {
3128         if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
3129                vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
3130                vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
3131                 return -EINVAL;
3132
3133         return 0;
3134 }
3135
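/*
 * Software checks on vmcs12's guest-state area.  A failure here results in a
 * VM-entry failure with EXIT_REASON_INVALID_STATE, with *entry_failure_code
 * providing the exit qualification.
 */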
3136 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
3137                                         struct vmcs12 *vmcs12,
3138                                         enum vm_entry_failure_code *entry_failure_code)
3139 {
3140         bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
3141
3142         *entry_failure_code = ENTRY_FAIL_DEFAULT;
3143
3144         if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
3145             CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
3146                 return -EINVAL;
3147
3148         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
3149             CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
3150                 return -EINVAL;
3151
3152         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3153             CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3154                 return -EINVAL;
3155
3156         if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3157                 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3158                 return -EINVAL;
3159         }
3160
3161         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3162             CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3163                                            vmcs12->guest_ia32_perf_global_ctrl)))
3164                 return -EINVAL;
3165
3166         if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
3167                 return -EINVAL;
3168
3169         if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
3170             CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
3171                 return -EINVAL;
3172
3173         /*
3174          * If the load IA32_EFER VM-entry control is 1, the following checks
3175          * are performed on the field for the IA32_EFER MSR:
3176          * - Bits reserved in the IA32_EFER MSR must be 0.
3177          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3178          *   the IA-32e mode guest VM-exit control. It must also be identical
3179          *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3180          *   CR0.PG) is 1.
3181          */
3182         if (to_vmx(vcpu)->nested.nested_run_pending &&
3183             (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3184                 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3185                     CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3186                     CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3187                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3188                         return -EINVAL;
3189         }
3190
3191         if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3192             (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3193              CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3194                 return -EINVAL;
3195
3196         if (nested_check_guest_non_reg_state(vmcs12))
3197                 return -EINVAL;
3198
3199         return 0;
3200 }
3201
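/*
 * When nested_early_check is enabled, perform a dry-run VMEnter of vmcs02
 * with a deliberately invalid GUEST_RFLAGS so that hardware runs the
 * VM-entry checks KVM does not emulate; a VM-Fail detected by hardware is
 * reported to L1 as a failed VMLAUNCH/VMRESUME.
 */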
3202 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3203 {
3204         struct vcpu_vmx *vmx = to_vmx(vcpu);
3205         unsigned long cr3, cr4;
3206         bool vm_fail;
3207
3208         if (!nested_early_check)
3209                 return 0;
3210
3211         if (vmx->msr_autoload.host.nr)
3212                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3213         if (vmx->msr_autoload.guest.nr)
3214                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3215
3216         preempt_disable();
3217
3218         vmx_prepare_switch_to_guest(vcpu);
3219
3220         /*
3221          * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3222          * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
3223          * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3224          * there is no need to preserve other bits or save/restore the field.
3225          */
3226         vmcs_writel(GUEST_RFLAGS, 0);
3227
3228         cr3 = __get_current_cr3_fast();
3229         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3230                 vmcs_writel(HOST_CR3, cr3);
3231                 vmx->loaded_vmcs->host_state.cr3 = cr3;
3232         }
3233
3234         cr4 = cr4_read_shadow();
3235         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3236                 vmcs_writel(HOST_CR4, cr4);
3237                 vmx->loaded_vmcs->host_state.cr4 = cr4;
3238         }
3239
3240         vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3241                                  __vmx_vcpu_run_flags(vmx));
3242
3243         if (vmx->msr_autoload.host.nr)
3244                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3245         if (vmx->msr_autoload.guest.nr)
3246                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3247
3248         if (vm_fail) {
3249                 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3250
3251                 preempt_enable();
3252
3253                 trace_kvm_nested_vmenter_failed(
3254                         "early hardware check VM-instruction error: ", error);
3255                 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3256                 return 1;
3257         }
3258
3259         /*
3260          * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3261          */
3262         if (hw_breakpoint_active())
3263                 set_debugreg(__this_cpu_read(cpu_dr7), 7);
3264         local_irq_enable();
3265         preempt_enable();
3266
3267         /*
3268          * A non-failing VMEntry means we somehow entered guest mode with
3269          * an illegal RIP, and that's just the tip of the iceberg.  There
3270          * is no telling what memory has been modified or what state has
3271          * been exposed to unknown code.  Hitting this all but guarantees
3272          * a (very critical) hardware issue.
3273          */
3274         WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3275                 VMX_EXIT_REASONS_FAILED_VMENTRY));
3276
3277         return 0;
3278 }
3279
3280 #ifdef CONFIG_KVM_HYPERV
3281 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3282 {
3283         struct vcpu_vmx *vmx = to_vmx(vcpu);
3284
3285         /*
3286          * hv_evmcs may end up being not mapped after migration (when
3287          * L2 was running), map it here to make sure vmcs12 changes are
3288          * properly reflected.
3289          */
3290         if (guest_cpuid_has_evmcs(vcpu) &&
3291             vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3292                 enum nested_evmptrld_status evmptrld_status =
3293                         nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3294
3295                 if (evmptrld_status == EVMPTRLD_VMFAIL ||
3296                     evmptrld_status == EVMPTRLD_ERROR)
3297                         return false;
3298
3299                 /*
3300                  * Post migration, vmcs12 always provides the most up-to-date
3301                  * information; copy it to the eVMCS upon entry.
3302                  */
3303                 vmx->nested.need_vmcs12_to_shadow_sync = true;
3304         }
3305
3306         return true;
3307 }
3308 #endif
3309
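/*
 * Map the guest pages referenced by vmcs12 (APIC-access page, virtual-APIC
 * page, posted-interrupt descriptor) and point the corresponding vmcs02
 * fields at them, degrading or failing the affected controls when a mapping
 * isn't available.  The MSR bitmap is enabled for vmcs02 only if
 * nested_vmx_prepare_msr_bitmap() succeeds.
 */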
3310 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3311 {
3312         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3313         struct vcpu_vmx *vmx = to_vmx(vcpu);
3314         struct kvm_host_map *map;
3315
3316         if (!vcpu->arch.pdptrs_from_userspace &&
3317             !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3318                 /*
3319                  * Reload the guest's PDPTRs since, after a migration,
3320                  * the guest CR3 might be restored prior to setting the nested
3321                  * state, which can lead to loading the wrong PDPTRs.
3322                  */
3323                 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
3324                         return false;
3325         }
3326
3327
3328         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3329                 map = &vmx->nested.apic_access_page_map;
3330
3331                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
3332                         vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
3333                 } else {
3334                         pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
3335                                              __func__);
3336                         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3337                         vcpu->run->internal.suberror =
3338                                 KVM_INTERNAL_ERROR_EMULATION;
3339                         vcpu->run->internal.ndata = 0;
3340                         return false;
3341                 }
3342         }
3343
3344         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3345                 map = &vmx->nested.virtual_apic_map;
3346
3347                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3348                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3349                 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3350                            nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3351                            !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3352                         /*
3353                          * The processor will never use the TPR shadow, so simply
3354                          * clear the bit from the execution control.  Such a
3355                          * configuration is useless, but it happens in tests.
3356                          * For any other configuration, failing the vm entry is
3357                          * _not_ what the processor does but it's basically the
3358                          * only possibility we have.
3359                          */
3360                         exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3361                 } else {
3362                         /*
3363                          * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3364                          * force VM-Entry to fail.
3365                          */
3366                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
3367                 }
3368         }
3369
3370         if (nested_cpu_has_posted_intr(vmcs12)) {
3371                 map = &vmx->nested.pi_desc_map;
3372
3373                 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3374                         vmx->nested.pi_desc =
3375                                 (struct pi_desc *)(((void *)map->hva) +
3376                                 offset_in_page(vmcs12->posted_intr_desc_addr));
3377                         vmcs_write64(POSTED_INTR_DESC_ADDR,
3378                                      pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3379                 } else {
3380                         /*
3381                          * Defer the KVM_INTERNAL_EXIT until KVM tries to
3382                          * access the contents of the VMCS12 posted interrupt
3383                          * descriptor. (Note that KVM may do this when it
3384                          * should not, per the architectural specification.)
3385                          */
3386                         vmx->nested.pi_desc = NULL;
3387                         pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3388                 }
3389         }
3390         if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3391                 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3392         else
3393                 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3394
3395         return true;
3396 }
3397
3398 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3399 {
3400 #ifdef CONFIG_KVM_HYPERV
3401         /*
3402          * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
3403          * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
3404          * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
3405          * migration.
3406          */
3407         if (!nested_get_evmcs_page(vcpu)) {
3408                 pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3409                                      __func__);
3410                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3411                 vcpu->run->internal.suberror =
3412                         KVM_INTERNAL_ERROR_EMULATION;
3413                 vcpu->run->internal.ndata = 0;
3414
3415                 return false;
3416         }
3417 #endif
3418
3419         if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3420                 return false;
3421
3422         return true;
3423 }
3424
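/*
 * Emulate PML on behalf of L2: if vmcs12 enables PML, log the given GPA into
 * L1's PML buffer at the current guest_pml_index, flagging a "PML full"
 * event when the index is out of range.
 */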
3425 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3426 {
3427         struct vmcs12 *vmcs12;
3428         struct vcpu_vmx *vmx = to_vmx(vcpu);
3429         gpa_t dst;
3430
3431         if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3432                 return 0;
3433
3434         if (WARN_ON_ONCE(vmx->nested.pml_full))
3435                 return 1;
3436
3437         /*
3438          * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3439          * set is already checked as part of A/D emulation.
3440          */
3441         vmcs12 = get_vmcs12(vcpu);
3442         if (!nested_cpu_has_pml(vmcs12))
3443                 return 0;
3444
3445         if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3446                 vmx->nested.pml_full = true;
3447                 return 1;
3448         }
3449
3450         gpa &= ~0xFFFull;
3451         dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3452
3453         if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3454                                  offset_in_page(dst), sizeof(gpa)))
3455                 return 0;
3456
3457         vmcs12->guest_pml_index--;
3458
3459         return 0;
3460 }
3461
3462 /*
3463  * Intel's VMX Instruction Reference specifies a common set of prerequisites
3464  * for running VMX instructions (except VMXON, whose prerequisites are
3465  * slightly different). It also specifies what exception to inject otherwise.
3466  * Note that many of these exceptions have priority over VM exits, so they
3467  * don't have to be checked again here.
3468  */
3469 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3470 {
3471         if (!to_vmx(vcpu)->nested.vmxon) {
3472                 kvm_queue_exception(vcpu, UD_VECTOR);
3473                 return 0;
3474         }
3475
3476         if (vmx_get_cpl(vcpu)) {
3477                 kvm_inject_gp(vcpu, 0);
3478                 return 0;
3479         }
3480
3481         return 1;
3482 }
3483
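/*
 * A virtual interrupt is deliverable when RVI's priority class (bits 7:4) is
 * higher than vPPR's, e.g. RVI == 0x31 (class 3) beats vPPR == 0x20 (class 2).
 */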
3484 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3485 {
3486         u8 rvi = vmx_get_rvi();
3487         u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3488
3489         return ((rvi & 0xf0) > (vppr & 0xf0));
3490 }
3491
3492 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3493                                    struct vmcs12 *vmcs12);
3494
3495 /*
3496  * If from_vmentry is false, this is being called from state restore (either RSM
3497  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3498  *
3499  * Returns:
3500  *      NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3501  *      NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
3502  *      NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
3503  *      NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3504  */
3505 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3506                                                         bool from_vmentry)
3507 {
3508         struct vcpu_vmx *vmx = to_vmx(vcpu);
3509         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3510         enum vm_entry_failure_code entry_failure_code;
3511         bool evaluate_pending_interrupts;
3512         union vmx_exit_reason exit_reason = {
3513                 .basic = EXIT_REASON_INVALID_STATE,
3514                 .failed_vmentry = 1,
3515         };
3516         u32 failed_index;
3517
3518         trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
3519                                  vmx->nested.current_vmptr,
3520                                  vmcs12->guest_rip,
3521                                  vmcs12->guest_intr_status,
3522                                  vmcs12->vm_entry_intr_info_field,
3523                                  vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
3524                                  vmcs12->ept_pointer,
3525                                  vmcs12->guest_cr3,
3526                                  KVM_ISA_VMX);
3527
3528         kvm_service_local_tlb_flush_requests(vcpu);
3529
3530         evaluate_pending_interrupts = exec_controls_get(vmx) &
3531                 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3532         if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3533                 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3534         if (!evaluate_pending_interrupts)
3535                 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
3536
3537         if (!vmx->nested.nested_run_pending ||
3538             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3539                 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3540         if (kvm_mpx_supported() &&
3541             (!vmx->nested.nested_run_pending ||
3542              !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
3543                 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3544
3545         /*
3546          * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3547          * nested early checks are disabled.  In the event of a "late" VM-Fail,
3548          * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3549          * software model to the pre-VMEntry host state.  When EPT is disabled,
3550          * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3551          * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3552          * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3553          * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3554          * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3555          * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3556          * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3557          * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3558          * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3559          * path would need to manually save/restore vmcs01.GUEST_CR3.
3560          */
3561         if (!enable_ept && !nested_early_check)
3562                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3563
3564         vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3565
3566         prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3567
3568         if (from_vmentry) {
3569                 if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3570                         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3571                         return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3572                 }
3573
3574                 if (nested_vmx_check_vmentry_hw(vcpu)) {
3575                         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3576                         return NVMX_VMENTRY_VMFAIL;
3577                 }
3578
3579                 if (nested_vmx_check_guest_state(vcpu, vmcs12,
3580                                                  &entry_failure_code)) {
3581                         exit_reason.basic = EXIT_REASON_INVALID_STATE;
3582                         vmcs12->exit_qualification = entry_failure_code;
3583                         goto vmentry_fail_vmexit;
3584                 }
3585         }
3586
3587         enter_guest_mode(vcpu);
3588
3589         if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3590                 exit_reason.basic = EXIT_REASON_INVALID_STATE;
3591                 vmcs12->exit_qualification = entry_failure_code;
3592                 goto vmentry_fail_vmexit_guest_mode;
3593         }
3594
3595         if (from_vmentry) {
3596                 failed_index = nested_vmx_load_msr(vcpu,
3597                                                    vmcs12->vm_entry_msr_load_addr,
3598                                                    vmcs12->vm_entry_msr_load_count);
3599                 if (failed_index) {
3600                         exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3601                         vmcs12->exit_qualification = failed_index;
3602                         goto vmentry_fail_vmexit_guest_mode;
3603                 }
3604         } else {
3605                 /*
3606                  * The MMU is not initialized to point at the right entities yet and
3607                  * "get pages" would need to read data from the guest (i.e. we will
3608                  * need to perform gpa to hpa translation). Request a call
3609                  * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3610                  * have already been set at vmentry time and should not be reset.
3611                  */
3612                 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3613         }
3614
3615         /*
3616          * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
3617          * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
3618          * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
3619          * unconditionally.
3620          */
3621         if (unlikely(evaluate_pending_interrupts))
3622                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3623
3624         /*
3625          * Do not start the preemption timer hrtimer until after we know
3626          * we are successful, so that only nested_vmx_vmexit needs to cancel
3627          * the timer.
3628          */
3629         vmx->nested.preemption_timer_expired = false;
3630         if (nested_cpu_has_preemption_timer(vmcs12)) {
3631                 u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3632                 vmx_start_preemption_timer(vcpu, timer_value);
3633         }
3634
3635         /*
3636          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3637          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3638          * returned as far as L1 is concerned. It will only return (and set
3639          * the success flag) when L2 exits (see nested_vmx_vmexit()).
3640          */
3641         return NVMX_VMENTRY_SUCCESS;
3642
3643         /*
3644          * A failed consistency check that leads to a VMExit during L1's
3645          * VMEnter to L2 is a variation of a normal VMexit, as explained in
3646          * 26.7 "VM-entry failures during or after loading guest state".
3647          */
3648 vmentry_fail_vmexit_guest_mode:
3649         if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3650                 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3651         leave_guest_mode(vcpu);
3652
3653 vmentry_fail_vmexit:
3654         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3655
3656         if (!from_vmentry)
3657                 return NVMX_VMENTRY_VMEXIT;
3658
3659         load_vmcs12_host_state(vcpu, vmcs12);
3660         vmcs12->vm_exit_reason = exit_reason.full;
3661         if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
3662                 vmx->nested.need_vmcs12_to_shadow_sync = true;
3663         return NVMX_VMENTRY_VMEXIT;
3664 }
3665
3666 /*
3667  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3668  * for running an L2 nested guest.
3669  */
3670 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3671 {
3672         struct vmcs12 *vmcs12;
3673         enum nvmx_vmentry_status status;
3674         struct vcpu_vmx *vmx = to_vmx(vcpu);
3675         u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3676         enum nested_evmptrld_status evmptrld_status;
3677
3678         if (!nested_vmx_check_permission(vcpu))
3679                 return 1;
3680
3681         evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3682         if (evmptrld_status == EVMPTRLD_ERROR) {
3683                 kvm_queue_exception(vcpu, UD_VECTOR);
3684                 return 1;
3685         }
3686
3687         kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
3688
3689         if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
3690                 return nested_vmx_failInvalid(vcpu);
3691
3692         if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
3693                vmx->nested.current_vmptr == INVALID_GPA))
3694                 return nested_vmx_failInvalid(vcpu);
3695
3696         vmcs12 = get_vmcs12(vcpu);
3697
3698         /*
3699          * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3700          * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3701          * rather than RFLAGS.ZF, and no error number is stored to the
3702          * VM-instruction error field.
3703          */
3704         if (CC(vmcs12->hdr.shadow_vmcs))
3705                 return nested_vmx_failInvalid(vcpu);
3706
3707         if (nested_vmx_is_evmptr12_valid(vmx)) {
3708                 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
3709
3710                 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
3711                 /* Enlightened VMCS doesn't have launch state */
3712                 vmcs12->launch_state = !launch;
3713         } else if (enable_shadow_vmcs) {
3714                 copy_shadow_to_vmcs12(vmx);
3715         }
3716
3717         /*
3718          * The nested entry process starts with enforcing various prerequisites
3719          * on vmcs12 as required by the Intel SDM, and acts appropriately when
3720          * they fail: As the SDM explains, some conditions should cause the
3721          * instruction to fail, while others will cause the instruction to seem
3722          * to succeed, but return an EXIT_REASON_INVALID_STATE.
3723          * To speed up the normal (success) code path, we should avoid checking
3724          * for misconfigurations which will anyway be caught by the processor
3725          * when using the merged vmcs02.
3726          */
3727         if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3728                 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3729
3730         if (CC(vmcs12->launch_state == launch))
3731                 return nested_vmx_fail(vcpu,
3732                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3733                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3734
3735         if (nested_vmx_check_controls(vcpu, vmcs12))
3736                 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3737
3738         if (nested_vmx_check_address_space_size(vcpu, vmcs12))
3739                 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3740
3741         if (nested_vmx_check_host_state(vcpu, vmcs12))
3742                 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3743
3744         /*
3745          * We're finally done with prerequisite checking, and can start with
3746          * the nested entry.
3747          */
3748         vmx->nested.nested_run_pending = 1;
3749         vmx->nested.has_preemption_timer_deadline = false;
3750         status = nested_vmx_enter_non_root_mode(vcpu, true);
3751         if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3752                 goto vmentry_failed;
3753
3754         /* Emulate processing of posted interrupts on VM-Enter. */
3755         if (nested_cpu_has_posted_intr(vmcs12) &&
3756             kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3757                 vmx->nested.pi_pending = true;
3758                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3759                 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3760         }
3761
3762         /* Hide L1D cache contents from the nested guest.  */
3763         vmx->vcpu.arch.l1tf_flush_l1d = true;
3764
3765         /*
3766          * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3767          * also be used as part of restoring nVMX state for
3768          * snapshot restore (migration).
3769          *
3770          * In this flow, it is assumed that the vmcs12 cache was
3771          * transferred as part of the captured nVMX state and should
3772          * therefore not be read from guest memory (which may not yet
3773          * exist on the destination host).
3774          */
3775         nested_cache_shadow_vmcs12(vcpu, vmcs12);
3776
3777         switch (vmcs12->guest_activity_state) {
3778         case GUEST_ACTIVITY_HLT:
3779                 /*
3780                  * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3781                  * awakened by event injection or by an NMI-window VM-exit or
3782                  * by an interrupt-window VM-exit, halt the vcpu.
3783                  */
3784                 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3785                     !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3786                     !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3787                       (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3788                         vmx->nested.nested_run_pending = 0;
3789                         return kvm_emulate_halt_noskip(vcpu);
3790                 }
3791                 break;
3792         case GUEST_ACTIVITY_WAIT_SIPI:
3793                 vmx->nested.nested_run_pending = 0;
3794                 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
3795                 break;
3796         default:
3797                 break;
3798         }
3799
3800         return 1;
3801
3802 vmentry_failed:
3803         vmx->nested.nested_run_pending = 0;
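             /*
              * A KVM internal error exits to userspace (return 0).  If the
              * failed VM-Enter was morphed into a VM-Exit to L1, simply resume
              * the (now L1) guest.  Any other failure is emulated as a VMfail
              * with the "invalid control field" error.
              */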
3804         if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3805                 return 0;
3806         if (status == NVMX_VMENTRY_VMEXIT)
3807                 return 1;
3808         WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3809         return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3810 }
3811
3812 /*
3813  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3814  * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3815  * This function returns the new value we should put in vmcs12.guest_cr0.
3816  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3817  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3818  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3819  *     didn't trap the bit, because if L1 did, so would L0).
3820  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3821  *     been modified by L2, and L1 knows it. So just leave the old value of
3822  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3823  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3824  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3825  *     changed these bits, and therefore they need to be updated, but L0
3826  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3827  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3828  */
3829 static inline unsigned long
3830 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3831 {
3832         return
3833         /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3834         /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3835         /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3836                         vcpu->arch.cr0_guest_owned_bits));
3837 }
3838
3839 static inline unsigned long
3840 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3841 {
3842         return
3843         /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3844         /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3845         /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3846                         vcpu->arch.cr4_guest_owned_bits));
3847 }
3848
3849 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3850                                       struct vmcs12 *vmcs12,
3851                                       u32 vm_exit_reason, u32 exit_intr_info)
3852 {
3853         u32 idt_vectoring;
3854         unsigned int nr;
3855
3856         /*
3857          * Per the SDM, VM-Exits due to double and triple faults are never
3858          * considered to occur during event delivery, even if the double/triple
3859          * fault is the result of an escalating vectoring issue.
3860          *
3861          * Note, the SDM qualifies the double fault behavior with "The original
3862          * event results in a double-fault exception".  It's unclear why the
3863          * qualification exists since exits due to double fault can occur only
3864          * while vectoring a different exception (injected events are never
3865          * subject to interception), i.e. there's _always_ an original event.
3866          *
3867          * The SDM also uses NMI as a confusing example for the "original event
3868          * causes the VM exit directly" clause.  NMI isn't special in any way,
3869          * the same rule applies to all events that cause an exit directly.
3870          * NMI is an odd choice for the example because NMIs can only occur on
3871          * instruction boundaries, i.e. they _can't_ occur during vectoring.
3872          */
3873         if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
3874             ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
3875              is_double_fault(exit_intr_info))) {
3876                 vmcs12->idt_vectoring_info_field = 0;
3877         } else if (vcpu->arch.exception.injected) {
3878                 nr = vcpu->arch.exception.vector;
3879                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3880
3881                 if (kvm_exception_is_soft(nr)) {
3882                         vmcs12->vm_exit_instruction_len =
3883                                 vcpu->arch.event_exit_inst_len;
3884                         idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3885                 } else
3886                         idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3887
3888                 if (vcpu->arch.exception.has_error_code) {
3889                         idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3890                         vmcs12->idt_vectoring_error_code =
3891                                 vcpu->arch.exception.error_code;
3892                 }
3893
3894                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3895         } else if (vcpu->arch.nmi_injected) {
3896                 vmcs12->idt_vectoring_info_field =
3897                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3898         } else if (vcpu->arch.interrupt.injected) {
3899                 nr = vcpu->arch.interrupt.nr;
3900                 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3901
3902                 if (vcpu->arch.interrupt.soft) {
3903                         idt_vectoring |= INTR_TYPE_SOFT_INTR;
3904                         vmcs12->vm_entry_instruction_len =
3905                                 vcpu->arch.event_exit_inst_len;
3906                 } else
3907                         idt_vectoring |= INTR_TYPE_EXT_INTR;
3908
3909                 vmcs12->idt_vectoring_info_field = idt_vectoring;
3910         } else {
3911                 vmcs12->idt_vectoring_info_field = 0;
3912         }
3913 }
3914
3915
3916 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3917 {
3918         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3919         gfn_t gfn;
3920
3921         /*
3922          * Don't need to mark the APIC access page dirty; it is never
3923          * written to by the CPU during APIC virtualization.
3924          */
3925
3926         if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3927                 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3928                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3929         }
3930
3931         if (nested_cpu_has_posted_intr(vmcs12)) {
3932                 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3933                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3934         }
3935 }
3936
3937 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3938 {
3939         struct vcpu_vmx *vmx = to_vmx(vcpu);
3940         int max_irr;
3941         void *vapic_page;
3942         u16 status;
3943
3944         if (!vmx->nested.pi_pending)
3945                 return 0;
3946
3947         if (!vmx->nested.pi_desc)
3948                 goto mmio_needed;
3949
3950         vmx->nested.pi_pending = false;
3951
3952         if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3953                 return 0;
3954
3955         max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
3956         if (max_irr > 0) {
3957                 vapic_page = vmx->nested.virtual_apic_map.hva;
3958                 if (!vapic_page)
3959                         goto mmio_needed;
3960
3961                 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3962                         vapic_page, &max_irr);
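                     /*
                      * RVI is the low byte of GUEST_INTR_STATUS; bump it if the
                      * highest vector pending in the PIR outranks the currently
                      * requested virtual interrupt.
                      */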
3963                 status = vmcs_read16(GUEST_INTR_STATUS);
3964                 if ((u8)max_irr > ((u8)status & 0xff)) {
3965                         status &= ~0xff;
3966                         status |= (u8)max_irr;
3967                         vmcs_write16(GUEST_INTR_STATUS, status);
3968                 }
3969         }
3970
3971         nested_mark_vmcs12_pages_dirty(vcpu);
3972         return 0;
3973
3974 mmio_needed:
3975         kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3976         return -ENXIO;
3977 }
3978
3979 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
3980 {
3981         struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
3982         u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
3983         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3984         unsigned long exit_qual;
3985
3986         if (ex->has_payload) {
3987                 exit_qual = ex->payload;
3988         } else if (ex->vector == PF_VECTOR) {
3989                 exit_qual = vcpu->arch.cr2;
3990         } else if (ex->vector == DB_VECTOR) {
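                     /*
                      * Strip DR6.BT, which isn't reported in the exit
                      * qualification, and flip the "active low" DR6 bits to
                      * convert the DR6 value to the exit qualification format.
                      */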
3991                 exit_qual = vcpu->arch.dr6;
3992                 exit_qual &= ~DR6_BT;
3993                 exit_qual ^= DR6_ACTIVE_LOW;
3994         } else {
3995                 exit_qual = 0;
3996         }
3997
3998         /*
3999          * Unlike AMD's Paged Real Mode, which reports an error code on #PF
4000          * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
4001          * "has error code" flags on VM-Exit if the CPU is in Real Mode.
4002          */
4003         if (ex->has_error_code && is_protmode(vcpu)) {
4004                 /*
4005                  * Intel CPUs do not generate error codes with bits 31:16 set,
4006                  * and more importantly VMX disallows setting bits 31:16 in the
4007                  * injected error code for VM-Entry.  Drop the bits to mimic
4008                  * hardware and avoid inducing failure on nested VM-Entry if L1
4009                  * chooses to inject the exception back to L2.  AMD CPUs _do_
4010                  * generate "full" 32-bit error codes, so KVM allows userspace
4011                  * to inject exception error codes with bits 31:16 set.
4012                  */
4013                 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
4014                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
4015         }
4016
4017         if (kvm_exception_is_soft(ex->vector))
4018                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
4019         else
4020                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
4021
4022         if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
4023             vmx_get_nmi_mask(vcpu))
4024                 intr_info |= INTR_INFO_UNBLOCK_NMI;
4025
4026         nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
4027 }
4028
4029 /*
4030  * Returns the payload (to-be-DR6) if a debug trap is (likely) pending delivery,
4031  * else 0.  Infer the class of a #DB (trap-like vs. fault-like) from the payload.
4032  * Using the payload is flawed because code breakpoints (fault-like) and data
4033  * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
4034  * this will return false positives if a to-be-injected code breakpoint #DB is
4035  * pending (from KVM's perspective, but not "pending" across an instruction
4036  * boundary).  ICEBP, a.k.a. INT1, is also not reflected here even though it
4037  * too is trap-like.
4038  *
4039  * KVM "works" despite these flaws as ICEBP isn't currently supported by the
4040  * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
4041  * #DB has already happened), and MTF isn't marked pending on code breakpoints
4042  * from the emulator (because such #DBs are fault-like and thus don't trigger
4043  * actions that fire on instruction retire).
4044  */
4045 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
4046 {
4047         if (!ex->pending || ex->vector != DB_VECTOR)
4048                 return 0;
4049
4050         /* General Detect #DBs are always fault-like. */
4051         return ex->payload & ~DR6_BD;
4052 }
4053
4054 /*
4055  * Returns true if there's a pending #DB exception that is lower priority than
4056  * a pending Monitor Trap Flag VM-Exit.  TSS T-flag #DBs are not emulated by
4057  * KVM, but could theoretically be injected by userspace.  Note, this code is
4058  * imperfect, see above.
4059  */
4060 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
4061 {
4062         return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
4063 }
4064
4065 /*
4066  * Certain VM-exits set the 'pending debug exceptions' field to indicate a
4067  * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
4068  * represents these debug traps with a payload that is said to be compatible
4069  * with the 'pending debug exceptions' field, write the payload to the VMCS
4070  * field if a VM-exit is delivered before the debug trap.
4071  */
4072 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
4073 {
4074         unsigned long pending_dbg;
4075
4076         pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
4077         if (pending_dbg)
4078                 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
4079 }
4080
4081 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
4082 {
4083         return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
4084                to_vmx(vcpu)->nested.preemption_timer_expired;
4085 }
4086
4087 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
4088 {
4089         struct vcpu_vmx *vmx = to_vmx(vcpu);
4090         void *vapic = vmx->nested.virtual_apic_map.hva;
4091         int max_irr, vppr;
4092
4093         if (nested_vmx_preemption_timer_pending(vcpu) ||
4094             vmx->nested.mtf_pending)
4095                 return true;
4096
4097         /*
4098          * Virtual Interrupt Delivery doesn't require manual injection.  Either
4099          * the interrupt is already in GUEST_RVI and will be recognized by the CPU
4100          * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
4101          * the interrupt from the PIR to RVI prior to entering the guest.
4102          */
4103         if (for_injection)
4104                 return false;
4105
4106         if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4107             __vmx_interrupt_blocked(vcpu))
4108                 return false;
4109
4110         if (!vapic)
4111                 return false;
4112
4113         vppr = *((u32 *)(vapic + APIC_PROCPRI));
4114
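             /*
              * A virtual interrupt is deliverable only if its priority class
              * (vector bits 7:4) is higher than the class of the virtual PPR.
              */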
4115         max_irr = vmx_get_rvi();
4116         if ((max_irr & 0xf0) > (vppr & 0xf0))
4117                 return true;
4118
4119         if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
4120             pi_test_on(vmx->nested.pi_desc)) {
4121                 max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
4122                 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
4123                         return true;
4124         }
4125
4126         return false;
4127 }
4128
4129 /*
4130  * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
4131  * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
4132  * and less minor edits to splice in the priority of VMX Non-Root specific
4133  * events, e.g. MTF and NMI/INTR-window exiting.
4134  *
4135  * 1 Hardware Reset and Machine Checks
4136  *      - RESET
4137  *      - Machine Check
4138  *
4139  * 2 Trap on Task Switch
4140  *      - T flag in TSS is set (on task switch)
4141  *
4142  * 3 External Hardware Interventions
4143  *      - FLUSH
4144  *      - STOPCLK
4145  *      - SMI
4146  *      - INIT
4147  *
4148  * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
4149  *
4150  * 4 Traps on Previous Instruction
4151  *      - Breakpoints
4152  *      - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
4153  *        breakpoint, or #DB due to a split-lock access)
4154  *
4155  * 4.3  VMX-preemption timer expired VM-exit[2]
4156  *
4157  * 4.6  NMI-window exiting VM-exit[3]
4158  *
4159  * 5 Nonmaskable Interrupts (NMI)
4160  *
4161  * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
4162  *
4163  * 6 Maskable Hardware Interrupts
4164  *
4165  * 7 Code Breakpoint Fault
4166  *
4167  * 8 Faults from Fetching Next Instruction
4168  *      - Code-Segment Limit Violation
4169  *      - Code Page Fault
4170  *      - Control protection exception (missing ENDBRANCH at target of indirect
4171  *                                      call or jump)
4172  *
4173  * 9 Faults from Decoding Next Instruction
4174  *      - Instruction length > 15 bytes
4175  *      - Invalid Opcode
4176  *      - Coprocessor Not Available
4177  *
4178  *10 Faults on Executing Instruction
4179  *      - Overflow
4180  *      - Bound error
4181  *      - Invalid TSS
4182  *      - Segment Not Present
4183  *      - Stack fault
4184  *      - General Protection
4185  *      - Data Page Fault
4186  *      - Alignment Check
4187  *      - x86 FPU Floating-point exception
4188  *      - SIMD floating-point exception
4189  *      - Virtualization exception
4190  *      - Control protection exception
4191  *
4192  * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
4193  *     INIT signals, and higher priority events take priority over MTF VM exits.
4194  *     MTF VM exits take priority over debug-trap exceptions and lower priority
4195  *     events.
4196  *
4197  * [2] Debug-trap exceptions and higher priority events take priority over VM exits
4198  *     caused by the VMX-preemption timer.  VM exits caused by the VMX-preemption
4199  *     timer take priority over VM exits caused by the "NMI-window exiting"
4200  *     VM-execution control and lower priority events.
4201  *
4202  * [3] Debug-trap exceptions and higher priority events take priority over VM exits
4203  *     caused by "NMI-window exiting".  VM exits caused by this control take
4204  *     priority over non-maskable interrupts (NMIs) and lower priority events.
4205  *
4206  * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
4207  *     the 1-setting of the "interrupt-window exiting" VM-execution control.  Thus,
4208  *     non-maskable interrupts (NMIs) and higher priority events take priority over
4209  *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
4210  *     priority over external interrupts and lower priority events.
4211  */
4212 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
4213 {
4214         struct kvm_lapic *apic = vcpu->arch.apic;
4215         struct vcpu_vmx *vmx = to_vmx(vcpu);
4216         /*
4217          * Only a pending nested run blocks a pending exception.  If there is a
4218          * previously injected event, the pending exception occurred while said
4219          * event was being delivered and thus needs to be handled.
4220          */
4221         bool block_nested_exceptions = vmx->nested.nested_run_pending;
4222         /*
4223          * New events (not exceptions) are only recognized at instruction
4224          * boundaries.  If an event needs reinjection, then KVM is handling a
4225          * VM-Exit that occurred _during_ instruction execution; new events are
4226          * blocked until the instruction completes.
4227          */
4228         bool block_nested_events = block_nested_exceptions ||
4229                                    kvm_event_needs_reinjection(vcpu);
4230
4231         if (lapic_in_kernel(vcpu) &&
4232                 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
4233                 if (block_nested_events)
4234                         return -EBUSY;
4235                 nested_vmx_update_pending_dbg(vcpu);
4236                 clear_bit(KVM_APIC_INIT, &apic->pending_events);
4237                 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
4238                         nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
4239
4240                 /* MTF is discarded if the vCPU is in WFS. */
4241                 vmx->nested.mtf_pending = false;
4242                 return 0;
4243         }
4244
4245         if (lapic_in_kernel(vcpu) &&
4246             test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
4247                 if (block_nested_events)
4248                         return -EBUSY;
4249
4250                 clear_bit(KVM_APIC_SIPI, &apic->pending_events);
4251                 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
4252                         nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
4253                                                 apic->sipi_vector & 0xFFUL);
4254                         return 0;
4255                 }
4256                 /* Fallthrough, the SIPI is completely ignored. */
4257         }
4258
4259         /*
4260          * Process exceptions that are higher priority than Monitor Trap Flag:
4261          * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
4262          * could theoretically come in from userspace), and ICEBP (INT1).
4263          *
4264          * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
4265          * for TSS T flag #DBs).  KVM also doesn't save/restore pending MTF
4266          * across SMI/RSM as it should; that needs to be addressed in order to
4267          * prioritize SMI over MTF and trap-like #DBs.
4268          */
4269         if (vcpu->arch.exception_vmexit.pending &&
4270             !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
4271                 if (block_nested_exceptions)
4272                         return -EBUSY;
4273
4274                 nested_vmx_inject_exception_vmexit(vcpu);
4275                 return 0;
4276         }
4277
4278         if (vcpu->arch.exception.pending &&
4279             !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
4280                 if (block_nested_exceptions)
4281                         return -EBUSY;
4282                 goto no_vmexit;
4283         }
4284
4285         if (vmx->nested.mtf_pending) {
4286                 if (block_nested_events)
4287                         return -EBUSY;
4288                 nested_vmx_update_pending_dbg(vcpu);
4289                 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
4290                 return 0;
4291         }
4292
4293         if (vcpu->arch.exception_vmexit.pending) {
4294                 if (block_nested_exceptions)
4295                         return -EBUSY;
4296
4297                 nested_vmx_inject_exception_vmexit(vcpu);
4298                 return 0;
4299         }
4300
4301         if (vcpu->arch.exception.pending) {
4302                 if (block_nested_exceptions)
4303                         return -EBUSY;
4304                 goto no_vmexit;
4305         }
4306
4307         if (nested_vmx_preemption_timer_pending(vcpu)) {
4308                 if (block_nested_events)
4309                         return -EBUSY;
4310                 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
4311                 return 0;
4312         }
4313
4314         if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
4315                 if (block_nested_events)
4316                         return -EBUSY;
4317                 goto no_vmexit;
4318         }
4319
4320         if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
4321                 if (block_nested_events)
4322                         return -EBUSY;
4323                 if (!nested_exit_on_nmi(vcpu))
4324                         goto no_vmexit;
4325
4326                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
4327                                   NMI_VECTOR | INTR_TYPE_NMI_INTR |
4328                                   INTR_INFO_VALID_MASK, 0);
4329                 /*
4330                  * The NMI-triggered VM exit counts as injection:
4331                  * clear this one and block further NMIs.
4332                  */
4333                 vcpu->arch.nmi_pending = 0;
4334                 vmx_set_nmi_mask(vcpu, true);
4335                 return 0;
4336         }
4337
4338         if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
4339                 int irq;
4340
4341                 if (block_nested_events)
4342                         return -EBUSY;
4343                 if (!nested_exit_on_intr(vcpu))
4344                         goto no_vmexit;
4345
4346                 if (!nested_exit_intr_ack_set(vcpu)) {
4347                         nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
4348                         return 0;
4349                 }
4350
4351                 irq = kvm_cpu_get_extint(vcpu);
4352                 if (irq != -1) {
4353                         nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4354                                           INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4355                         return 0;
4356                 }
4357
4358                 irq = kvm_apic_has_interrupt(vcpu);
4359                 if (WARN_ON_ONCE(irq < 0))
4360                         goto no_vmexit;
4361
4362                 /*
4363                  * If the IRQ is L2's PI notification vector, process posted
4364                  * interrupts for L2 instead of injecting VM-Exit, as the
4365                  * detection/morphing architecturally occurs when the IRQ is
4366                  * delivered to the CPU.  Note, only interrupts that are routed
4367                  * through the local APIC trigger posted interrupt processing,
4368                  * and enabling posted interrupts requires ACK-on-exit.
4369                  */
4370                 if (irq == vmx->nested.posted_intr_nv) {
4371                         vmx->nested.pi_pending = true;
4372                         kvm_apic_clear_irr(vcpu, irq);
4373                         goto no_vmexit;
4374                 }
4375
4376                 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
4377                                   INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
4378
4379                 /*
4380                  * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
4381                  * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
4382                  * if APICv is active.
4383                  */
4384                 kvm_apic_ack_interrupt(vcpu, irq);
4385                 return 0;
4386         }
4387
4388 no_vmexit:
4389         return vmx_complete_nested_posted_interrupt(vcpu);
4390 }
4391
4392 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
4393 {
4394         ktime_t remaining =
4395                 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
4396         u64 value;
4397
4398         if (ktime_to_ns(remaining) <= 0)
4399                 return 0;
4400
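             /*
              * Convert the remaining time to guest TSC ticks (ns * kHz / 1e6),
              * then to preemption timer units, which tick once every
              * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles.
              */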
4401         value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
4402         do_div(value, 1000000);
4403         return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
4404 }
4405
4406 static bool is_vmcs12_ext_field(unsigned long field)
4407 {
4408         switch (field) {
4409         case GUEST_ES_SELECTOR:
4410         case GUEST_CS_SELECTOR:
4411         case GUEST_SS_SELECTOR:
4412         case GUEST_DS_SELECTOR:
4413         case GUEST_FS_SELECTOR:
4414         case GUEST_GS_SELECTOR:
4415         case GUEST_LDTR_SELECTOR:
4416         case GUEST_TR_SELECTOR:
4417         case GUEST_ES_LIMIT:
4418         case GUEST_CS_LIMIT:
4419         case GUEST_SS_LIMIT:
4420         case GUEST_DS_LIMIT:
4421         case GUEST_FS_LIMIT:
4422         case GUEST_GS_LIMIT:
4423         case GUEST_LDTR_LIMIT:
4424         case GUEST_TR_LIMIT:
4425         case GUEST_GDTR_LIMIT:
4426         case GUEST_IDTR_LIMIT:
4427         case GUEST_ES_AR_BYTES:
4428         case GUEST_DS_AR_BYTES:
4429         case GUEST_FS_AR_BYTES:
4430         case GUEST_GS_AR_BYTES:
4431         case GUEST_LDTR_AR_BYTES:
4432         case GUEST_TR_AR_BYTES:
4433         case GUEST_ES_BASE:
4434         case GUEST_CS_BASE:
4435         case GUEST_SS_BASE:
4436         case GUEST_DS_BASE:
4437         case GUEST_FS_BASE:
4438         case GUEST_GS_BASE:
4439         case GUEST_LDTR_BASE:
4440         case GUEST_TR_BASE:
4441         case GUEST_GDTR_BASE:
4442         case GUEST_IDTR_BASE:
4443         case GUEST_PENDING_DBG_EXCEPTIONS:
4444         case GUEST_BNDCFGS:
4445                 return true;
4446         default:
4447                 break;
4448         }
4449
4450         return false;
4451 }
4452
4453 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4454                                        struct vmcs12 *vmcs12)
4455 {
4456         struct vcpu_vmx *vmx = to_vmx(vcpu);
4457
4458         vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4459         vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4460         vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4461         vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4462         vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4463         vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4464         vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4465         vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4466         vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4467         vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4468         vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4469         vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4470         vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4471         vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4472         vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4473         vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4474         vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4475         vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4476         vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4477         vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4478         vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4479         vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4480         vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4481         vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4482         vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4483         vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4484         vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4485         vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4486         vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4487         vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4488         vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4489         vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4490         vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4491         vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4492         vmcs12->guest_pending_dbg_exceptions =
4493                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4494
4495         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4496 }
4497
4498 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4499                                        struct vmcs12 *vmcs12)
4500 {
4501         struct vcpu_vmx *vmx = to_vmx(vcpu);
4502         int cpu;
4503
4504         if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4505                 return;
4506
4507
4508         WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4509
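             /*
              * The rarely-synced guest fields are up to date only in vmcs02 at
              * this point; temporarily make vmcs02 the active VMCS so they can
              * be read, then switch back to vmcs01.
              */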
4510         cpu = get_cpu();
4511         vmx->loaded_vmcs = &vmx->nested.vmcs02;
4512         vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
4513
4514         sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4515
4516         vmx->loaded_vmcs = &vmx->vmcs01;
4517         vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
4518         put_cpu();
4519 }
4520
4521 /*
4522  * Update the guest state fields of vmcs12 to reflect changes that
4523  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4524  * VM-entry controls is also updated, since this is really a guest
4525  * state bit.)
4526  */
4527 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4528 {
4529         struct vcpu_vmx *vmx = to_vmx(vcpu);
4530
4531         if (nested_vmx_is_evmptr12_valid(vmx))
4532                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4533
4534         vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4535                 !nested_vmx_is_evmptr12_valid(vmx);
4536
4537         vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4538         vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4539
4540         vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4541         vmcs12->guest_rip = kvm_rip_read(vcpu);
4542         vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4543
4544         vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4545         vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4546
4547         vmcs12->guest_interruptibility_info =
4548                 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4549
4550         if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4551                 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4552         else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4553                 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4554         else
4555                 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4556
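             /*
              * Snapshot the remaining preemption timer value only if L1 both
              * enabled the timer and asked for it to be saved on VM-Exit, and
              * only if a nested VM-Enter isn't still pending (in which case
              * vmcs12 still holds the value programmed by L1).
              */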
4557         if (nested_cpu_has_preemption_timer(vmcs12) &&
4558             vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4559             !vmx->nested.nested_run_pending)
4560                 vmcs12->vmx_preemption_timer_value =
4561                         vmx_get_preemption_timer_value(vcpu);
4562
4563         /*
4564          * In some cases (usually, nested EPT), L2 is allowed to change its
4565          * own CR3 without exiting. If it has changed it, we must keep it.
4566          * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4567          * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4568          *
4569          * Additionally, restore L2's PDPTR to vmcs12.
4570          */
4571         if (enable_ept) {
4572                 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4573                 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4574                         vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4575                         vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4576                         vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4577                         vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4578                 }
4579         }
4580
4581         vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4582
4583         if (nested_cpu_has_vid(vmcs12))
4584                 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4585
4586         vmcs12->vm_entry_controls =
4587                 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4588                 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4589
4590         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4591                 vmcs12->guest_dr7 = vcpu->arch.dr7;
4592
4593         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4594                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
4595 }
4596
4597 /*
4598  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4599  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4600  * and this function updates it to reflect the changes to the guest state while
4601  * L2 was running (and perhaps made some exits which were handled directly by L0
4602  * without going back to L1), and to reflect the exit reason.
4603  * Note that we do not have to copy here all VMCS fields, just those that
4604  * could have been changed by the L2 guest or the exit - i.e., the guest-state and
4605  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4606  * which already writes to vmcs12 directly.
4607  */
4608 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4609                            u32 vm_exit_reason, u32 exit_intr_info,
4610                            unsigned long exit_qualification)
4611 {
4612         /* update exit information fields: */
4613         vmcs12->vm_exit_reason = vm_exit_reason;
4614         if (to_vmx(vcpu)->exit_reason.enclave_mode)
4615                 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4616         vmcs12->exit_qualification = exit_qualification;
4617
4618         /*
4619          * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
4620          * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
4621          * exit info fields are unmodified.
4622          */
4623         if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4624                 vmcs12->launch_state = 1;
4625
4626                 /* vm_entry_intr_info_field is cleared on exit. Emulate this
4627                  * instead of reading the real value. */
4628                 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4629
4630                 /*
4631                  * Transfer the event that L0 or L1 may have wanted to inject into
4632                  * L2 to IDT_VECTORING_INFO_FIELD.
4633                  */
4634                 vmcs12_save_pending_event(vcpu, vmcs12,
4635                                           vm_exit_reason, exit_intr_info);
4636
4637                 vmcs12->vm_exit_intr_info = exit_intr_info;
4638                 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4639                 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4640
4641                 /*
4642                  * According to spec, there's no need to store the guest's
4643                  * MSRs if the exit is due to a VM-entry failure that occurs
4644                  * during or after loading the guest state. Since this exit
4645                  * does not fall in that category, we need to save the MSRs.
4646                  */
4647                 if (nested_vmx_store_msr(vcpu,
4648                                          vmcs12->vm_exit_msr_store_addr,
4649                                          vmcs12->vm_exit_msr_store_count))
4650                         nested_vmx_abort(vcpu,
4651                                          VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4652         }
4653 }
4654
4655 /*
4656  * A part of what we need to do when the nested L2 guest exits and we want to
4657  * run its L1 parent is to reset L1's guest state to the host state specified
4658  * in vmcs12.
4659  * This function is to be called not only on normal nested exit, but also on
4660  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4661  * Failures During or After Loading Guest State").
4662  * This function should be called when the active VMCS is L1's (vmcs01).
4663  */
4664 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4665                                    struct vmcs12 *vmcs12)
4666 {
4667         enum vm_entry_failure_code ignored;
4668         struct kvm_segment seg;
4669
4670         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4671                 vcpu->arch.efer = vmcs12->host_ia32_efer;
4672         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4673                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4674         else
4675                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4676         vmx_set_efer(vcpu, vcpu->arch.efer);
4677
4678         kvm_rsp_write(vcpu, vmcs12->host_rsp);
4679         kvm_rip_write(vcpu, vmcs12->host_rip);
4680         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4681         vmx_set_interrupt_shadow(vcpu, 0);
4682
4683         /*
4684          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4685          * actually changed, because vmx_set_cr0 refers to efer set above.
4686          *
4687          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4688          * (KVM doesn't change it).
4689          */
4690         vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4691         vmx_set_cr0(vcpu, vmcs12->host_cr0);
4692
4693         /* Same as above - no reason to call set_cr4_guest_host_mask().  */
4694         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4695         vmx_set_cr4(vcpu, vmcs12->host_cr4);
4696
4697         nested_ept_uninit_mmu_context(vcpu);
4698
4699         /*
4700          * Only PDPTE load can fail as the value of cr3 was checked on entry and
4701          * couldn't have changed.
4702          */
4703         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4704                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4705
4706         nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4707
4708         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4709         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4710         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4711         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4712         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
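             /* Per the SDM, the GDTR and IDTR limits are each set to 0xFFFF on VM-Exit. */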
4713         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4714         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4715
4716         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
4717         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4718                 vmcs_write64(GUEST_BNDCFGS, 0);
4719
4720         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4721                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4722                 vcpu->arch.pat = vmcs12->host_ia32_pat;
4723         }
4724         if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
4725             kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
4726                 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4727                                          vmcs12->host_ia32_perf_global_ctrl));
4728
4729         /* Set L1 segment info according to Intel SDM 27.5.2, "Loading Host
4730          * Segment and Descriptor-Table Registers". */
4731         seg = (struct kvm_segment) {
4732                 .base = 0,
4733                 .limit = 0xFFFFFFFF,
4734                 .selector = vmcs12->host_cs_selector,
4735                 .type = 11,
4736                 .present = 1,
4737                 .s = 1,
4738                 .g = 1
4739         };
4740         if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4741                 seg.l = 1;
4742         else
4743                 seg.db = 1;
4744         __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4745         seg = (struct kvm_segment) {
4746                 .base = 0,
4747                 .limit = 0xFFFFFFFF,
4748                 .type = 3,
4749                 .present = 1,
4750                 .s = 1,
4751                 .db = 1,
4752                 .g = 1
4753         };
4754         seg.selector = vmcs12->host_ds_selector;
4755         __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4756         seg.selector = vmcs12->host_es_selector;
4757         __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4758         seg.selector = vmcs12->host_ss_selector;
4759         __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4760         seg.selector = vmcs12->host_fs_selector;
4761         seg.base = vmcs12->host_fs_base;
4762         __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4763         seg.selector = vmcs12->host_gs_selector;
4764         seg.base = vmcs12->host_gs_base;
4765         __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4766         seg = (struct kvm_segment) {
4767                 .base = vmcs12->host_tr_base,
4768                 .limit = 0x67,
4769                 .selector = vmcs12->host_tr_selector,
4770                 .type = 11,
4771                 .present = 1
4772         };
4773         __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4774
4775         memset(&seg, 0, sizeof(seg));
4776         seg.unusable = 1;
4777         __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
4778
4779         kvm_set_dr(vcpu, 7, 0x400);
4780         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4781
4782         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4783                                 vmcs12->vm_exit_msr_load_count))
4784                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4785
4786         to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
4787 }
4788
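     /*
      * Reconstruct vmcs01's guest EFER: depending on how vmcs01 was configured,
      * the value lives in the VMCS itself, in the VM-Entry MSR-load list, in
      * KVM's user-return MSR slot, or is simply the host's EFER.
      */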
4789 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4790 {
4791         struct vmx_uret_msr *efer_msr;
4792         unsigned int i;
4793
4794         if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4795                 return vmcs_read64(GUEST_IA32_EFER);
4796
4797         if (cpu_has_load_ia32_efer())
4798                 return kvm_host.efer;
4799
4800         for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4801                 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4802                         return vmx->msr_autoload.guest.val[i].value;
4803         }
4804
4805         efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4806         if (efer_msr)
4807                 return efer_msr->data;
4808
4809         return kvm_host.efer;
4810 }
4811
4812 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4813 {
4814         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4815         struct vcpu_vmx *vmx = to_vmx(vcpu);
4816         struct vmx_msr_entry g, h;
4817         gpa_t gpa;
4818         u32 i, j;
4819
4820         vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4821
4822         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4823                 /*
4824                  * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4825                  * as vmcs01.GUEST_DR7 contains a userspace defined value
4826                  * and vcpu->arch.dr7 is not squirreled away before the
4827                  * nested VMENTER (not worth adding a variable in nested_vmx).
4828                  */
4829                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4830                         kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4831                 else
4832                         WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4833         }
4834
4835         /*
4836          * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4837          * handle a variety of side effects to KVM's software model.
4838          */
4839         vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4840
4841         vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4842         vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4843
4844         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4845         vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4846
4847         nested_ept_uninit_mmu_context(vcpu);
4848         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4849         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4850
4851         /*
4852          * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4853          * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4854          * VMFail; like everything else, we just need to ensure our
4855          * software model is up-to-date.
4856          */
4857         if (enable_ept && is_pae_paging(vcpu))
4858                 ept_save_pdptrs(vcpu);
4859
4860         kvm_mmu_reset_context(vcpu);
4861
4862         /*
4863          * This nasty bit of open coding is a compromise between blindly
4864          * loading L1's MSRs using the exit load lists (incorrect emulation
4865          * of VMFail), leaving the nested VM's MSRs in the software model
4866          * (incorrect behavior) and snapshotting the modified MSRs (too
4867          * expensive since the lists are unbounded by hardware).  For each
4868          * MSR that was (prematurely) loaded from the nested VMEntry load
4869          * list, reload it from the exit load list if it exists and differs
4870          * from the guest value.  The intent is to stuff host state as
4871          * silently as possible, not to fully process the exit load list.
4872          */
4873         for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4874                 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4875                 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4876                         pr_debug_ratelimited(
4877                                 "%s read MSR index failed (%u, 0x%08llx)\n",
4878                                 __func__, i, gpa);
4879                         goto vmabort;
4880                 }
4881
4882                 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4883                         gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4884                         if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4885                                 pr_debug_ratelimited(
4886                                         "%s read MSR failed (%u, 0x%08llx)\n",
4887                                         __func__, j, gpa);
4888                                 goto vmabort;
4889                         }
4890                         if (h.index != g.index)
4891                                 continue;
4892                         if (h.value == g.value)
4893                                 break;
4894
4895                         if (nested_vmx_load_msr_check(vcpu, &h)) {
4896                                 pr_debug_ratelimited(
4897                                         "%s check failed (%u, 0x%x, 0x%x)\n",
4898                                         __func__, j, h.index, h.reserved);
4899                                 goto vmabort;
4900                         }
4901
4902                         if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
4903                                 pr_debug_ratelimited(
4904                                         "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4905                                         __func__, j, h.index, h.value);
4906                                 goto vmabort;
4907                         }
4908                 }
4909         }
4910
4911         return;
4912
4913 vmabort:
4914         nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4915 }
4916
4917 /*
4918  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4919  * and modify vmcs12 to make it see what it would expect to see there if
4920  * L2 was its real guest. Must only be called when in L2 (is_guest_mode()).
4921  */
4922 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
4923                        u32 exit_intr_info, unsigned long exit_qualification)
4924 {
4925         struct vcpu_vmx *vmx = to_vmx(vcpu);
4926         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4927
4928         /* Pending MTF traps are discarded on VM-Exit. */
4929         vmx->nested.mtf_pending = false;
4930
4931         /* trying to cancel vmlaunch/vmresume is a bug */
4932         WARN_ON_ONCE(vmx->nested.nested_run_pending);
4933
4934 #ifdef CONFIG_KVM_HYPERV
4935         if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4936                 /*
4937                  * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4938                  * Enlightened VMCS after migration and we still need to
4939                  * do that when something is forcing L2->L1 exit prior to
4940                  * the first L2 run.
4941                  */
4942                 (void)nested_get_evmcs_page(vcpu);
4943         }
4944 #endif
4945
4946         /* Service pending TLB flush requests for L2 before switching to L1. */
4947         kvm_service_local_tlb_flush_requests(vcpu);
4948
4949         /*
4950          * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4951          * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
4952          * up-to-date before switching to L1.
4953          */
4954         if (enable_ept && is_pae_paging(vcpu))
4955                 vmx_ept_load_pdptrs(vcpu);
4956
4957         leave_guest_mode(vcpu);
4958
4959         if (nested_cpu_has_preemption_timer(vmcs12))
4960                 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4961
4962         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
4963                 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
4964                 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
4965                         vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
4966         }
4967
4968         if (likely(!vmx->fail)) {
4969                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4970
4971                 if (vm_exit_reason != -1)
4972                         prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4973                                        exit_intr_info, exit_qualification);
4974
4975                 /*
4976                  * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4977                  * also be used to capture the vmcs12 cache as part of
4978                  * capturing nVMX state for snapshot (migration).
4979                  *
4980                  * Otherwise, this flush will dirty guest memory at a
4981                  * point it is already assumed by user-space to be
4982                  * immutable.
4983                  */
4984                 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4985         } else {
4986                 /*
4987                  * The only expected VM-instruction error is "VM entry with
4988                  * invalid control field(s)." Anything else indicates a
4989                  * problem with L0.  And we should never get here with a
4990                  * VMFail of any type if early consistency checks are enabled.
4991                  */
4992                 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4993                              VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4994                 WARN_ON_ONCE(nested_early_check);
4995         }
4996
4997         /*
4998          * Drop events/exceptions that were queued for re-injection to L2
4999          * (picked up via vmx_complete_interrupts()), as well as exceptions
5000          * that were pending for L2.  Note, this must NOT be hoisted above
5001          * prepare_vmcs12(), events/exceptions queued for re-injection need to
5002          * be captured in vmcs12 (see vmcs12_save_pending_event()).
5003          */
5004         vcpu->arch.nmi_injected = false;
5005         kvm_clear_exception_queue(vcpu);
5006         kvm_clear_interrupt_queue(vcpu);
5007
5008         vmx_switch_vmcs(vcpu, &vmx->vmcs01);
5009
5010         /*
5011          * If IBRS is advertised to the vCPU, KVM must flush the indirect
5012          * branch predictors when transitioning from L2 to L1, as L1 expects
5013          * hardware (KVM in this case) to provide separate predictor modes.
5014          * Bare metal isolates VMX root (host) from VMX non-root (guest), but
5015          * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
5016          * separate modes for L2 vs L1.
5017          */
5018         if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
5019                 indirect_branch_prediction_barrier();
5020
5021         /* Update any VMCS fields that might have changed while L2 ran */
5022         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
5023         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
5024         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
5025         if (kvm_caps.has_tsc_control)
5026                 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
5027
5028         if (vmx->nested.l1_tpr_threshold != -1)
5029                 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
5030
5031         if (vmx->nested.change_vmcs01_virtual_apic_mode) {
5032                 vmx->nested.change_vmcs01_virtual_apic_mode = false;
5033                 vmx_set_virtual_apic_mode(vcpu);
5034         }
5035
5036         if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
5037                 vmx->nested.update_vmcs01_cpu_dirty_logging = false;
5038                 vmx_update_cpu_dirty_logging(vcpu);
5039         }
5040
5041         nested_put_vmcs12_pages(vcpu);
5042
5043         if (vmx->nested.reload_vmcs01_apic_access_page) {
5044                 vmx->nested.reload_vmcs01_apic_access_page = false;
5045                 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5046         }
5047
5048         if (vmx->nested.update_vmcs01_apicv_status) {
5049                 vmx->nested.update_vmcs01_apicv_status = false;
5050                 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
5051         }
5052
5053         if ((vm_exit_reason != -1) &&
5054             (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
5055                 vmx->nested.need_vmcs12_to_shadow_sync = true;
5056
5057         /* in case we halted in L2 */
5058         vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5059
5060         if (likely(!vmx->fail)) {
5061                 if (vm_exit_reason != -1)
5062                         trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
5063                                                        vmcs12->exit_qualification,
5064                                                        vmcs12->idt_vectoring_info_field,
5065                                                        vmcs12->vm_exit_intr_info,
5066                                                        vmcs12->vm_exit_intr_error_code,
5067                                                        KVM_ISA_VMX);
5068
5069                 load_vmcs12_host_state(vcpu, vmcs12);
5070
5071                 return;
5072         }
5073
5074         /*
5075          * After an early L2 VM-entry failure, we're now back
5076          * in L1 which thinks it just finished a VMLAUNCH or
5077          * VMRESUME instruction, so we need to set the failure
5078          * flag and the VM-instruction error field of the VMCS
5079          * accordingly, and skip the emulated instruction.
5080          */
5081         (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5082
5083         /*
5084          * Restore L1's host state to KVM's software model.  We're here
5085          * because a consistency check was caught by hardware, which
5086          * means some amount of guest state has been propagated to KVM's
5087          * model and needs to be unwound to the host's state.
5088          */
5089         nested_vmx_restore_host_state(vcpu);
5090
5091         vmx->fail = 0;
5092 }
5093
5094 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
5095 {
5096         kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5097         nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
5098 }
5099
5100 /*
5101  * Decode the memory-address operand of a vmx instruction, as recorded on an
5102  * exit caused by such an instruction (run by a guest hypervisor).
5103  * On success, returns 0. When the operand is invalid, returns 1 and throws
5104  * #UD, #GP, or #SS.
5105  */
5106 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
5107                         u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
5108 {
5109         gva_t off;
5110         bool exn;
5111         struct kvm_segment s;
5112
5113         /*
5114          * According to Vol. 3B, "Information for VM Exits Due to Instruction
5115          * Execution", on an exit, vmx_instruction_info holds most of the
5116          * addressing components of the operand. Only the displacement part
5117          * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
5118          * For how an actual address is calculated from all these components,
5119          * refer to Vol. 1, "Operand Addressing".
5120          */
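             /*
              * Bit layout of vmx_instruction_info as decoded below:
              *   [1:0]   scaling
              *   [9:7]   address size (0 = 16-bit, 1 = 32-bit, 2 = 64-bit)
              *   [10]    set if the operand is a register (invalid here)
              *   [17:15] segment register
              *   [21:18] index register, [22] set if index is invalid
              *   [26:23] base register,  [27] set if base is invalid
              */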
5121         int  scaling = vmx_instruction_info & 3;
5122         int  addr_size = (vmx_instruction_info >> 7) & 7;
5123         bool is_reg = vmx_instruction_info & (1u << 10);
5124         int  seg_reg = (vmx_instruction_info >> 15) & 7;
5125         int  index_reg = (vmx_instruction_info >> 18) & 0xf;
5126         bool index_is_valid = !(vmx_instruction_info & (1u << 22));
5127         int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
5128         bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
5129
5130         if (is_reg) {
5131                 kvm_queue_exception(vcpu, UD_VECTOR);
5132                 return 1;
5133         }
5134
5135         /* Addr = segment_base + offset */
5136         /* offset = base + [index * scale] + displacement */
5137         off = exit_qualification; /* holds the displacement */
5138         if (addr_size == 1)
5139                 off = (gva_t)sign_extend64(off, 31);
5140         else if (addr_size == 0)
5141                 off = (gva_t)sign_extend64(off, 15);
5142         if (base_is_valid)
5143                 off += kvm_register_read(vcpu, base_reg);
5144         if (index_is_valid)
5145                 off += kvm_register_read(vcpu, index_reg) << scaling;
5146         vmx_get_segment(vcpu, &s, seg_reg);
5147
5148         /*
5149          * The effective address, i.e. @off, of a memory operand is truncated
5150          * based on the address size of the instruction.  Note that this is
5151          * the *effective address*, i.e. the address prior to accounting for
5152          * the segment's base.
5153          */
5154         if (addr_size == 1) /* 32 bit */
5155                 off &= 0xffffffff;
5156         else if (addr_size == 0) /* 16 bit */
5157                 off &= 0xffff;
5158
5159         /* Checks for #GP/#SS exceptions. */
5160         exn = false;
5161         if (is_long_mode(vcpu)) {
5162                 /*
5163                  * The virtual/linear address is never truncated in 64-bit
5164                  * mode, e.g. a 32-bit address size can yield a 64-bit virtual
5165                  * address when using FS/GS with a non-zero base.
5166                  */
5167                 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
5168                         *ret = s.base + off;
5169                 else
5170                         *ret = off;
5171
5172                 *ret = vmx_get_untagged_addr(vcpu, *ret, 0);
5173                 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
5174                  * non-canonical form. This is the only check on the memory
5175                  * destination for long mode!
5176                  */
5177                 exn = is_noncanonical_address(*ret, vcpu, 0);
5178         } else {
5179                 /*
5180                  * When not in long mode, the virtual/linear address is
5181                  * unconditionally truncated to 32 bits regardless of the
5182                  * address size.
5183                  */
5184                 *ret = (s.base + off) & 0xffffffff;
5185
5186                 /* Protected mode: apply checks for segment validity in the
5187                  * following order:
5188                  * - segment type check (#GP(0) may be thrown)
5189                  * - usability check (#GP(0)/#SS(0))
5190                  * - limit check (#GP(0)/#SS(0))
5191                  */
5192                 if (wr)
5193                         /* #GP(0) if the destination operand is located in a
5194                          * read-only data segment or any code segment.
5195                          */
5196                         exn = ((s.type & 0xa) == 0 || (s.type & 8));
5197                 else
5198                         /* #GP(0) if the source operand is located in an
5199                          * execute-only code segment
5200                          */
5201                         exn = ((s.type & 0xa) == 8);
5202                 if (exn) {
5203                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5204                         return 1;
5205                 }
5206                 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
5207                  */
5208                 exn = (s.unusable != 0);
5209
5210                 /*
5211                  * Protected mode: #GP(0)/#SS(0) if the memory operand is
5212                  * outside the segment limit.  All CPUs that support VMX ignore
5213                  * limit checks for flat segments, i.e. segments with base==0,
5214                  * limit==0xffffffff and of type expand-up data or code.
5215                  */
5216                 if (!(s.base == 0 && s.limit == 0xffffffff &&
5217                      ((s.type & 8) || !(s.type & 4))))
5218                         exn = exn || ((u64)off + len - 1 > s.limit);
5219         }
5220         if (exn) {
5221                 kvm_queue_exception_e(vcpu,
5222                                       seg_reg == VCPU_SREG_SS ?
5223                                                 SS_VECTOR : GP_VECTOR,
5224                                       0);
5225                 return 1;
5226         }
5227
5228         return 0;
5229 }
5230
5231 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
5232                                 int *ret)
5233 {
5234         gva_t gva;
5235         struct x86_exception e;
5236         int r;
5237
5238         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5239                                 vmcs_read32(VMX_INSTRUCTION_INFO), false,
5240                                 sizeof(*vmpointer), &gva)) {
5241                 *ret = 1;
5242                 return -EINVAL;
5243         }
5244
5245         r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
5246         if (r != X86EMUL_CONTINUE) {
5247                 *ret = kvm_handle_memory_failure(vcpu, r, &e);
5248                 return -EINVAL;
5249         }
5250
5251         return 0;
5252 }
5253
5254 /*
5255  * Allocate a shadow VMCS and associate it with the currently loaded
5256  * VMCS, unless such a shadow VMCS already exists. The newly allocated
5257  * VMCS is also VMCLEARed, so that it is ready for use.
5258  */
5259 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
5260 {
5261         struct vcpu_vmx *vmx = to_vmx(vcpu);
5262         struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
5263
5264         /*
5265          * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
5266          * when L1 executes VMXOFF or the vCPU is forced out of nested
5267          * operation.  VMXON faults if the CPU is already post-VMXON, so it
5268          * should be impossible to already have an allocated shadow VMCS.  KVM
5269          * doesn't support virtualization of VMCS shadowing, so vmcs01 should
5270          * always be the loaded VMCS.
5271          */
5272         if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
5273                 return loaded_vmcs->shadow_vmcs;
5274
5275         loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
5276         if (loaded_vmcs->shadow_vmcs)
5277                 vmcs_clear(loaded_vmcs->shadow_vmcs);
5278
5279         return loaded_vmcs->shadow_vmcs;
5280 }
5281
5282 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
5283 {
5284         struct vcpu_vmx *vmx = to_vmx(vcpu);
5285         int r;
5286
5287         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
5288         if (r < 0)
5289                 goto out_vmcs02;
5290
5291         vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5292         if (!vmx->nested.cached_vmcs12)
5293                 goto out_cached_vmcs12;
5294
5295         vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
5296         vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
5297         if (!vmx->nested.cached_shadow_vmcs12)
5298                 goto out_cached_shadow_vmcs12;
5299
5300         if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
5301                 goto out_shadow_vmcs;
5302
5303         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
5304                      HRTIMER_MODE_ABS_PINNED);
5305         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
5306
5307         vmx->nested.vpid02 = allocate_vpid();
5308
5309         vmx->nested.vmcs02_initialized = false;
5310         vmx->nested.vmxon = true;
5311
5312         if (vmx_pt_mode_is_host_guest()) {
5313                 vmx->pt_desc.guest.ctl = 0;
5314                 pt_update_intercept_for_msr(vcpu);
5315         }
5316
5317         return 0;
5318
5319 out_shadow_vmcs:
5320         kfree(vmx->nested.cached_shadow_vmcs12);
5321
5322 out_cached_shadow_vmcs12:
5323         kfree(vmx->nested.cached_vmcs12);
5324
5325 out_cached_vmcs12:
5326         free_loaded_vmcs(&vmx->nested.vmcs02);
5327
5328 out_vmcs02:
5329         return -ENOMEM;
5330 }
5331
5332 /* Emulate the VMXON instruction. */
5333 static int handle_vmxon(struct kvm_vcpu *vcpu)
5334 {
5335         int ret;
5336         gpa_t vmptr;
5337         uint32_t revision;
5338         struct vcpu_vmx *vmx = to_vmx(vcpu);
5339         const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
5340                 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
5341
5342         /*
5343          * Manually check CR4.VMXE; KVM must force CR4.VMXE=1 to enter
5344          * the guest and so cannot rely on hardware to perform the check,
5345          * which has higher priority than VM-Exit (see Intel SDM's pseudocode
5346          * for VMXON).
5347          *
5348          * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
5349          * and !COMPATIBILITY modes.  For an unrestricted guest, KVM doesn't
5350          * force any of the relevant guest state.  For a restricted guest, KVM
5351          * does force CR0.PE=1, but only to also force VM86 in order to emulate
5352          * Real Mode, and so there's no need to check CR0.PE manually.
5353          */
5354         if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
5355                 kvm_queue_exception(vcpu, UD_VECTOR);
5356                 return 1;
5357         }
5358
5359         /*
5360          * The CPL is checked for "not in VMX operation" and for "in VMX root",
5361          * and has higher priority than the VM-Fail due to being post-VMXON,
5362          * i.e. VMXON #GPs outside of VMX non-root if CPL!=0.  In VMX non-root,
5363          * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
5364          * from L2 to L1, i.e. there's no need to check for the vCPU being in
5365          * VMX non-root.
5366          *
5367          * Forwarding the VM-Exit unconditionally, i.e. without performing the
5368          * #UD checks (see above), is functionally ok because KVM doesn't allow
5369          * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
5370          * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
5371          * missed by hardware due to shadowing CR0 and/or CR4.
5372          */
5373         if (vmx_get_cpl(vcpu)) {
5374                 kvm_inject_gp(vcpu, 0);
5375                 return 1;
5376         }
5377
5378         if (vmx->nested.vmxon)
5379                 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5380
5381         /*
5382          * Invalid CR0/CR4 generates #GP.  These checks are performed if and
5383          * only if the vCPU isn't already in VMX operation, i.e. effectively
5384          * have lower priority than the VM-Fail above.
5385          */
5386         if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
5387             !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
5388                 kvm_inject_gp(vcpu, 0);
5389                 return 1;
5390         }
5391
5392         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5393                         != VMXON_NEEDED_FEATURES) {
5394                 kvm_inject_gp(vcpu, 0);
5395                 return 1;
5396         }
5397
5398         if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
5399                 return ret;
5400
5401         /*
5402          * SDM 3: 24.11.5
5403          * The first 4 bytes of VMXON region contain the supported
5404          * VMCS revision identifier
5405          *
5406          * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
5407          * which replaces physical address width with 32
5408          */
5409         if (!page_address_valid(vcpu, vmptr))
5410                 return nested_vmx_failInvalid(vcpu);
5411
5412         if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
5413             revision != VMCS12_REVISION)
5414                 return nested_vmx_failInvalid(vcpu);
5415
5416         vmx->nested.vmxon_ptr = vmptr;
5417         ret = enter_vmx_operation(vcpu);
5418         if (ret)
5419                 return ret;
5420
5421         return nested_vmx_succeed(vcpu);
5422 }
5423
5424 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
5425 {
5426         struct vcpu_vmx *vmx = to_vmx(vcpu);
5427
5428         if (vmx->nested.current_vmptr == INVALID_GPA)
5429                 return;
5430
5431         copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
5432
5433         if (enable_shadow_vmcs) {
5434                 /* copy to memory all shadowed fields in case
5435                    they were modified */
5436                 copy_shadow_to_vmcs12(vmx);
5437                 vmx_disable_shadow_vmcs(vmx);
5438         }
5439         vmx->nested.posted_intr_nv = -1;
5440
5441         /* Flush VMCS12 to guest memory */
5442         kvm_vcpu_write_guest_page(vcpu,
5443                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
5444                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
5445
5446         kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5447
5448         vmx->nested.current_vmptr = INVALID_GPA;
5449 }
5450
5451 /* Emulate the VMXOFF instruction */
5452 static int handle_vmxoff(struct kvm_vcpu *vcpu)
5453 {
5454         if (!nested_vmx_check_permission(vcpu))
5455                 return 1;
5456
5457         free_nested(vcpu);
5458
5459         if (kvm_apic_has_pending_init_or_sipi(vcpu))
5460                 kvm_make_request(KVM_REQ_EVENT, vcpu);
5461
5462         return nested_vmx_succeed(vcpu);
5463 }
5464
5465 /* Emulate the VMCLEAR instruction */
5466 static int handle_vmclear(struct kvm_vcpu *vcpu)
5467 {
5468         struct vcpu_vmx *vmx = to_vmx(vcpu);
5469         u32 zero = 0;
5470         gpa_t vmptr;
5471         int r;
5472
5473         if (!nested_vmx_check_permission(vcpu))
5474                 return 1;
5475
5476         if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5477                 return r;
5478
5479         if (!page_address_valid(vcpu, vmptr))
5480                 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5481
5482         if (vmptr == vmx->nested.vmxon_ptr)
5483                 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5484
5485         if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
5486                 if (vmptr == vmx->nested.current_vmptr)
5487                         nested_release_vmcs12(vcpu);
5488
5489                 /*
5490                  * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
5491                  * for VMCLEAR includes a "ensure that data for VMCS referenced
5492                  * by the operand is in memory" clause that guards writes to
5493                  * memory, i.e. doing nothing for I/O is architecturally valid.
5494                  *
5495                  * FIXME: Suppress failures if and only if no memslot is found,
5496                  * i.e. exit to userspace if __copy_to_user() fails.
5497                  */
5498                 (void)kvm_vcpu_write_guest(vcpu,
5499                                            vmptr + offsetof(struct vmcs12,
5500                                                             launch_state),
5501                                            &zero, sizeof(zero));
5502         }
5503
5504         return nested_vmx_succeed(vcpu);
5505 }
5506
5507 /* Emulate the VMLAUNCH instruction */
5508 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5509 {
5510         return nested_vmx_run(vcpu, true);
5511 }
5512
5513 /* Emulate the VMRESUME instruction */
5514 static int handle_vmresume(struct kvm_vcpu *vcpu)
5515 {
5516
5517         return nested_vmx_run(vcpu, false);
5518 }
5519
5520 static int handle_vmread(struct kvm_vcpu *vcpu)
5521 {
5522         struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5523                                                     : get_vmcs12(vcpu);
5524         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5525         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5526         struct vcpu_vmx *vmx = to_vmx(vcpu);
5527         struct x86_exception e;
5528         unsigned long field;
5529         u64 value;
5530         gva_t gva = 0;
5531         short offset;
5532         int len, r;
5533
5534         if (!nested_vmx_check_permission(vcpu))
5535                 return 1;
5536
5537         /* Decode instruction info and find the field to read */
5538         field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5539
5540         if (!nested_vmx_is_evmptr12_valid(vmx)) {
5541                 /*
5542                  * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5543                  * any VMREAD sets the ALU flags for VMfailInvalid.
5544                  */
5545                 if (vmx->nested.current_vmptr == INVALID_GPA ||
5546                     (is_guest_mode(vcpu) &&
5547                      get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5548                         return nested_vmx_failInvalid(vcpu);
5549
5550                 offset = get_vmcs12_field_offset(field);
5551                 if (offset < 0)
5552                         return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5553
5554                 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5555                         copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5556
5557                 /* Read the field, zero-extended to a u64 value */
5558                 value = vmcs12_read_any(vmcs12, field, offset);
5559         } else {
5560                 /*
5561                  * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
5562                  * enlightened VMCS is active VMREAD/VMWRITE instructions are
5563                  * unsupported. Unfortunately, certain versions of Windows 11
5564                  * don't comply with this requirement which is not enforced in
5565                  * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
5566                  * workaround, as misbehaving guests will panic on VM-Fail.
5567                  * Note, enlightened VMCS is incompatible with shadow VMCS so
5568                  * all VMREADs from L2 should go to L1.
5569                  */
5570                 if (WARN_ON_ONCE(is_guest_mode(vcpu)))
5571                         return nested_vmx_failInvalid(vcpu);
5572
5573                 offset = evmcs_field_offset(field, NULL);
5574                 if (offset < 0)
5575                         return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5576
5577                 /* Read the field, zero-extended to a u64 value */
5578                 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
5579         }
5580
5581         /*
5582          * Now copy part of this value to register or memory, as requested.
5583          * Note that the number of bits actually copied is 32 or 64 depending
5584          * on the guest's mode (32 or 64 bit), not on the given field's length.
5585          */
5586         if (instr_info & BIT(10)) {
5587                 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5588         } else {
5589                 len = is_64_bit_mode(vcpu) ? 8 : 4;
5590                 if (get_vmx_mem_address(vcpu, exit_qualification,
5591                                         instr_info, true, len, &gva))
5592                         return 1;
5593                 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
5594                 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5595                 if (r != X86EMUL_CONTINUE)
5596                         return kvm_handle_memory_failure(vcpu, r, &e);
5597         }
5598
5599         return nested_vmx_succeed(vcpu);
5600 }
5601
5602 static bool is_shadow_field_rw(unsigned long field)
5603 {
5604         switch (field) {
5605 #define SHADOW_FIELD_RW(x, y) case x:
5606 #include "vmcs_shadow_fields.h"
5607                 return true;
5608         default:
5609                 break;
5610         }
5611         return false;
5612 }
5613
5614 static bool is_shadow_field_ro(unsigned long field)
5615 {
5616         switch (field) {
5617 #define SHADOW_FIELD_RO(x, y) case x:
5618 #include "vmcs_shadow_fields.h"
5619                 return true;
5620         default:
5621                 break;
5622         }
5623         return false;
5624 }
5625
5626 static int handle_vmwrite(struct kvm_vcpu *vcpu)
5627 {
5628         struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5629                                                     : get_vmcs12(vcpu);
5630         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5631         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5632         struct vcpu_vmx *vmx = to_vmx(vcpu);
5633         struct x86_exception e;
5634         unsigned long field;
5635         short offset;
5636         gva_t gva;
5637         int len, r;
5638
5639         /*
5640          * The value to write might be 32 or 64 bits, depending on L1's long
5641          * mode, and eventually we need to write that into a field of several
5642          * possible lengths. The code below first zero-extends the value to 64
5643          * bit (value), and then copies only the appropriate number of
5644          * bits into the vmcs12 field.
5645          */
5646         u64 value = 0;
5647
5648         if (!nested_vmx_check_permission(vcpu))
5649                 return 1;
5650
5651         /*
5652          * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5653          * any VMWRITE sets the ALU flags for VMfailInvalid.
5654          */
5655         if (vmx->nested.current_vmptr == INVALID_GPA ||
5656             (is_guest_mode(vcpu) &&
5657              get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
5658                 return nested_vmx_failInvalid(vcpu);
5659
5660         if (instr_info & BIT(10))
5661                 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5662         else {
5663                 len = is_64_bit_mode(vcpu) ? 8 : 4;
5664                 if (get_vmx_mem_address(vcpu, exit_qualification,
5665                                         instr_info, false, len, &gva))
5666                         return 1;
5667                 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5668                 if (r != X86EMUL_CONTINUE)
5669                         return kvm_handle_memory_failure(vcpu, r, &e);
5670         }
5671
5672         field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5673
5674         offset = get_vmcs12_field_offset(field);
5675         if (offset < 0)
5676                 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5677
5678         /*
5679          * If the vCPU supports "VMWRITE to any supported field in the
5680          * VMCS," then the "read-only" fields are actually read/write.
5681          */
5682         if (vmcs_field_readonly(field) &&
5683             !nested_cpu_has_vmwrite_any_field(vcpu))
5684                 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5685
5686         /*
5687          * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5688          * vmcs12, else we may clobber a field or consume a stale value.
5689          */
5690         if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5691                 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5692
5693         /*
5694          * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5695          * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
5696          * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5697          * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5698          * from L1 will return a different value than VMREAD from L2 (L1 sees
5699          * the stripped down value, L2 sees the full value as stored by KVM).
5700          */
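             /*
              * 0x1f0ff keeps type/S/DPL/P (bits 7:0) and AVL/L/D/G/unusable
              * (bits 16:12), i.e. clears the reserved bits 11:8 of the
              * access-rights field.
              */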
5701         if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5702                 value &= 0x1f0ff;
5703
5704         vmcs12_write_any(vmcs12, field, offset, value);
5705
5706         /*
5707          * Do not track vmcs12 dirty-state if in guest-mode as we actually
5708          * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
5709          * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5710          * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5711          */
5712         if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5713                 /*
5714                  * L1 can read these fields without exiting, ensure the
5715                  * shadow VMCS is up-to-date.
5716                  */
5717                 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5718                         preempt_disable();
5719                         vmcs_load(vmx->vmcs01.shadow_vmcs);
5720
5721                         __vmcs_writel(field, value);
5722
5723                         vmcs_clear(vmx->vmcs01.shadow_vmcs);
5724                         vmcs_load(vmx->loaded_vmcs->vmcs);
5725                         preempt_enable();
5726                 }
5727                 vmx->nested.dirty_vmcs12 = true;
5728         }
5729
5730         return nested_vmx_succeed(vcpu);
5731 }
5732
5733 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5734 {
5735         vmx->nested.current_vmptr = vmptr;
5736         if (enable_shadow_vmcs) {
5737                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5738                 vmcs_write64(VMCS_LINK_POINTER,
5739                              __pa(vmx->vmcs01.shadow_vmcs));
5740                 vmx->nested.need_vmcs12_to_shadow_sync = true;
5741         }
5742         vmx->nested.dirty_vmcs12 = true;
5743         vmx->nested.force_msr_bitmap_recalc = true;
5744 }
5745
5746 /* Emulate the VMPTRLD instruction */
5747 static int handle_vmptrld(struct kvm_vcpu *vcpu)
5748 {
5749         struct vcpu_vmx *vmx = to_vmx(vcpu);
5750         gpa_t vmptr;
5751         int r;
5752
5753         if (!nested_vmx_check_permission(vcpu))
5754                 return 1;
5755
5756         if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5757                 return r;
5758
5759         if (!page_address_valid(vcpu, vmptr))
5760                 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5761
5762         if (vmptr == vmx->nested.vmxon_ptr)
5763                 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5764
5765         /* Forbid normal VMPTRLD if Enlightened version was used */
5766         if (nested_vmx_is_evmptr12_valid(vmx))
5767                 return 1;
5768
5769         if (vmx->nested.current_vmptr != vmptr) {
5770                 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
5771                 struct vmcs_hdr hdr;
5772
5773                 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
5774                         /*
5775                          * Reads from an unbacked page return all 1s,
5776                          * which means that the 32 bits located at the
5777                          * given physical address won't match the required
5778                          * VMCS12_REVISION identifier.
5779                          */
5780                         return nested_vmx_fail(vcpu,
5781                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5782                 }
5783
5784                 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
5785                                                  offsetof(struct vmcs12, hdr),
5786                                                  sizeof(hdr))) {
5787                         return nested_vmx_fail(vcpu,
5788                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5789                 }
5790
5791                 if (hdr.revision_id != VMCS12_REVISION ||
5792                     (hdr.shadow_vmcs &&
5793                      !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5794                         return nested_vmx_fail(vcpu,
5795                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5796                 }
5797
5798                 nested_release_vmcs12(vcpu);
5799
5800                 /*
5801                  * Load VMCS12 from guest memory since it is not already
5802                  * cached.
5803                  */
5804                 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
5805                                           VMCS12_SIZE)) {
5806                         return nested_vmx_fail(vcpu,
5807                                 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5808                 }
5809
5810                 set_current_vmptr(vmx, vmptr);
5811         }
5812
5813         return nested_vmx_succeed(vcpu);
5814 }
5815
5816 /* Emulate the VMPTRST instruction */
5817 static int handle_vmptrst(struct kvm_vcpu *vcpu)
5818 {
5819         unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5820         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5821         gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5822         struct x86_exception e;
5823         gva_t gva;
5824         int r;
5825
5826         if (!nested_vmx_check_permission(vcpu))
5827                 return 1;
5828
5829         if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
5830                 return 1;
5831
5832         if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5833                                 true, sizeof(gpa_t), &gva))
5834                 return 1;
5835         /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5836         r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5837                                         sizeof(gpa_t), &e);
5838         if (r != X86EMUL_CONTINUE)
5839                 return kvm_handle_memory_failure(vcpu, r, &e);
5840
5841         return nested_vmx_succeed(vcpu);
5842 }
5843
5844 /* Emulate the INVEPT instruction */
5845 static int handle_invept(struct kvm_vcpu *vcpu)
5846 {
5847         struct vcpu_vmx *vmx = to_vmx(vcpu);
5848         u32 vmx_instruction_info, types;
5849         unsigned long type, roots_to_free;
5850         struct kvm_mmu *mmu;
5851         gva_t gva;
5852         struct x86_exception e;
5853         struct {
5854                 u64 eptp, gpa;
5855         } operand;
5856         int i, r, gpr_index;
5857
5858         if (!(vmx->nested.msrs.secondary_ctls_high &
5859               SECONDARY_EXEC_ENABLE_EPT) ||
5860             !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5861                 kvm_queue_exception(vcpu, UD_VECTOR);
5862                 return 1;
5863         }
5864
5865         if (!nested_vmx_check_permission(vcpu))
5866                 return 1;
5867
5868         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5869         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5870         type = kvm_register_read(vcpu, gpr_index);
5871
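             /*
              * types is a bitmask of the INVEPT types supported for L1:
              * bit 1 = single-context (VMX_EPT_EXTENT_CONTEXT) and
              * bit 2 = all-context (VMX_EPT_EXTENT_GLOBAL), matching the
              * type values checked against it below.
              */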
5872         types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5873
5874         if (type >= 32 || !(types & (1 << type)))
5875                 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5876
5877         /* According to the Intel VMX instruction reference, the memory
5878          * operand is read even if it isn't needed (e.g., for type==global)
5879          */
5880         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5881                         vmx_instruction_info, false, sizeof(operand), &gva))
5882                 return 1;
5883         r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5884         if (r != X86EMUL_CONTINUE)
5885                 return kvm_handle_memory_failure(vcpu, r, &e);
5886
5887         /*
5888          * Nested EPT roots are always held through guest_mmu,
5889          * not root_mmu.
5890          */
5891         mmu = &vcpu->arch.guest_mmu;
5892
5893         switch (type) {
5894         case VMX_EPT_EXTENT_CONTEXT:
5895                 if (!nested_vmx_check_eptp(vcpu, operand.eptp))
5896                         return nested_vmx_fail(vcpu,
5897                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5898
5899                 roots_to_free = 0;
5900                 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
5901                                             operand.eptp))
5902                         roots_to_free |= KVM_MMU_ROOT_CURRENT;
5903
5904                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5905                         if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
5906                                                     mmu->prev_roots[i].pgd,
5907                                                     operand.eptp))
5908                                 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5909                 }
5910                 break;
5911         case VMX_EPT_EXTENT_GLOBAL:
5912                 roots_to_free = KVM_MMU_ROOTS_ALL;
5913                 break;
5914         default:
5915                 BUG();
5916                 break;
5917         }
5918
5919         if (roots_to_free)
5920                 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
5921
5922         return nested_vmx_succeed(vcpu);
5923 }
5924
5925 static int handle_invvpid(struct kvm_vcpu *vcpu)
5926 {
5927         struct vcpu_vmx *vmx = to_vmx(vcpu);
5928         u32 vmx_instruction_info;
5929         unsigned long type, types;
5930         gva_t gva;
5931         struct x86_exception e;
5932         struct {
5933                 u64 vpid;
5934                 u64 gla;
5935         } operand;
5936         u16 vpid02;
5937         int r, gpr_index;
5938
5939         if (!(vmx->nested.msrs.secondary_ctls_high &
5940               SECONDARY_EXEC_ENABLE_VPID) ||
5941                         !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5942                 kvm_queue_exception(vcpu, UD_VECTOR);
5943                 return 1;
5944         }
5945
5946         if (!nested_vmx_check_permission(vcpu))
5947                 return 1;
5948
5949         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5950         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5951         type = kvm_register_read(vcpu, gpr_index);
5952
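             /*
              * types is a bitmask of the INVVPID types supported for L1:
              * bit 0 = individual-address, bit 1 = single-context,
              * bit 2 = all-context, bit 3 = single-context-retaining-globals,
              * matching the type values checked against it below.
              */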
5953         types = (vmx->nested.msrs.vpid_caps &
5954                         VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5955
5956         if (type >= 32 || !(types & (1 << type)))
5957                 return nested_vmx_fail(vcpu,
5958                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5959
5960         /* According to the Intel VMX instruction reference, the memory
5961          * operand is read even if it isn't needed (e.g., for type==global)
5962          */
5963         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5964                         vmx_instruction_info, false, sizeof(operand), &gva))
5965                 return 1;
5966         r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5967         if (r != X86EMUL_CONTINUE)
5968                 return kvm_handle_memory_failure(vcpu, r, &e);
5969
5970         if (operand.vpid >> 16)
5971                 return nested_vmx_fail(vcpu,
5972                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5973
5974         /*
5975          * Always flush the effective vpid02, i.e. never flush the current VPID
5976          * and never explicitly flush vpid01.  INVVPID targets a VPID, not a
5977          * VMCS, and so whether or not the current vmcs12 has VPID enabled is
5978          * irrelevant (and there may not be a loaded vmcs12).
5979          */
5980         vpid02 = nested_get_vpid02(vcpu);
5981         switch (type) {
5982         case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5983                 /*
5984                  * LAM doesn't apply to addresses that are inputs to TLB
5985                  * invalidation.
5986                  */
5987                 if (!operand.vpid ||
5988                     is_noncanonical_invlpg_address(operand.gla, vcpu))
5989                         return nested_vmx_fail(vcpu,
5990                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5991                 vpid_sync_vcpu_addr(vpid02, operand.gla);
5992                 break;
5993         case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5994         case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5995                 if (!operand.vpid)
5996                         return nested_vmx_fail(vcpu,
5997                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5998                 vpid_sync_context(vpid02);
5999                 break;
6000         case VMX_VPID_EXTENT_ALL_CONTEXT:
6001                 vpid_sync_context(vpid02);
6002                 break;
6003         default:
6004                 WARN_ON_ONCE(1);
6005                 return kvm_skip_emulated_instruction(vcpu);
6006         }
6007
6008         /*
6009          * Sync the shadow page tables if EPT is disabled, L1 is invalidating
6010          * linear mappings for L2 (tagged with L2's VPID).  Free all guest
6011          * roots as VPIDs are not tracked in the MMU role.
6012          *
6013          * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
6014          * an MMU when EPT is disabled.
6015          *
6016          * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
6017          */
6018         if (!enable_ept)
6019                 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
6020
6021         return nested_vmx_succeed(vcpu);
6022 }
6023
6024 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
6025                                      struct vmcs12 *vmcs12)
6026 {
6027         u32 index = kvm_rcx_read(vcpu);
6028         u64 new_eptp;
6029
6030         if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
6031                 return 1;
6032         if (index >= VMFUNC_EPTP_ENTRIES)
6033                 return 1;
6034
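             /*
              * The EPTP list is a page of 512 8-byte entries; read the
              * entry at the requested index from L1's list page.
              */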
6035         if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
6036                                      &new_eptp, index * 8, 8))
6037                 return 1;
6038
6039         /*
6040          * If the (L2) guest does a vmfunc to the currently
6041          * active ept pointer, we don't have to do anything else
6042          */
6043         if (vmcs12->ept_pointer != new_eptp) {
6044                 if (!nested_vmx_check_eptp(vcpu, new_eptp))
6045                         return 1;
6046
6047                 vmcs12->ept_pointer = new_eptp;
6048                 nested_ept_new_eptp(vcpu);
6049
6050                 if (!nested_cpu_has_vpid(vmcs12))
6051                         kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
6052         }
6053
6054         return 0;
6055 }
6056
6057 static int handle_vmfunc(struct kvm_vcpu *vcpu)
6058 {
6059         struct vcpu_vmx *vmx = to_vmx(vcpu);
6060         struct vmcs12 *vmcs12;
6061         u32 function = kvm_rax_read(vcpu);
6062
6063         /*
6064          * VMFUNC should never execute cleanly while L1 is active; KVM supports
6065          * VMFUNC for nested VMs, but not for L1.
6066          */
6067         if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
6068                 kvm_queue_exception(vcpu, UD_VECTOR);
6069                 return 1;
6070         }
6071
6072         vmcs12 = get_vmcs12(vcpu);
6073
6074         /*
6075          * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
6076          * is enabled in vmcs02 if and only if it's enabled in vmcs12.
6077          */
6078         if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
6079                 kvm_queue_exception(vcpu, UD_VECTOR);
6080                 return 1;
6081         }
6082
6083         if (!(vmcs12->vm_function_control & BIT_ULL(function)))
6084                 goto fail;
6085
6086         switch (function) {
6087         case 0:
6088                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
6089                         goto fail;
6090                 break;
6091         default:
6092                 goto fail;
6093         }
6094         return kvm_skip_emulated_instruction(vcpu);
6095
6096 fail:
6097         /*
6098          * This is effectively a reflected VM-Exit, as opposed to a synthesized
6099          * nested VM-Exit.  Pass the original exit reason, i.e. don't hardcode
6100          * EXIT_REASON_VMFUNC as the exit reason.
6101          */
6102         nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
6103                           vmx_get_intr_info(vcpu),
6104                           vmx_get_exit_qual(vcpu));
6105         return 1;
6106 }
6107
6108 /*
6109  * Return true if an IO instruction with the specified port and size should cause
6110  * a VM-exit into L1.
6111  */
6112 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
6113                                  int size)
6114 {
6115         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6116         gpa_t bitmap, last_bitmap;
6117         u8 b;
6118
6119         last_bitmap = INVALID_GPA;
6120         b = -1;
6121
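             /*
              * Bitmap A covers ports 0x0000-0x7fff and bitmap B covers
              * 0x8000-0xffff, one bit per port, e.g. port 0x3f8 maps to
              * byte 0x7f, bit 0 of bitmap A.  A multi-byte access exits
              * if any of the covered ports' bits is set.
              */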
6122         while (size > 0) {
6123                 if (port < 0x8000)
6124                         bitmap = vmcs12->io_bitmap_a;
6125                 else if (port < 0x10000)
6126                         bitmap = vmcs12->io_bitmap_b;
6127                 else
6128                         return true;
6129                 bitmap += (port & 0x7fff) / 8;
6130
6131                 if (last_bitmap != bitmap)
6132                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
6133                                 return true;
6134                 if (b & (1 << (port & 7)))
6135                         return true;
6136
6137                 port++;
6138                 size--;
6139                 last_bitmap = bitmap;
6140         }
6141
6142         return false;
6143 }
6144
6145 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6146                                        struct vmcs12 *vmcs12)
6147 {
6148         unsigned long exit_qualification;
6149         unsigned short port;
6150         int size;
6151
6152         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6153                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
6154
6155         exit_qualification = vmx_get_exit_qual(vcpu);
6156
6157         port = exit_qualification >> 16;
6158         size = (exit_qualification & 7) + 1;
6159
6160         return nested_vmx_check_io_bitmaps(vcpu, port, size);
6161 }
6162
6163 /*
6164  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
6165  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
6166  * disinterest in the current event (read or write a specific MSR) by using an
6167  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
6168  */
6169 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
6170                                         struct vmcs12 *vmcs12,
6171                                         union vmx_exit_reason exit_reason)
6172 {
6173         u32 msr_index = kvm_rcx_read(vcpu);
6174         gpa_t bitmap;
6175
6176         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
6177                 return true;
6178
6179         /*
6180          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
6181          * for the four combinations of read/write and low/high MSR numbers.
6182          * First we need to figure out which of the four to use:
6183          */
6184         bitmap = vmcs12->msr_bitmap;
6185         if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6186                 bitmap += 2048;
6187         if (msr_index >= 0xc0000000) {
6188                 msr_index -= 0xc0000000;
6189                 bitmap += 1024;
6190         }
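             /*
              * E.g. a WRMSR to MSR_EFER (0xc0000080) is tracked by the
              * write-high bitmap: byte 2048 + 1024 + (0x80 / 8) = 3088,
              * bit 0 of the MSR-bitmap page.
              */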
6191
6192         /* Then read the msr_index'th bit from this bitmap: */
6193         if (msr_index < 1024*8) {
6194                 unsigned char b;
6195                 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
6196                         return true;
6197                 return 1 & (b >> (msr_index & 7));
6198         } else
6199                 return true; /* let L1 handle the wrong parameter */
6200 }
6201
6202 /*
6203  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
6204  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
6205  * intercept (via guest_host_mask etc.) the current event.
6206  */
6207 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6208         struct vmcs12 *vmcs12)
6209 {
6210         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
6211         int cr = exit_qualification & 15;
6212         int reg;
6213         unsigned long val;
6214
6215         switch ((exit_qualification >> 4) & 3) {
6216         case 0: /* mov to cr */
6217                 reg = (exit_qualification >> 8) & 15;
6218                 val = kvm_register_read(vcpu, reg);
6219                 switch (cr) {
6220                 case 0:
6221                         if (vmcs12->cr0_guest_host_mask &
6222                             (val ^ vmcs12->cr0_read_shadow))
6223                                 return true;
6224                         break;
6225                 case 3:
6226                         if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
6227                                 return true;
6228                         break;
6229                 case 4:
6230                         if (vmcs12->cr4_guest_host_mask &
6231                             (vmcs12->cr4_read_shadow ^ val))
6232                                 return true;
6233                         break;
6234                 case 8:
6235                         if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
6236                                 return true;
6237                         break;
6238                 }
6239                 break;
6240         case 2: /* clts */
6241                 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
6242                     (vmcs12->cr0_read_shadow & X86_CR0_TS))
6243                         return true;
6244                 break;
6245         case 1: /* mov from cr */
6246                 switch (cr) {
6247                 case 3:
6248                         if (vmcs12->cpu_based_vm_exec_control &
6249                             CPU_BASED_CR3_STORE_EXITING)
6250                                 return true;
6251                         break;
6252                 case 8:
6253                         if (vmcs12->cpu_based_vm_exec_control &
6254                             CPU_BASED_CR8_STORE_EXITING)
6255                                 return true;
6256                         break;
6257                 }
6258                 break;
6259         case 3: /* lmsw */
6260                 /*
6261                  * lmsw can change bits 1..3 of cr0, and only set bit 0 of
6262                  * cr0. Other attempted changes are ignored, with no exit.
6263                  */
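                     /* 0xe covers CR0.MP/EM/TS (bits 3:1); 0x1 is CR0.PE (bit 0). */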
6264                 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
6265                 if (vmcs12->cr0_guest_host_mask & 0xe &
6266                     (val ^ vmcs12->cr0_read_shadow))
6267                         return true;
6268                 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
6269                     !(vmcs12->cr0_read_shadow & 0x1) &&
6270                     (val & 0x1))
6271                         return true;
6272                 break;
6273         }
6274         return false;
6275 }
6276
6277 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
6278                                           struct vmcs12 *vmcs12)
6279 {
6280         u32 encls_leaf;
6281
6282         if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
6283             !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
6284                 return false;
6285
6286         encls_leaf = kvm_rax_read(vcpu);
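             /*
              * Bit 63 of the ENCLS-exiting bitmap controls all leaves above
              * 62, so collapse out-of-range leaves onto bit 63.
              */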
6287         if (encls_leaf > 62)
6288                 encls_leaf = 63;
6289         return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
6290 }
6291
6292 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
6293         struct vmcs12 *vmcs12, gpa_t bitmap)
6294 {
6295         u32 vmx_instruction_info;
6296         unsigned long field;
6297         u8 b;
6298
6299         if (!nested_cpu_has_shadow_vmcs(vmcs12))
6300                 return true;
6301
6302         /* Decode instruction info and find the field to access */
6303         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6304         field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6305
6306         /* Out-of-range fields always cause a VM exit from L2 to L1 */
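             /* Each bitmap is a 4-KByte page with one bit per field encoding 0-0x7fff. */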
6307         if (field >> 15)
6308                 return true;
6309
6310         if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
6311                 return true;
6312
6313         return 1 & (b >> (field & 7));
6314 }
6315
6316 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
6317 {
6318         u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
6319
6320         if (nested_cpu_has_mtf(vmcs12))
6321                 return true;
6322
6323         /*
6324          * An MTF VM-exit may be injected into the guest by setting the
6325          * interruption-type to 7 (other event) and the vector field to 0. Such
6326          * is the case regardless of the 'monitor trap flag' VM-execution
6327          * control.
6328          */
6329         return entry_intr_info == (INTR_INFO_VALID_MASK
6330                                    | INTR_TYPE_OTHER_EVENT);
6331 }
6332
6333 /*
6334  * Return true if L0 wants to handle an exit from L2 regardless of whether or not
6335  * L1 wants the exit.  Only call this when in is_guest_mode (L2).
6336  */
6337 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
6338                                      union vmx_exit_reason exit_reason)
6339 {
6340         u32 intr_info;
6341
6342         switch ((u16)exit_reason.basic) {
6343         case EXIT_REASON_EXCEPTION_NMI:
6344                 intr_info = vmx_get_intr_info(vcpu);
6345                 if (is_nmi(intr_info))
6346                         return true;
6347                 else if (is_page_fault(intr_info))
6348                         return vcpu->arch.apf.host_apf_flags ||
6349                                vmx_need_pf_intercept(vcpu);
6350                 else if (is_debug(intr_info) &&
6351                          vcpu->guest_debug &
6352                          (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
6353                         return true;
6354                 else if (is_breakpoint(intr_info) &&
6355                          vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
6356                         return true;
6357                 else if (is_alignment_check(intr_info) &&
6358                          !vmx_guest_inject_ac(vcpu))
6359                         return true;
6360                 else if (is_ve_fault(intr_info))
6361                         return true;
6362                 return false;
6363         case EXIT_REASON_EXTERNAL_INTERRUPT:
6364                 return true;
6365         case EXIT_REASON_MCE_DURING_VMENTRY:
6366                 return true;
6367         case EXIT_REASON_EPT_VIOLATION:
6368                 /*
6369                  * L0 always deals with the EPT violation. If nested EPT is
6370                  * used, and the nested mmu code discovers that the address is
6371                  * missing in the guest EPT table (EPT12), the EPT violation
6372                  * will be injected with nested_ept_inject_page_fault()
6373                  */
6374                 return true;
6375         case EXIT_REASON_EPT_MISCONFIG:
6376                 /*
6377                  * L2 never directly uses L1's EPT, but rather L0's own EPT
6378                  * table (shadow on EPT) or a merged EPT table that L0 built
6379                  * (EPT on EPT). So any problems with the structure of the
6380                  * table are L0's fault.
6381                  */
6382                 return true;
6383         case EXIT_REASON_PREEMPTION_TIMER:
6384                 return true;
6385         case EXIT_REASON_PML_FULL:
6386                 /*
6387                  * PML is emulated for an L1 VMM and should never be enabled in
6388                  * vmcs02; always "handle" PML_FULL by exiting to userspace.
6389                  */
6390                 return true;
6391         case EXIT_REASON_VMFUNC:
6392                 /* VM functions are emulated through L2->L0 vmexits. */
6393                 return true;
6394         case EXIT_REASON_BUS_LOCK:
6395                 /*
6396                  * At present, bus lock VM exit is never exposed to L1.
6397                  * Handle L2's bus locks in L0 directly.
6398                  */
6399                 return true;
6400 #ifdef CONFIG_KVM_HYPERV
6401         case EXIT_REASON_VMCALL:
6402                 /* Hyper-V L2 TLB flush hypercall is handled by L0 */
6403                 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
6404                         nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
6405                         kvm_hv_is_tlb_flush_hcall(vcpu);
6406 #endif
6407         default:
6408                 break;
6409         }
6410         return false;
6411 }
6412
6413 /*
6414  * Return true if L1 wants to intercept an exit from L2.  Only call this when in
6415  * is_guest_mode (L2).
6416  */
6417 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
6418                                      union vmx_exit_reason exit_reason)
6419 {
6420         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6421         u32 intr_info;
6422
6423         switch ((u16)exit_reason.basic) {
6424         case EXIT_REASON_EXCEPTION_NMI:
6425                 intr_info = vmx_get_intr_info(vcpu);
6426                 if (is_nmi(intr_info))
6427                         return true;
6428                 else if (is_page_fault(intr_info))
6429                         return true;
6430                 return vmcs12->exception_bitmap &
6431                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
6432         case EXIT_REASON_EXTERNAL_INTERRUPT:
6433                 return nested_exit_on_intr(vcpu);
6434         case EXIT_REASON_TRIPLE_FAULT:
6435                 return true;
6436         case EXIT_REASON_INTERRUPT_WINDOW:
6437                 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
6438         case EXIT_REASON_NMI_WINDOW:
6439                 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
6440         case EXIT_REASON_TASK_SWITCH:
6441                 return true;
6442         case EXIT_REASON_CPUID:
6443                 return true;
6444         case EXIT_REASON_HLT:
6445                 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
6446         case EXIT_REASON_INVD:
6447                 return true;
6448         case EXIT_REASON_INVLPG:
6449                 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6450         case EXIT_REASON_RDPMC:
6451                 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
6452         case EXIT_REASON_RDRAND:
6453                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
6454         case EXIT_REASON_RDSEED:
6455                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
6456         case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
6457                 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
6458         case EXIT_REASON_VMREAD:
6459                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
6460                         vmcs12->vmread_bitmap);
6461         case EXIT_REASON_VMWRITE:
6462                 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
6463                         vmcs12->vmwrite_bitmap);
6464         case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
6465         case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
6466         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
6467         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6468         case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
6469                 /*
6470                  * VMX instructions trap unconditionally. This allows L1 to
6471                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
6472                  */
6473                 return true;
6474         case EXIT_REASON_CR_ACCESS:
6475                 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
6476         case EXIT_REASON_DR_ACCESS:
6477                 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
6478         case EXIT_REASON_IO_INSTRUCTION:
6479                 return nested_vmx_exit_handled_io(vcpu, vmcs12);
6480         case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
6481                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
6482         case EXIT_REASON_MSR_READ:
6483         case EXIT_REASON_MSR_WRITE:
6484                 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
6485         case EXIT_REASON_INVALID_STATE:
6486                 return true;
6487         case EXIT_REASON_MWAIT_INSTRUCTION:
6488                 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
6489         case EXIT_REASON_MONITOR_TRAP_FLAG:
6490                 return nested_vmx_exit_handled_mtf(vmcs12);
6491         case EXIT_REASON_MONITOR_INSTRUCTION:
6492                 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
6493         case EXIT_REASON_PAUSE_INSTRUCTION:
6494                 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
6495                         nested_cpu_has2(vmcs12,
6496                                 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
6497         case EXIT_REASON_MCE_DURING_VMENTRY:
6498                 return true;
6499         case EXIT_REASON_TPR_BELOW_THRESHOLD:
6500                 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
6501         case EXIT_REASON_APIC_ACCESS:
6502         case EXIT_REASON_APIC_WRITE:
6503         case EXIT_REASON_EOI_INDUCED:
6504                 /*
6505                  * The controls for "virtualize APIC accesses," "APIC-
6506                  * register virtualization," and "virtual-interrupt
6507                  * delivery" only come from vmcs12.
6508                  */
6509                 return true;
6510         case EXIT_REASON_INVPCID:
6511                 return
6512                         nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
6513                         nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6514         case EXIT_REASON_WBINVD:
6515                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6516         case EXIT_REASON_XSETBV:
6517                 return true;
6518         case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
6519                 /*
6520                  * This should never happen, since it is not possible to
6521                  * set XSS to a non-zero value---neither in L1 nor in L2.
6522                  * If it were, XSS would have to be checked against
6523                  * the XSS exit bitmap in vmcs12.
6524                  */
6525                 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
6526         case EXIT_REASON_UMWAIT:
6527         case EXIT_REASON_TPAUSE:
6528                 return nested_cpu_has2(vmcs12,
6529                         SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
6530         case EXIT_REASON_ENCLS:
6531                 return nested_vmx_exit_handled_encls(vcpu, vmcs12);
6532         case EXIT_REASON_NOTIFY:
6533                 /* Notify VM exit is not exposed to L1 */
6534                 return false;
6535         default:
6536                 return true;
6537         }
6538 }
6539
6540 /*
6541  * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
6542  * reflected into L1.
6543  */
6544 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
6545 {
6546         struct vcpu_vmx *vmx = to_vmx(vcpu);
6547         union vmx_exit_reason exit_reason = vmx->exit_reason;
6548         unsigned long exit_qual;
6549         u32 exit_intr_info;
6550
6551         WARN_ON_ONCE(vmx->nested.nested_run_pending);
6552
6553         /*
6554          * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6555          * has already loaded L2's state.
6556          */
6557         if (unlikely(vmx->fail)) {
6558                 trace_kvm_nested_vmenter_failed(
6559                         "hardware VM-instruction error: ",
6560                         vmcs_read32(VM_INSTRUCTION_ERROR));
6561                 exit_intr_info = 0;
6562                 exit_qual = 0;
6563                 goto reflect_vmexit;
6564         }
6565
6566         trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
6567
6568         /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6569         if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6570                 return false;
6571
6572         /* If L1 doesn't want the exit, handle it in L0. */
6573         if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
6574                 return false;
6575
6576         /*
6577          * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
6578          * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6579          * need to be synthesized by querying the in-kernel LAPIC, but external
6580          * interrupts are never reflected to L1 so it's a non-issue.
6581          */
6582         exit_intr_info = vmx_get_intr_info(vcpu);
6583         if (is_exception_with_error_code(exit_intr_info)) {
6584                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6585
6586                 vmcs12->vm_exit_intr_error_code =
6587                         vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6588         }
6589         exit_qual = vmx_get_exit_qual(vcpu);
6590
6591 reflect_vmexit:
6592         nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6593         return true;
6594 }
6595
6596 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6597                                 struct kvm_nested_state __user *user_kvm_nested_state,
6598                                 u32 user_data_size)
6599 {
6600         struct vcpu_vmx *vmx;
6601         struct vmcs12 *vmcs12;
6602         struct kvm_nested_state kvm_state = {
6603                 .flags = 0,
6604                 .format = KVM_STATE_NESTED_FORMAT_VMX,
6605                 .size = sizeof(kvm_state),
6606                 .hdr.vmx.flags = 0,
6607                 .hdr.vmx.vmxon_pa = INVALID_GPA,
6608                 .hdr.vmx.vmcs12_pa = INVALID_GPA,
6609                 .hdr.vmx.preemption_timer_deadline = 0,
6610         };
6611         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6612                 &user_kvm_nested_state->data.vmx[0];
6613
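        /*
         * A NULL vcpu is used to query the maximum amount of state that can
         * be returned (presumably the KVM_CAP_NESTED_STATE capability
         * check), so report the worst-case size in that case.
         */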
6614         if (!vcpu)
6615                 return kvm_state.size + sizeof(*user_vmx_nested_state);
6616
6617         vmx = to_vmx(vcpu);
6618         vmcs12 = get_vmcs12(vcpu);
6619
6620         if (guest_can_use(vcpu, X86_FEATURE_VMX) &&
6621             (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6622                 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6623                 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
6624
6625                 if (vmx_has_valid_vmcs12(vcpu)) {
6626                         kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
6627
6628                         /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
6629                         if (nested_vmx_is_evmptr12_set(vmx))
6630                                 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6631
6632                         if (is_guest_mode(vcpu) &&
6633                             nested_cpu_has_shadow_vmcs(vmcs12) &&
6634                             vmcs12->vmcs_link_pointer != INVALID_GPA)
6635                                 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
6636                 }
6637
6638                 if (vmx->nested.smm.vmxon)
6639                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
6640
6641                 if (vmx->nested.smm.guest_mode)
6642                         kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
6643
6644                 if (is_guest_mode(vcpu)) {
6645                         kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6646
6647                         if (vmx->nested.nested_run_pending)
6648                                 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
6649
6650                         if (vmx->nested.mtf_pending)
6651                                 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
6652
6653                         if (nested_cpu_has_preemption_timer(vmcs12) &&
6654                             vmx->nested.has_preemption_timer_deadline) {
6655                                 kvm_state.hdr.vmx.flags |=
6656                                         KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6657                                 kvm_state.hdr.vmx.preemption_timer_deadline =
6658                                         vmx->nested.preemption_timer_deadline;
6659                         }
6660                 }
6661         }
6662
6663         if (user_data_size < kvm_state.size)
6664                 goto out;
6665
6666         if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6667                 return -EFAULT;
6668
6669         if (!vmx_has_valid_vmcs12(vcpu))
6670                 goto out;
6671
6672         /*
6673          * When running L2, the authoritative vmcs12 state is in the
6674          * vmcs02. When running L1, the authoritative vmcs12 state is
6675          * in the shadow or enlightened vmcs linked to vmcs01, unless
6676          * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
6677          * vmcs12 state is in the vmcs12 already.
6678          */
6679         if (is_guest_mode(vcpu)) {
6680                 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
6681                 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
6682         } else {
6683                 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6684                 if (!vmx->nested.need_vmcs12_to_shadow_sync) {
6685                         if (nested_vmx_is_evmptr12_valid(vmx))
6686                                 /*
6687                                  * The L1 hypervisor is not obliged to keep the
6688                                  * eVMCS clean fields data up-to-date while not
6689                                  * in guest mode; 'hv_clean_fields' is only
6690                                  * guaranteed to be accurate at VM-entry, so
6691                                  * ignore it here and do a full copy.
6692                                  */
6693                                 copy_enlightened_to_vmcs12(vmx, 0);
6694                         else if (enable_shadow_vmcs)
6695                                 copy_shadow_to_vmcs12(vmx);
6696                 }
6697         }
6698
6699         BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6700         BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6701
6702         /*
6703          * Copy over the full allocated size of vmcs12 rather than just the size
6704          * of the struct.
6705          */
6706         if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6707                 return -EFAULT;
6708
6709         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6710             vmcs12->vmcs_link_pointer != INVALID_GPA) {
6711                 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
6712                                  get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6713                         return -EFAULT;
6714         }
6715 out:
6716         return kvm_state.size;
6717 }
6718
6719 void vmx_leave_nested(struct kvm_vcpu *vcpu)
6720 {
6721         if (is_guest_mode(vcpu)) {
6722                 to_vmx(vcpu)->nested.nested_run_pending = 0;
6723                 nested_vmx_vmexit(vcpu, -1, 0, 0);
6724         }
6725         free_nested(vcpu);
6726 }
6727
6728 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6729                                 struct kvm_nested_state __user *user_kvm_nested_state,
6730                                 struct kvm_nested_state *kvm_state)
6731 {
6732         struct vcpu_vmx *vmx = to_vmx(vcpu);
6733         struct vmcs12 *vmcs12;
6734         enum vm_entry_failure_code ignored;
6735         struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6736                 &user_kvm_nested_state->data.vmx[0];
6737         int ret;
6738
6739         if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
6740                 return -EINVAL;
6741
6742         if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
6743                 if (kvm_state->hdr.vmx.smm.flags)
6744                         return -EINVAL;
6745
6746                 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
6747                         return -EINVAL;
6748
6749                 /*
6750                  * KVM_STATE_NESTED_EVMCS used to signal that KVM should
6751                  * enable eVMCS capability on vCPU. However, since then
6752                  * the code was changed such that the flag signals that
6753                  * vmcs12 should be copied into the eVMCS in guest memory.
6754                  *
6755                  * To preserve backwards compatibility, allow user
6756                  * to set this flag even when there is no VMXON region.
6757                  */
6758                 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6759                         return -EINVAL;
6760         } else {
6761                 if (!guest_can_use(vcpu, X86_FEATURE_VMX))
6762                         return -EINVAL;
6763
6764                 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6765                         return -EINVAL;
6766         }
6767
6768         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6769             (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6770                 return -EINVAL;
6771
6772         if (kvm_state->hdr.vmx.smm.flags &
6773             ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6774                 return -EINVAL;
6775
6776         if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6777                 return -EINVAL;
6778
6779         /*
6780          * SMM temporarily disables VMX, so we cannot be in guest mode,
6781          * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
6782          * must be zero.
6783          */
6784         if (is_smm(vcpu) ?
6785                 (kvm_state->flags &
6786                  (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6787                 : kvm_state->hdr.vmx.smm.flags)
6788                 return -EINVAL;
6789
6790         if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6791             !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
6792                 return -EINVAL;
6793
6794         if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6795             (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
6796              !vmx->nested.enlightened_vmcs_enabled))
6797                 return -EINVAL;
6798
6799         vmx_leave_nested(vcpu);
6800
6801         if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
6802                 return 0;
6803
6804         vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
6805         ret = enter_vmx_operation(vcpu);
6806         if (ret)
6807                 return ret;
6808
6809         /* Empty 'VMXON' state is permitted if no VMCS loaded */
6810         if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6811                 /* See vmx_has_valid_vmcs12.  */
6812                 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6813                     (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
6814                     (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
6815                         return -EINVAL;
6816                 else
6817                         return 0;
6818         }
6819
6820         if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
6821                 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6822                     !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6823                         return -EINVAL;
6824
6825                 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6826 #ifdef CONFIG_KVM_HYPERV
6827         } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6828                 /*
6829                  * nested_vmx_handle_enlightened_vmptrld() cannot be called
6830                  * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
6831                  * restored yet. EVMCS will be mapped from
6832                  * nested_get_vmcs12_pages().
6833                  */
6834                 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
6835                 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
6836 #endif
6837         } else {
6838                 return -EINVAL;
6839         }
6840
6841         if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
6842                 vmx->nested.smm.vmxon = true;
6843                 vmx->nested.vmxon = false;
6844
6845                 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
6846                         vmx->nested.smm.guest_mode = true;
6847         }
6848
6849         vmcs12 = get_vmcs12(vcpu);
6850         if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6851                 return -EFAULT;
6852
6853         if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6854                 return -EINVAL;
6855
6856         if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6857                 return 0;
6858
6859         vmx->nested.nested_run_pending =
6860                 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6861
6862         vmx->nested.mtf_pending =
6863                 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6864
6865         ret = -EINVAL;
6866         if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6867             vmcs12->vmcs_link_pointer != INVALID_GPA) {
6868                 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6869
6870                 if (kvm_state->size <
6871                     sizeof(*kvm_state) +
6872                     sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
6873                         goto error_guest_mode;
6874
6875                 if (copy_from_user(shadow_vmcs12,
6876                                    user_vmx_nested_state->shadow_vmcs12,
6877                                    sizeof(*shadow_vmcs12))) {
6878                         ret = -EFAULT;
6879                         goto error_guest_mode;
6880                 }
6881
6882                 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6883                     !shadow_vmcs12->hdr.shadow_vmcs)
6884                         goto error_guest_mode;
6885         }
6886
6887         vmx->nested.has_preemption_timer_deadline = false;
6888         if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6889                 vmx->nested.has_preemption_timer_deadline = true;
6890                 vmx->nested.preemption_timer_deadline =
6891                         kvm_state->hdr.vmx.preemption_timer_deadline;
6892         }
6893
6894         if (nested_vmx_check_controls(vcpu, vmcs12) ||
6895             nested_vmx_check_host_state(vcpu, vmcs12) ||
6896             nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
6897                 goto error_guest_mode;
6898
6899         vmx->nested.dirty_vmcs12 = true;
6900         vmx->nested.force_msr_bitmap_recalc = true;
6901         ret = nested_vmx_enter_non_root_mode(vcpu, false);
6902         if (ret)
6903                 goto error_guest_mode;
6904
6905         if (vmx->nested.mtf_pending)
6906                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6907
6908         return 0;
6909
6910 error_guest_mode:
6911         vmx->nested.nested_run_pending = 0;
6912         return ret;
6913 }
6914
6915 void nested_vmx_set_vmcs_shadowing_bitmap(void)
6916 {
6917         if (enable_shadow_vmcs) {
6918                 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6919                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6920         }
6921 }
6922
6923 /*
6924  * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
6925  * that madness to get the encoding for comparison.
6926  */
6927 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
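/*
 * E.g. (illustrative): encoding 0x4402, rotated left by 6 in 16 bits, is
 * stored at index 0x0091, and VMCS12_IDX_TO_ENC(0x0091) == 0x4402.
 */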
6928
6929 static u64 nested_vmx_calc_vmcs_enum_msr(void)
6930 {
6931         /*
6932          * Note these are the so-called "index" of the VMCS field encoding, not
6933          * the index into vmcs12.
6934          */
6935         unsigned int max_idx, idx;
6936         int i;
6937
6938         /*
6939          * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
6940          * vmcs12, regardless of whether or not the associated feature is
6941          * exposed to L1.  Simply find the field with the highest index.
6942          */
6943         max_idx = 0;
6944         for (i = 0; i < nr_vmcs12_fields; i++) {
6945                 /* The vmcs12 table is very, very sparsely populated. */
6946                 if (!vmcs12_field_offsets[i])
6947                         continue;
6948
6949                 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
6950                 if (idx > max_idx)
6951                         max_idx = idx;
6952         }
6953
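        /*
         * Architecturally, the VMCS enumeration MSR reports the highest
         * field index in bits 9:1, e.g. a highest populated index of 0x17
         * yields an MSR value of 0x17 << VMCS_FIELD_INDEX_SHIFT == 0x2e.
         */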
6954         return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
6955 }
6956
6957 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
6958                                            struct nested_vmx_msrs *msrs)
6959 {
6960         msrs->pinbased_ctls_low =
6961                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6962
6963         msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
6964         msrs->pinbased_ctls_high &=
6965                 PIN_BASED_EXT_INTR_MASK |
6966                 PIN_BASED_NMI_EXITING |
6967                 PIN_BASED_VIRTUAL_NMIS |
6968                 (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6969         msrs->pinbased_ctls_high |=
6970                 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6971                 PIN_BASED_VMX_PREEMPTION_TIMER;
6972 }
6973
6974 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
6975                                        struct nested_vmx_msrs *msrs)
6976 {
6977         msrs->exit_ctls_low =
6978                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6979
6980         msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
6981         msrs->exit_ctls_high &=
6982 #ifdef CONFIG_X86_64
6983                 VM_EXIT_HOST_ADDR_SPACE_SIZE |
6984 #endif
6985                 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6986                 VM_EXIT_CLEAR_BNDCFGS;
6987         msrs->exit_ctls_high |=
6988                 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6989                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6990                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
6991                 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
6992
6993         /* We support free control of debug control saving. */
6994         msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6995 }
6996
6997 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
6998                                         struct nested_vmx_msrs *msrs)
6999 {
7000         msrs->entry_ctls_low =
7001                 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
7002
7003         msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
7004         msrs->entry_ctls_high &=
7005 #ifdef CONFIG_X86_64
7006                 VM_ENTRY_IA32E_MODE |
7007 #endif
7008                 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
7009         msrs->entry_ctls_high |=
7010                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
7011                  VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
7012
7013         /* We support free control of debug control loading. */
7014         msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
7015 }
7016
7017 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
7018                                            struct nested_vmx_msrs *msrs)
7019 {
7020         msrs->procbased_ctls_low =
7021                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
7022
7023         msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
7024         msrs->procbased_ctls_high &=
7025                 CPU_BASED_INTR_WINDOW_EXITING |
7026                 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
7027                 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
7028                 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
7029                 CPU_BASED_CR3_STORE_EXITING |
7030 #ifdef CONFIG_X86_64
7031                 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
7032 #endif
7033                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
7034                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
7035                 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
7036                 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
7037                 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
7038         /*
7039          * We can allow some features even when not supported by the
7040          * hardware. For example, L1 can specify an MSR bitmap - and we
7041          * can use it to avoid exits to L1 - even when L0 runs L2
7042          * without MSR bitmaps.
7043          */
7044         msrs->procbased_ctls_high |=
7045                 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
7046                 CPU_BASED_USE_MSR_BITMAPS;
7047
7048         /* We support free control of CR3 access interception. */
7049         msrs->procbased_ctls_low &=
7050                 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
7051 }
7052
7053 static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
7054                                             struct vmcs_config *vmcs_conf,
7055                                             struct nested_vmx_msrs *msrs)
7056 {
7057         msrs->secondary_ctls_low = 0;
7058
7059         msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
7060         msrs->secondary_ctls_high &=
7061                 SECONDARY_EXEC_DESC |
7062                 SECONDARY_EXEC_ENABLE_RDTSCP |
7063                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7064                 SECONDARY_EXEC_WBINVD_EXITING |
7065                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
7066                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
7067                 SECONDARY_EXEC_RDRAND_EXITING |
7068                 SECONDARY_EXEC_ENABLE_INVPCID |
7069                 SECONDARY_EXEC_ENABLE_VMFUNC |
7070                 SECONDARY_EXEC_RDSEED_EXITING |
7071                 SECONDARY_EXEC_ENABLE_XSAVES |
7072                 SECONDARY_EXEC_TSC_SCALING |
7073                 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
7074
7075         /*
7076          * We can emulate "VMCS shadowing," even if the hardware
7077          * doesn't support it.
7078          */
7079         msrs->secondary_ctls_high |=
7080                 SECONDARY_EXEC_SHADOW_VMCS;
7081
7082         if (enable_ept) {
7083                 /* nested EPT: emulate EPT for L1 as well */
7084                 msrs->secondary_ctls_high |=
7085                         SECONDARY_EXEC_ENABLE_EPT;
7086                 msrs->ept_caps =
7087                         VMX_EPT_PAGE_WALK_4_BIT |
7088                         VMX_EPT_PAGE_WALK_5_BIT |
7089                         VMX_EPTP_WB_BIT |
7090                         VMX_EPT_INVEPT_BIT |
7091                         VMX_EPT_EXECUTE_ONLY_BIT;
7092
7093                 msrs->ept_caps &= ept_caps;
7094                 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
7095                         VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
7096                         VMX_EPT_1GB_PAGE_BIT;
7097                 if (enable_ept_ad_bits) {
7098                         msrs->secondary_ctls_high |=
7099                                 SECONDARY_EXEC_ENABLE_PML;
7100                         msrs->ept_caps |= VMX_EPT_AD_BIT;
7101                 }
7102
7103                 /*
7104                  * Advertise EPTP switching irrespective of hardware support;
7105                  * KVM emulates it in software so long as VMFUNC is supported.
7106                  */
7107                 if (cpu_has_vmx_vmfunc())
7108                         msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
7109         }
7110
7111         /*
7112          * Old versions of KVM use the single-context version without
7113          * checking for support, so declare that it is supported even
7114          * though it is treated as global context.  The alternative is
7115          * not failing the single-context invvpid, and it is worse.
7116          */
7117         if (enable_vpid) {
7118                 msrs->secondary_ctls_high |=
7119                         SECONDARY_EXEC_ENABLE_VPID;
7120                 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
7121                         VMX_VPID_EXTENT_SUPPORTED_MASK;
7122         }
7123
7124         if (enable_unrestricted_guest)
7125                 msrs->secondary_ctls_high |=
7126                         SECONDARY_EXEC_UNRESTRICTED_GUEST;
7127
7128         if (flexpriority_enabled)
7129                 msrs->secondary_ctls_high |=
7130                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7131
7132         if (enable_sgx)
7133                 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
7134 }
7135
7136 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
7137                                        struct nested_vmx_msrs *msrs)
7138 {
7139         msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
7140         msrs->misc_low |=
7141                 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
7142                 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
7143                 VMX_MISC_ACTIVITY_HLT |
7144                 VMX_MISC_ACTIVITY_WAIT_SIPI;
7145         msrs->misc_high = 0;
7146 }
7147
7148 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
7149 {
7150         /*
7151          * This MSR reports some information about VMX support. We
7152          * should return information about the VMX we emulate for the
7153          * guest, and the VMCS structure we give it - not about the
7154          * VMX support of the underlying hardware.
7155          */
7156         msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
7157                                                  X86_MEMTYPE_WB);
7158
7159         msrs->basic |= VMX_BASIC_TRUE_CTLS;
7160         if (cpu_has_vmx_basic_inout())
7161                 msrs->basic |= VMX_BASIC_INOUT;
7162 }
7163
7164 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
7165 {
7166         /*
7167          * These MSRs specify bits which the guest must keep fixed on
7168          * while L1 is in VMXON mode (in L1's root mode, or running an L2).
7169          * We picked the standard core2 setting.
7170          */
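        /*
         * Roughly, a CR0/CR4 value is legal while in VMX operation iff
         * (val & fixed0) == fixed0 && (val & ~fixed1) == 0: bits set in
         * fixed0 must be set, and bits clear in fixed1 must stay clear.
         */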
7171 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
7172 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
7173         msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
7174         msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
7175
7176         /* These MSRs specify bits which the guest must keep fixed off. */
7177         rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
7178         rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
7179
7180         if (vmx_umip_emulated())
7181                 msrs->cr4_fixed1 |= X86_CR4_UMIP;
7182 }
7183
7184 /*
7185  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
7186  * returned for the various VMX controls MSRs when nested VMX is enabled.
7187  * The same values should also be used to verify that vmcs12 control fields are
7188  * valid during nested entry from L1 to L2.
7189  * Each of these control msrs has a low and high 32-bit half: A low bit is on
7190  * if the corresponding bit in the (32-bit) control field *must* be on, and a
7191  * bit in the high half is on if the corresponding bit in the control field
7192  * may be on. See also vmx_control_verify().
7193  */
7194 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
7195 {
7196         struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
7197
7198         /*
7199          * Note that as a general rule, the high half of the MSRs (bits in
7200          * the control fields which may be 1) should be initialized by the
7201          * intersection of the underlying hardware's MSR (i.e., features which
7202          * can be supported) and the list of features we want to expose -
7203          * because they are known to be properly supported in our code.
7204          * Also, usually, the low half of the MSRs (bits which must be 1) can
7205          * be set to 0, meaning that L1 may turn off any of these bits. The
7206          * reason is that if one of these bits is necessary, it will appear
7207          * in vmcs01, and prepare_vmcs02, when it bitwise-or's the control
7208          * fields of vmcs01 and vmcs12, will keep these bits set - and
7209          * nested_vmx_l1_wants_exit() will not pass related exits to L1.
7210          * These rules have exceptions below.
7211          */
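        /*
         * Concretely, a vmcs12 control value 'ctl' is accepted (see
         * vmx_control_verify()) iff every bit set in the low half is also
         * set in 'ctl' and 'ctl' sets no bit outside the union of the low
         * and high halves.
         */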
7212         nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);
7213
7214         nested_vmx_setup_exit_ctls(vmcs_conf, msrs);
7215
7216         nested_vmx_setup_entry_ctls(vmcs_conf, msrs);
7217
7218         nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);
7219
7220         nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);
7221
7222         nested_vmx_setup_misc_data(vmcs_conf, msrs);
7223
7224         nested_vmx_setup_basic(msrs);
7225
7226         nested_vmx_setup_cr_fixed(msrs);
7227
7228         msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
7229 }
7230
7231 void nested_vmx_hardware_unsetup(void)
7232 {
7233         int i;
7234
7235         if (enable_shadow_vmcs) {
7236                 for (i = 0; i < VMX_BITMAP_NR; i++)
7237                         free_page((unsigned long)vmx_bitmap[i]);
7238         }
7239 }
7240
7241 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
7242 {
7243         int i;
7244
7245         if (!cpu_has_vmx_shadow_vmcs())
7246                 enable_shadow_vmcs = 0;
7247         if (enable_shadow_vmcs) {
7248                 for (i = 0; i < VMX_BITMAP_NR; i++) {
7249                         /*
7250                          * The vmx_bitmap is not tied to a VM and so should
7251                          * not be charged to a memcg.
7252                          */
7253                         vmx_bitmap[i] = (unsigned long *)
7254                                 __get_free_page(GFP_KERNEL);
7255                         if (!vmx_bitmap[i]) {
7256                                 nested_vmx_hardware_unsetup();
7257                                 return -ENOMEM;
7258                         }
7259                 }
7260
7261                 init_vmcs_shadow_fields();
7262         }
7263
7264         exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear;
7265         exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch;
7266         exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld;
7267         exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst;
7268         exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread;
7269         exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume;
7270         exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite;
7271         exit_handlers[EXIT_REASON_VMOFF]        = handle_vmxoff;
7272         exit_handlers[EXIT_REASON_VMON]         = handle_vmxon;
7273         exit_handlers[EXIT_REASON_INVEPT]       = handle_invept;
7274         exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
7275         exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
7276
7277         return 0;
7278 }
7279
7280 struct kvm_x86_nested_ops vmx_nested_ops = {
7281         .leave_nested = vmx_leave_nested,
7282         .is_exception_vmexit = nested_vmx_is_exception_vmexit,
7283         .check_events = vmx_check_nested_events,
7284         .has_events = vmx_has_nested_events,
7285         .triple_fault = nested_vmx_triple_fault,
7286         .get_state = vmx_get_nested_state,
7287         .set_state = vmx_set_nested_state,
7288         .get_nested_state_pages = vmx_get_nested_state_pages,
7289         .write_log_dirty = nested_vmx_write_pml_buffer,
7290 #ifdef CONFIG_KVM_HYPERV
7291         .enable_evmcs = nested_enable_evmcs,
7292         .get_evmcs_version = nested_get_evmcs_version,
7293         .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
7294 #endif
7295 };