From: Paolo Bonzini Date: Wed, 13 May 2020 16:14:05 +0000 (-0400) Subject: Merge branch 'kvm-amd-fixes' into HEAD X-Git-Tag: v5.8-rc1~168^2~134 X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/4aef2ec9022b217f74d0f4c9b84081f07cc223d9?hp=-c Merge branch 'kvm-amd-fixes' into HEAD --- 4aef2ec9022b217f74d0f4c9b84081f07cc223d9 diff --combined arch/arm64/kvm/guest.c index 8417b200bec9,50a279d3ddd7..863a0d158fb8 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@@ -29,17 -29,20 +29,17 @@@ #include "trace.h" -#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM } -#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU } - struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT(halt_successful_poll), - VCPU_STAT(halt_attempted_poll), - VCPU_STAT(halt_poll_invalid), - VCPU_STAT(halt_wakeup), - VCPU_STAT(hvc_exit_stat), - VCPU_STAT(wfe_exit_stat), - VCPU_STAT(wfi_exit_stat), - VCPU_STAT(mmio_exit_user), - VCPU_STAT(mmio_exit_kernel), - VCPU_STAT(exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hvc_exit_stat", hvc_exit_stat), + VCPU_STAT("wfe_exit_stat", wfe_exit_stat), + VCPU_STAT("wfi_exit_stat", wfi_exit_stat), + VCPU_STAT("mmio_exit_user", mmio_exit_user), + VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), + VCPU_STAT("exits", exits), { NULL } }; @@@ -197,6 -200,13 +197,13 @@@ static int set_core_reg(struct kvm_vcp } memcpy((u32 *)regs + off, valp, KVM_REG_SIZE(reg->id)); + + if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { + int i; + + for (i = 0; i < 16; i++) + *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i); + } out: return err; } diff --combined arch/powerpc/kvm/powerpc.c index 7e24691e138a,ad2f172c26a6..052614e9d468 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@@ -521,6 -521,7 +521,7 @@@ int kvm_vm_ioctl_check_extension(struc case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_PPC_GUEST_DEBUG_SSTEP: @@@ -1764,9 -1765,8 +1765,9 @@@ int kvm_vcpu_ioctl_set_one_reg(struct k return r; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r; vcpu_load(vcpu); diff --combined arch/s390/kvm/kvm-s390.c index 75471b646fd7,d05bb040fd42..389ff1b7cd43 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@@ -57,107 -57,110 +57,107 @@@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM - struct kvm_stats_debugfs_item debugfs_entries[] = { - { "userspace_handled", VCPU_STAT(exit_userspace) }, - { "exit_null", VCPU_STAT(exit_null) }, - { "exit_validity", VCPU_STAT(exit_validity) }, - { "exit_stop_request", VCPU_STAT(exit_stop_request) }, - { "exit_external_request", VCPU_STAT(exit_external_request) }, - { "exit_io_request", VCPU_STAT(exit_io_request) }, - { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, - { "exit_instruction", VCPU_STAT(exit_instruction) }, - { "exit_pei", VCPU_STAT(exit_pei) }, - { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, - { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, - { 
"exit_operation_exception", VCPU_STAT(exit_operation_exception) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, - { "instruction_lctl", VCPU_STAT(instruction_lctl) }, - { "instruction_stctl", VCPU_STAT(instruction_stctl) }, - { "instruction_stctg", VCPU_STAT(instruction_stctg) }, - { "deliver_ckc", VCPU_STAT(deliver_ckc) }, - { "deliver_cputm", VCPU_STAT(deliver_cputm) }, - { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, - { "deliver_external_call", VCPU_STAT(deliver_external_call) }, - { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, - { "deliver_virtio", VCPU_STAT(deliver_virtio) }, - { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, - { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, - { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, - { "deliver_program", VCPU_STAT(deliver_program) }, - { "deliver_io", VCPU_STAT(deliver_io) }, - { "deliver_machine_check", VCPU_STAT(deliver_machine_check) }, - { "exit_wait_state", VCPU_STAT(exit_wait_state) }, - { "inject_ckc", VCPU_STAT(inject_ckc) }, - { "inject_cputm", VCPU_STAT(inject_cputm) }, - { "inject_external_call", VCPU_STAT(inject_external_call) }, - { "inject_float_mchk", VM_STAT(inject_float_mchk) }, - { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) }, - { "inject_io", VM_STAT(inject_io) }, - { "inject_mchk", VCPU_STAT(inject_mchk) }, - { "inject_pfault_done", VM_STAT(inject_pfault_done) }, - { "inject_program", VCPU_STAT(inject_program) }, - { "inject_restart", VCPU_STAT(inject_restart) }, - { "inject_service_signal", VM_STAT(inject_service_signal) }, - { "inject_set_prefix", VCPU_STAT(inject_set_prefix) }, - { "inject_stop_signal", VCPU_STAT(inject_stop_signal) }, - { "inject_pfault_init", VCPU_STAT(inject_pfault_init) }, - { "inject_virtio", VM_STAT(inject_virtio) }, - { "instruction_epsw", VCPU_STAT(instruction_epsw) }, - { "instruction_gs", VCPU_STAT(instruction_gs) }, - { "instruction_io_other", VCPU_STAT(instruction_io_other) }, - { "instruction_lpsw", VCPU_STAT(instruction_lpsw) }, - { "instruction_lpswe", VCPU_STAT(instruction_lpswe) }, - { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, - { "instruction_ptff", VCPU_STAT(instruction_ptff) }, - { "instruction_stidp", VCPU_STAT(instruction_stidp) }, - { "instruction_sck", VCPU_STAT(instruction_sck) }, - { "instruction_sckpf", VCPU_STAT(instruction_sckpf) }, - { "instruction_spx", VCPU_STAT(instruction_spx) }, - { "instruction_stpx", VCPU_STAT(instruction_stpx) }, - { "instruction_stap", VCPU_STAT(instruction_stap) }, - { "instruction_iske", VCPU_STAT(instruction_iske) }, - { "instruction_ri", VCPU_STAT(instruction_ri) }, - { "instruction_rrbe", VCPU_STAT(instruction_rrbe) }, - { "instruction_sske", VCPU_STAT(instruction_sske) }, - { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, - { "instruction_essa", VCPU_STAT(instruction_essa) }, - { "instruction_stsi", VCPU_STAT(instruction_stsi) }, - { "instruction_stfl", VCPU_STAT(instruction_stfl) }, - { "instruction_tb", VCPU_STAT(instruction_tb) }, - { "instruction_tpi", VCPU_STAT(instruction_tpi) }, - { "instruction_tprot", VCPU_STAT(instruction_tprot) }, - { "instruction_tsch", VCPU_STAT(instruction_tsch) }, - { 
"instruction_sthyi", VCPU_STAT(instruction_sthyi) }, - { "instruction_sie", VCPU_STAT(instruction_sie) }, - { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, - { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, - { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, - { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, - { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) }, - { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) }, - { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, - { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) }, - { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) }, - { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) }, - { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, - { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) }, - { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, - { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) }, - { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) }, - { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) }, - { "instruction_diag_10", VCPU_STAT(diagnose_10) }, - { "instruction_diag_44", VCPU_STAT(diagnose_44) }, - { "instruction_diag_9c", VCPU_STAT(diagnose_9c) }, - { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) }, - { "instruction_diag_258", VCPU_STAT(diagnose_258) }, - { "instruction_diag_308", VCPU_STAT(diagnose_308) }, - { "instruction_diag_500", VCPU_STAT(diagnose_500) }, - { "instruction_diag_other", VCPU_STAT(diagnose_other) }, + VCPU_STAT("userspace_handled", exit_userspace), + VCPU_STAT("exit_null", exit_null), + VCPU_STAT("exit_validity", exit_validity), + VCPU_STAT("exit_stop_request", exit_stop_request), + VCPU_STAT("exit_external_request", exit_external_request), + VCPU_STAT("exit_io_request", exit_io_request), + VCPU_STAT("exit_external_interrupt", exit_external_interrupt), + VCPU_STAT("exit_instruction", exit_instruction), + VCPU_STAT("exit_pei", exit_pei), + VCPU_STAT("exit_program_interruption", exit_program_interruption), + VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program), + VCPU_STAT("exit_operation_exception", exit_operation_exception), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("instruction_lctlg", instruction_lctlg), + VCPU_STAT("instruction_lctl", instruction_lctl), + VCPU_STAT("instruction_stctl", instruction_stctl), + VCPU_STAT("instruction_stctg", instruction_stctg), + VCPU_STAT("deliver_ckc", deliver_ckc), + VCPU_STAT("deliver_cputm", deliver_cputm), + VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal), + VCPU_STAT("deliver_external_call", deliver_external_call), + VCPU_STAT("deliver_service_signal", deliver_service_signal), + VCPU_STAT("deliver_virtio", deliver_virtio), + VCPU_STAT("deliver_stop_signal", deliver_stop_signal), + VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal), + VCPU_STAT("deliver_restart_signal", deliver_restart_signal), + VCPU_STAT("deliver_program", deliver_program), + VCPU_STAT("deliver_io", deliver_io), + VCPU_STAT("deliver_machine_check", 
deliver_machine_check), + VCPU_STAT("exit_wait_state", exit_wait_state), + VCPU_STAT("inject_ckc", inject_ckc), + VCPU_STAT("inject_cputm", inject_cputm), + VCPU_STAT("inject_external_call", inject_external_call), + VM_STAT("inject_float_mchk", inject_float_mchk), + VCPU_STAT("inject_emergency_signal", inject_emergency_signal), + VM_STAT("inject_io", inject_io), + VCPU_STAT("inject_mchk", inject_mchk), + VM_STAT("inject_pfault_done", inject_pfault_done), + VCPU_STAT("inject_program", inject_program), + VCPU_STAT("inject_restart", inject_restart), + VM_STAT("inject_service_signal", inject_service_signal), + VCPU_STAT("inject_set_prefix", inject_set_prefix), + VCPU_STAT("inject_stop_signal", inject_stop_signal), + VCPU_STAT("inject_pfault_init", inject_pfault_init), + VM_STAT("inject_virtio", inject_virtio), + VCPU_STAT("instruction_epsw", instruction_epsw), + VCPU_STAT("instruction_gs", instruction_gs), + VCPU_STAT("instruction_io_other", instruction_io_other), + VCPU_STAT("instruction_lpsw", instruction_lpsw), + VCPU_STAT("instruction_lpswe", instruction_lpswe), + VCPU_STAT("instruction_pfmf", instruction_pfmf), + VCPU_STAT("instruction_ptff", instruction_ptff), + VCPU_STAT("instruction_stidp", instruction_stidp), + VCPU_STAT("instruction_sck", instruction_sck), + VCPU_STAT("instruction_sckpf", instruction_sckpf), + VCPU_STAT("instruction_spx", instruction_spx), + VCPU_STAT("instruction_stpx", instruction_stpx), + VCPU_STAT("instruction_stap", instruction_stap), + VCPU_STAT("instruction_iske", instruction_iske), + VCPU_STAT("instruction_ri", instruction_ri), + VCPU_STAT("instruction_rrbe", instruction_rrbe), + VCPU_STAT("instruction_sske", instruction_sske), + VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock), + VCPU_STAT("instruction_essa", instruction_essa), + VCPU_STAT("instruction_stsi", instruction_stsi), + VCPU_STAT("instruction_stfl", instruction_stfl), + VCPU_STAT("instruction_tb", instruction_tb), + VCPU_STAT("instruction_tpi", instruction_tpi), + VCPU_STAT("instruction_tprot", instruction_tprot), + VCPU_STAT("instruction_tsch", instruction_tsch), + VCPU_STAT("instruction_sthyi", instruction_sthyi), + VCPU_STAT("instruction_sie", instruction_sie), + VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense), + VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running), + VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call), + VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency), + VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency), + VCPU_STAT("instruction_sigp_start", instruction_sigp_start), + VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop), + VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status), + VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status), + VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status), + VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch), + VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix), + VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart), + VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset), + VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset), + VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown), + VCPU_STAT("instruction_diag_10", diagnose_10), + VCPU_STAT("instruction_diag_44", diagnose_44), + VCPU_STAT("instruction_diag_9c", diagnose_9c), + 
VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored), + VCPU_STAT("instruction_diag_258", diagnose_258), + VCPU_STAT("instruction_diag_308", diagnose_308), + VCPU_STAT("instruction_diag_500", diagnose_500), + VCPU_STAT("instruction_diag_other", diagnose_other), { NULL } }; @@@ -542,6 -545,7 +542,7 @@@ int kvm_vm_ioctl_check_extension(struc case KVM_CAP_S390_AIS: case KVM_CAP_S390_AIS_MIGRATION: case KVM_CAP_S390_VCPU_RESETS: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_S390_HPAGE_1M: @@@ -4333,9 -4337,8 +4334,9 @@@ static void store_regs(struct kvm_vcpu store_regs_fmt2(vcpu, kvm_run); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int rc; if (kvm_run->immediate_exit) diff --combined arch/x86/include/asm/kvm_host.h index a239a297be33,0a6b35353fc7..b3a5da27c2a5 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@@ -83,9 -83,6 +83,9 @@@ #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@@ -170,8 -167,6 +170,8 @@@ enum kvm_reg VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, + VCPU_EXREG_EXIT_INFO_1, + VCPU_EXREG_EXIT_INFO_2, }; enum { @@@ -377,12 -372,12 +377,12 @@@ struct rsvd_bits_validate }; struct kvm_mmu_root_info { - gpa_t cr3; + gpa_t pgd; hpa_t hpa; }; #define KVM_MMU_ROOT_INFO_INVALID \ - ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE }) #define KVM_MMU_NUM_PREV_ROOTS 3 @@@ -408,7 -403,7 +408,7 @@@ struct kvm_mmu void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - gpa_t root_cr3; + gpa_t root_pgd; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; @@@ -583,6 -578,7 +583,7 @@@ struct kvm_vcpu_arch unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 host_pkru; u32 pkru; u32 hflags; u64 efer; @@@ -1098,16 -1094,13 +1099,14 @@@ struct kvm_x86_ops void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); - u64 (*get_dr6)(struct kvm_vcpu *vcpu); - void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + void (*tlb_flush_all)(struct kvm_vcpu *vcpu); + void (*tlb_flush_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@@ -1120,13 -1113,7 +1119,13 @@@ */ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); - void (*run)(struct kvm_vcpu *vcpu); + /* + * Flush any TLB entries created by the guest. Like tlb_flush_gva(), + * does not need to flush GPA->HPA mappings. 
+ */ + void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + + enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@@ -1154,7 -1141,7 +1153,7 @@@ bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); - void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); + void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@@ -1176,8 -1163,10 +1175,8 @@@ struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception); - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@@ -1210,7 -1199,6 +1209,7 @@@ /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + const struct kvm_x86_nested_ops *nested_ops; /* * Architecture specific hooks for vCPU blocking due to @@@ -1238,6 -1226,14 +1237,6 @@@ void (*setup_mce)(struct kvm_vcpu *vcpu); - int (*get_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - unsigned user_data_size); - int (*set_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); - int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); @@@ -1249,27 -1245,16 +1248,27 @@@ int (*get_msr_feature)(struct kvm_msr_entry *entry); - int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version); - uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); }; +struct kvm_x86_nested_ops { + int (*check_events)(struct kvm_vcpu *vcpu); + int (*get_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + + int (*enable_evmcs)(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); + uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); +}; + struct kvm_x86_init_ops { int (*cpu_has_kvm_support)(void); int (*disabled_by_bios)(void); @@@ -1463,11 -1448,10 +1462,12 @@@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); + void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
*fault); +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); @@@ -1525,11 -1509,8 +1525,11 @@@ int kvm_emulate_hypercall(struct kvm_vc int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, + bool skip_mmu_sync); void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); @@@ -1682,8 -1663,8 +1682,8 @@@ void kvm_set_msi_irq(struct kvm *kvm, s static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ - return (irq->delivery_mode == dest_Fixed || - irq->delivery_mode == dest_LowestPrio); + return (irq->delivery_mode == APIC_DM_FIXED || + irq->delivery_mode == APIC_DM_LOWEST); } static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --combined arch/x86/kvm/hyperv.c index 2f96ff9e60ee,54d4b98b49e1..f9d3b919823c --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@@ -1425,8 -1425,9 +1425,8 @@@ static u64 kvm_hv_flush_tlb(struct kvm_ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, - KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, - vcpu_mask, &hv_vcpu->tlb_flush); + NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: /* We always do full TLB flush, set rep_done = rep_cnt. 
*/ @@@ -1799,8 -1800,8 +1799,8 @@@ int kvm_vcpu_ioctl_get_hv_cpuid(struct }; int i, nent = ARRAY_SIZE(cpuid_entries); - if (kvm_x86_ops.nested_get_evmcs_version) - evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu); + if (kvm_x86_ops.nested_ops->get_evmcs_version) + evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); /* Skip NESTED_FEATURES if eVMCS is not supported */ if (!evmcs_ver) diff --combined arch/x86/kvm/svm/nested.c index a7c3b3030e59,9a2a62e5afeb..1429f506fe9e --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@@ -19,6 -19,7 +19,7 @@@ #include #include + #include #include "kvm_emulate.h" #include "trace.h" @@@ -207,10 -208,6 +208,10 @@@ static bool nested_vmcb_checks(struct v if ((vmcb->save.efer & EFER_SVME) == 0) return false; + if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && + (vmcb->save.cr0 & X86_CR0_NW)) + return false; + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; @@@ -271,7 -268,7 +272,7 @@@ void enter_svm_guest_mode(struct vcpu_s svm->vmcb->save.rsp = nested_vmcb->save.rsp; svm->vmcb->save.rip = nested_vmcb->save.rip; svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; @@@ -283,7 -280,7 +284,7 @@@ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - svm_flush_tlb(&svm->vcpu, true); + svm_flush_tlb(&svm->vcpu); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; @@@ -345,12 -342,8 +346,12 @@@ int nested_svm_vmrun(struct vcpu_svm *s struct kvm_host_map map; u64 vmcb_gpa; - vmcb_gpa = svm->vmcb->save.rax; + if (is_smm(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + vmcb_gpa = svm->vmcb->save.rax; ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); @@@ -490,7 -483,7 +491,7 @@@ int nested_svm_vmexit(struct vcpu_svm * nested_vmcb->save.rsp = vmcb->save.rsp; nested_vmcb->save.rax = vmcb->save.rax; nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; nested_vmcb->save.cpl = vmcb->save.cpl; nested_vmcb->control.int_ctl = vmcb->control.int_ctl; @@@ -614,26 -607,45 +615,45 @@@ static int nested_svm_exit_handled_msr( /* DB exceptions for our internal use must not cause vmexit */ static int nested_svm_intercept_db(struct vcpu_svm *svm) { - unsigned long dr6; + unsigned long dr6 = svm->vmcb->save.dr6; + + /* Always catch it and pass it to userspace if debugging. 
*/ + if (svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return NESTED_EXIT_HOST; /* if we're not singlestepping, it's not ours */ if (!svm->nmi_singlestep) - return NESTED_EXIT_DONE; + goto reflected_db; /* if it's not a singlestep exception, it's not ours */ - if (kvm_get_dr(&svm->vcpu, 6, &dr6)) - return NESTED_EXIT_DONE; if (!(dr6 & DR6_BS)) - return NESTED_EXIT_DONE; + goto reflected_db; /* if the guest is singlestepping, it should get the vmexit */ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { disable_nmi_singlestep(svm); - return NESTED_EXIT_DONE; + goto reflected_db; } /* it's ours, the nested hypervisor must not see this one */ return NESTED_EXIT_HOST; + + reflected_db: + /* + * Synchronize guest DR6 here just like in kvm_deliver_exception_payload; + * it will be moved into the nested VMCB by nested_svm_vmexit. Once + * exceptions will be moved to svm_check_nested_events, all this stuff + * will just go away and we could just return NESTED_EXIT_HOST + * unconditionally. db_interception will queue the exception, which + * will be processed by svm_check_nested_events if a nested vmexit is + * required, and we will just use kvm_deliver_exception_payload to copy + * the payload to DR6 before vmexit. + */ + WARN_ON(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT); + svm->vcpu.arch.dr6 &= ~(DR_TRAP_BITS | DR6_RTM); + svm->vcpu.arch.dr6 |= dr6 & ~DR6_FIXED_1; + return NESTED_EXIT_DONE; } static int nested_svm_intercept_ioio(struct vcpu_svm *svm) @@@ -690,6 -702,9 +710,9 @@@ static int nested_svm_intercept(struct if (svm->nested.intercept_exceptions & excp_bits) { if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) vmexit = nested_svm_intercept_db(svm); + else if (exit_code == SVM_EXIT_EXCP_BASE + BP_VECTOR && + svm->vcpu.guest_debug & KVM_GUESTDBG_USE_SW_BP) + vmexit = NESTED_EXIT_HOST; else vmexit = NESTED_EXIT_DONE; } @@@ -788,7 -803,7 +811,7 @@@ static bool nested_exit_on_intr(struct return (svm->nested.intercept & 1ULL); } -int svm_check_nested_events(struct kvm_vcpu *vcpu) +static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool block_nested_events = @@@ -829,7 -844,3 +852,7 @@@ int nested_svm_exit_special(struct vcpu return NESTED_EXIT_CONTINUE; } + +struct kvm_x86_nested_ops svm_nested_ops = { + .check_events = svm_check_nested_events, +}; diff --combined arch/x86/kvm/svm/svm.c index c86f7278509b,a862c768fd54..b627564e41f9 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@@ -33,7 -33,6 +33,7 @@@ #include #include #include +#include #include #include @@@ -1604,7 -1603,7 +1604,7 @@@ int svm_set_cr4(struct kvm_vcpu *vcpu, return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu, true); + svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@@ -1673,17 -1672,14 +1673,14 @@@ static void new_asid(struct vcpu_svm *s mark_dirty(svm->vmcb, VMCB_ASID); } - static u64 svm_get_dr6(struct kvm_vcpu *vcpu) + static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) { - return to_svm(vcpu)->vmcb->save.dr6; - } - - static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) - { - struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->save.dr6 = value; - mark_dirty(svm->vmcb, VMCB_DR); + if (unlikely(value != vmcb->save.dr6)) { + vmcb->save.dr6 = value; + mark_dirty(vmcb, VMCB_DR); + } } static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) @@@ -1694,9 -1690,12 +1691,12 @@@ get_debugreg(vcpu->arch.db[1], 1); 
get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); - vcpu->arch.dr6 = svm_get_dr6(vcpu); + /* + * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here, + * because db_interception might need it. We can do it before vmentry. + */ + vcpu->arch.dr6 = svm->vmcb->save.dr6; vcpu->arch.dr7 = svm->vmcb->save.dr7; - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; set_dr_intercepts(svm); } @@@ -1740,7 -1739,8 +1740,8 @@@ static int db_interception(struct vcpu_ if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { - kvm_queue_exception(&svm->vcpu, DB_VECTOR); + u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1; + kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); return 1; } @@@ -1753,6 -1753,8 +1754,8 @@@ if (svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; + kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = DB_VECTOR; @@@ -1840,25 -1842,6 +1843,25 @@@ static bool is_erratum_383(void return true; } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +} + static void svm_handle_mce(struct vcpu_svm *svm) { if (is_erratum_383()) { @@@ -1877,7 -1860,11 +1880,7 @@@ * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. */ - asm volatile ( - "int $0x12\n"); - /* not sure if we ever come back to this point */ - - return; + kvm_machine_check(); } static int mc_interception(struct vcpu_svm *svm) @@@ -3169,17 -3156,10 +3172,17 @@@ static int svm_set_identity_map_addr(st return 0; } -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +void svm_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * Flush only the current ASID even if the TLB flush was invoked via + * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all + * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and + * unconditionally does a TLB flush on both nested VM-Enter and nested + * VM-Exit (via kvm_mmu_reset_context()). 
+ */ if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; else @@@ -3299,21 -3279,10 +3302,21 @@@ static void svm_cancel_injection(struc svm_complete_interrupts(svm); } +static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + + return EXIT_FASTPATH_NONE; +} + void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); -static void svm_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) { + enum exit_fastpath_completion exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@@ -3325,7 -3294,7 +3328,7 @@@ * again. */ if (unlikely(svm->nested.exit_required)) - return; + return EXIT_FASTPATH_NONE; /* * Disable singlestep if we're injecting an interrupt/exception. @@@ -3349,6 -3318,15 +3352,15 @@@ svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* + * Run with all-zero DR6 unless needed, so that we can get the exact cause + * of a #DB. + */ + if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + svm_set_dr6(svm, vcpu->arch.dr6); + else + svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM); + clgi(); kvm_load_guest_xsave_state(vcpu); @@@ -3409,7 -3387,6 +3421,7 @@@ stgi(); /* Any pending NMI will happen here */ + exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@@ -3438,7 -3415,6 +3450,7 @@@ svm_handle_mce(svm); mark_all_clean(svm->vmcb); + return exit_fastpath; } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) @@@ -3740,8 -3716,13 +3752,8 @@@ out return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) @@@ -3837,13 -3818,6 +3849,13 @@@ static bool svm_need_emulation_on_page_ bool smap = cr4 & X86_CR4_SMAP; bool is_user = svm_get_cpl(vcpu) == 3; + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * @@@ -3902,9 -3876,9 +3914,9 @@@ static bool svm_apic_init_signal_blocke /* * TODO: Last condition latch INIT signals on vCPU when * vCPU is in guest-mode and vmcb12 defines intercept on INIT. - * To properly emulate the INIT intercept, SVM should implement - * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit() - * there if an INIT signal is pending. + * To properly emulate the INIT intercept, + * svm_check_nested_events() should call nested_svm_vmexit() + * if an INIT signal is pending. 
*/ return !gif_set(svm) || (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); @@@ -3967,18 -3941,14 +3979,16 @@@ static struct kvm_x86_ops svm_x86_ops _ .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .get_dr6 = svm_get_dr6, - .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .tlb_flush = svm_flush_tlb, + .tlb_flush_all = svm_flush_tlb, + .tlb_flush_current = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, + .tlb_flush_guest = svm_flush_tlb, .run = svm_vcpu_run, .handle_exit = handle_exit, @@@ -4032,8 -4002,6 +4042,8 @@@ .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, + .nested_ops = &svm_nested_ops, + .deliver_posted_interrupt = svm_deliver_avic_intr, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, @@@ -4048,9 -4016,14 +4058,9 @@@ .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .apic_init_signal_blocked = svm_apic_init_signal_blocked, - - .check_nested_events = svm_check_nested_events, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --combined arch/x86/kvm/vmx/nested.c index b516c24494e3,e44f33c82332..b644bbf85460 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@@ -307,7 -307,7 +307,7 @@@ static void vmx_switch_vmcs(struct kvm_ vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vmx_segment_cache_clear(vmx); + vmx_register_cache_reset(vcpu); } /* @@@ -328,19 -328,19 +328,19 @@@ static void nested_ept_inject_page_faul { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason; + u32 vm_exit_reason; unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { - exit_reason = EXIT_REASON_PML_FULL; + vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; } else if (fault->error_code & PFERR_RSVD_MASK) - exit_reason = EXIT_REASON_EPT_MISCONFIG; + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; else - exit_reason = EXIT_REASON_EPT_VIOLATION; + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); + nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } @@@ -1073,48 -1073,6 +1073,48 @@@ static bool nested_cr3_valid(struct kvm return (val & invalid_mask) == 0; } +/* + * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. + * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't + * enable VPID for L2 (implying it expects a TLB flush on VMX transitions). + * Here's why. + * + * If EPT is enabled by L0 a sync is never needed: + * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there + * cannot be unsync'd SPTEs for either L1 or L2. + * + * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter + * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings + * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush + * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't + * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit. 
+ * + * If EPT is disabled by L0: + * - if VPID is enabled by L1 (for L2), the situation is similar to when L1 + * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't + * required to invalidate linear mappings (EPT is disabled so there are + * no combined or guest-physical mappings), i.e. L1 can't rely on the + * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1). + * + * - however if VPID is disabled by L1, then a sync is needed as L1 expects all + * linear mappings (EPT is disabled so there are no combined or guest-physical + * mappings) to be invalidated on both VM-Enter and VM-Exit. + * + * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which + * additionally checks that L2 has been assigned a VPID (when EPT is disabled). + * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect + * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2 + * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has + * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1 + * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't + * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush + * stale TLB entries, at which point L0 will sync L2's MMU. + */ +static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) +{ + return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu)); +} + /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected @@@ -1142,14 -1100,8 +1142,14 @@@ static int nested_vmx_load_cr3(struct k } } + /* + * Unconditionally skip the TLB flush on fast CR3 switch, all TLB + * flushes are handled by nested_vmx_transition_tlb_flush(). See + * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. + */ if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, false); + kvm_mmu_new_pgd(vcpu, cr3, true, + !nested_vmx_transition_mmu_sync(vcpu)); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@@ -1180,48 -1132,11 +1180,48 @@@ static bool nested_has_guest_tlb_tag(st (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } -static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + bool is_vmenter) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; + /* + * If VPID is disabled, linear and combined mappings are flushed on + * VM-Enter/VM-Exit, and guest-physical mappings are valid only for + * their associated EPTP. + */ + if (!enable_vpid) + return; + + /* + * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings + * for *all* contexts to be flushed on VM-Enter/VM-Exit. + * + * If VPID is enabled and used by vmc12, but L2 does not have a unique + * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate + * a VPID for L2, flush the current context as the effective ASID is + * common to both L1 and L2. + * + * Defer the flush so that it runs after vmcs02.EPTP has been set by + * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid + * redundant flushes further down the nested pipeline. + * + * If a TLB flush isn't required due to any of the above, and vpid12 is + * changing then the new "virtual" VPID (vpid12) will reuse the same + * "real" VPID (vpid02), and so needs to be sync'd. 
There is no direct + * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for + * all nested vCPUs. + */ + if (!nested_cpu_has_vpid(vmcs12)) { + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } else if (!nested_has_guest_tlb_tag(vcpu)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } else if (is_vmenter && + vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + vpid_sync_context(nested_get_vpid02(vcpu)); + } } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) @@@ -1785,6 -1700,10 +1785,6 @@@ static int copy_enlightened_to_vmcs12(s * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; - * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; - * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; - * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; - * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; * vmcs12->page_fault_error_code_mask = * evmcs->page_fault_error_code_mask; * vmcs12->page_fault_error_code_match = @@@ -1858,6 -1777,10 +1858,6 @@@ static int copy_vmcs12_to_enlightened(s * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; - * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; - * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; - * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; - * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; * evmcs->tpr_threshold = vmcs12->tpr_threshold; * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; * evmcs->exception_bitmap = vmcs12->exception_bitmap; @@@ -2524,7 -2447,32 +2524,7 @@@ static int prepare_vmcs02(struct kvm_vc if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); - if (enable_vpid) { - /* - * There is no direct mapping between vpid02 and vpid12, the - * vpid02 is per-vCPU for L0 and reused while the value of - * vpid12 is changed w/ one invvpid during nested vmentry. - * The vpid12 is allocated by L1 for L2, so it will not - * influence global bitmap(for vpid01 and vpid02 allocation) - * even if spawn a lot of nested vCPUs. - */ - if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { - if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { - vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); - } - } else { - /* - * If L1 use EPT, then L0 needs to execute INVEPT on - * EPTP02 instead of EPTP01. Therefore, delay TLB - * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_MMU_PGD. Note that this assumes - * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest(). 
- */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - } + nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); @@@ -3250,9 -3198,6 +3250,9 @@@ enum nvmx_vmentry_status nested_vmx_ent u32 exit_reason = EXIT_REASON_INVALID_STATE; u32 exit_qual; + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) @@@ -3994,11 -3939,11 +3994,11 @@@ static void sync_vmcs02_to_vmcs12(struc * which already writes to vmcs12 directly. */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, + u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { /* update exit information fields: */ - vmcs12->vm_exit_reason = exit_reason; + vmcs12->vm_exit_reason = vm_exit_reason; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@@ -4095,7 -4040,24 +4095,7 @@@ static void load_vmcs12_host_state(stru if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - /* - * If vmcs01 doesn't use VPID, CPU flushes TLB on every - * VMEntry/VMExit. Thus, no need to flush TLB. - * - * If vmcs12 doesn't use VPID, L1 expects TLB to be - * flushed on every VMEntry/VMExit. - * - * Otherwise, we can preserve TLB entries as long as we are - * able to tag L1 TLB entries differently than L2 TLB entries. - * - * If vmcs12 uses EPT, we need to execute this flush on EPTP01 - * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_MMU_PGD. - */ - if (enable_vpid && - (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } + nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); @@@ -4242,7 -4204,7 +4242,7 @@@ static void nested_vmx_restore_host_sta * VMFail, like everything else we just need to ensure our * software model is up-to-date. */ - if (enable_ept) + if (enable_ept && is_pae_paging(vcpu)) ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); @@@ -4310,7 -4272,7 +4310,7 @@@ vmabort * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@@ -4319,10 -4281,6 +4319,10 @@@ /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Service the TLB flush request for L2 before switching to L1. 
*/ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) @@@ -4334,9 -4292,9 +4334,9 @@@ if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); - if (exit_reason != -1) - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); + if (vm_exit_reason != -1) + prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, + exit_intr_info, exit_qualification); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@@ -4386,20 -4344,20 +4386,20 @@@ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; - /* - * We are now running in L2, mmu_notifier will force to reload the - * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. - */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + if (vmx->nested.reload_vmcs01_apic_access_page) { + vmx->nested.reload_vmcs01_apic_access_page = false; + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + } - if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + if ((vm_exit_reason != -1) && + (enable_shadow_vmcs || vmx->nested.hv_evmcs)) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); @@@ -4407,7 -4365,7 +4407,7 @@@ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - if (exit_reason != -1) + if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, vmcs12->idt_vectoring_info_field, @@@ -4596,13 -4554,13 +4596,13 @@@ static int nested_vmx_get_vmptr(struct gva_t gva; struct x86_exception e; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, sizeof(*vmpointer), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } @@@ -4861,7 -4819,7 +4861,7 @@@ static int handle_vmread(struct kvm_vcp { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@@ -4911,7 -4869,7 +4911,7 @@@ return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } } @@@ -4947,7 -4905,7 +4947,7 @@@ static int handle_vmwrite(struct kvm_vc { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? 
get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@@ -4985,7 -4943,7 +4985,7 @@@ instr_info, false, len, &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } } @@@ -5132,7 -5090,7 +5132,7 @@@ static int handle_vmptrld(struct kvm_vc /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qual = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; @@@ -5150,33 -5108,23 +5150,33 @@@ /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, sizeof(gpa_t), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } return nested_vmx_succeed(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; - unsigned long type; + unsigned long type, roots_to_free; + struct kvm_mmu *mmu; gva_t gva; struct x86_exception e; struct { u64 eptp, gpa; } operand; + int i; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@@ -5200,49 -5148,27 +5200,49 @@@ /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } - switch (type) { - case VMX_EPT_EXTENT_GLOBAL: - case VMX_EPT_EXTENT_CONTEXT: /* - * TODO: Sync the necessary shadow EPT roots here, rather than - * at the next emulated VM-entry. + * Nested EPT roots are always held through guest_mmu, + * not root_mmu. 
*/ + mmu = &vcpu->arch.guest_mmu; + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if (!nested_vmx_check_eptp(vcpu, operand.eptp)) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + roots_to_free = 0; + if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_CURRENT; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (nested_ept_root_matches(mmu->prev_roots[i].hpa, + mmu->prev_roots[i].pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + break; + case VMX_EPT_EXTENT_GLOBAL: + roots_to_free = KVM_MMU_ROOTS_ALL; break; default: - BUG_ON(1); + BUG(); break; } + if (roots_to_free) + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); + return nested_vmx_succeed(vcpu); } @@@ -5282,11 -5208,11 +5282,11 @@@ static int handle_invvpid(struct kvm_vc /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } if (operand.vpid >> 16) @@@ -5300,37 -5226,27 +5300,37 @@@ is_noncanonical_address(operand.gla, vcpu)) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vpid02, operand.gla); - } else - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_context(vpid02); break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_context(vpid02); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } + /* + * Sync the shadow page tables if EPT is disabled, L1 is invalidating + * linear mappings for L2 (tagged with L2's VPID). Free all roots as + * VPIDs are not tracked in the MMU role. + * + * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share + * an MMU when EPT is disabled. + * + * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
+ */ + if (!enable_ept) + kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, + KVM_MMU_ROOTS_ALL); + return nested_vmx_succeed(vcpu); } @@@ -5411,8 -5327,8 +5411,8 @@@ static int handle_vmfunc(struct kvm_vcp fail: nested_vmx_vmexit(vcpu, vmx->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmx_get_intr_info(vcpu), + vmx_get_exit_qual(vcpu)); return 1; } @@@ -5463,7 -5379,7 +5463,7 @@@ static bool nested_vmx_exit_handled_io( if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@@ -5517,7 -5433,7 +5517,7 @@@ static bool nested_vmx_exit_handled_msr static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsigned long val; @@@ -5533,6 -5449,15 +5533,6 @@@ return true; break; case 3: - if ((vmcs12->cr3_target_count >= 1 && - vmcs12->cr3_target_value0 == val) || - (vmcs12->cr3_target_count >= 2 && - vmcs12->cr3_target_value1 == val) || - (vmcs12->cr3_target_count >= 3 && - vmcs12->cr3_target_value2 == val) || - (vmcs12->cr3_target_count >= 4 && - vmcs12->cr3_target_value3 == val)) - return false; if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) return true; break; @@@ -5626,85 -5551,49 +5626,85 @@@ static bool nested_vmx_exit_handled_mtf } /* - * Return true if we should exit from L2 to L1 to handle an exit, or false if we - * should handle it ourselves in L0 (and then continue L2). Only call this - * when in is_guest_mode (L2). + * Return true if L0 wants to handle an exit from L2 regardless of whether or not + * L1 wants the exit. Only call this when in is_guest_mode (L2). */ -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) { - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - WARN_ON_ONCE(vmx->nested.nested_run_pending); - - if (unlikely(vmx->fail)) { - trace_kvm_nested_vmenter_failed( - "hardware VM-instruction error: ", - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + u32 intr_info; switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) - return false; + return true; else if (is_page_fault(intr_info)) - return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; + return vcpu->arch.apf.host_apf_reason || !enable_ept; else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; + return true; else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; + return true; + return false; + case EXIT_REASON_EXTERNAL_INTERRUPT: + return true; + case EXIT_REASON_MCE_DURING_VMENTRY: + return true; + case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. 
If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return true; + case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ + return true; + case EXIT_REASON_PREEMPTION_TIMER: + return true; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return true; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return true; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return true; + default: + break; + } + return false; +} + +/* + * Return 1 if L1 wants to intercept an exit from L2. Only call this when in + * is_guest_mode (L2). + */ +static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 intr_info; + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); + if (is_nmi(intr_info)) + return true; + else if (is_page_fault(intr_info)) + return true; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: - return false; + return nested_exit_on_intr(vcpu); case EXIT_REASON_TRIPLE_FAULT: return true; case EXIT_REASON_INTERRUPT_WINDOW: @@@ -5769,7 -5658,7 +5769,7 @@@ nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: - return false; + return true; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: @@@ -5781,6 -5670,22 +5781,6 @@@ * delivery" only come from vmcs12. */ return true; - case EXIT_REASON_EPT_VIOLATION: - /* - * L0 always deals with the EPT violation. If nested EPT is - * used, and the nested mmu code discovers that the address is - * missing in the guest EPT table (EPT12), the EPT violation - * will be injected with nested_ept_inject_page_fault() - */ - return false; - case EXIT_REASON_EPT_MISCONFIG: - /* - * L2 never uses directly L1's EPT, but rather L0's own EPT - * table (shadow on EPT) or a merged EPT table that L0 built - * (EPT on EPT). So any problems with the structure of the - * table is L0's fault. - */ - return false; case EXIT_REASON_INVPCID: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && @@@ -5797,6 -5702,17 +5797,6 @@@ * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; - case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ - return false; - case EXIT_REASON_VMFUNC: - /* VM functions are emulated through L2->L0 vmexits. */ - return false; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return false; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@@ -5806,67 -5722,6 +5806,67 @@@ } } +/* + * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was + * reflected into L1. 
+ */ +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; + unsigned long exit_qual; + u32 exit_intr_info; + + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + /* + * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM + * has already loaded L2's state. + */ + if (unlikely(vmx->fail)) { + trace_kvm_nested_vmenter_failed( + "hardware VM-instruction error: ", + vmcs_read32(VM_INSTRUCTION_ERROR)); + exit_intr_info = 0; + exit_qual = 0; + goto reflect_vmexit; + } + + exit_intr_info = vmx_get_intr_info(vcpu); + exit_qual = vmx_get_exit_qual(vcpu); + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, + vmx->idt_vectoring_info, exit_intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + + /* If L0 (KVM) wants the exit, it trumps L1's desires. */ + if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) + return false; + + /* If L1 doesn't want the exit, handle it in L0. */ + if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) + return false; + + /* + * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For + * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would + * need to be synthesized by querying the in-kernel LAPIC, but external + * interrupts are never reflected to L1 so it's a non-issue. + */ + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + +reflect_vmexit: + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); + return true; +} static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, @@@ -6176,7 -6031,7 +6176,7 @@@ void nested_vmx_setup_ctls_msrs(struct * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_reflected() will not pass related exits to L1. + * nested_vmx_l1_wants_exit() will not pass related exits to L1. * These rules have exceptions below. 
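/*
 * Illustrative sketch (not part of the patch) of the precedence applied by
 * nested_vmx_reflect_vmexit() above: L0's needs trump L1's, and only an exit
 * that L1 explicitly wants is reflected.  The two predicates are trivial
 * stand-ins for nested_vmx_l0_wants_exit()/nested_vmx_l1_wants_exit().
 */
#include <stdbool.h>

static bool l0_wants_exit(unsigned int exit_reason) { return exit_reason == 2; }
static bool l1_wants_exit(unsigned int exit_reason) { return exit_reason != 1; }

/* Returns true if the exit is reflected into L1, false if L0 handles it. */
bool reflect_vmexit(unsigned int exit_reason)
{
	if (l0_wants_exit(exit_reason))		/* L0 (KVM) wants it: never reflect */
		return false;

	if (!l1_wants_exit(exit_reason))	/* L1 doesn't care: handle in L0    */
		return false;

	return true;				/* ~ nested_vmx_vmexit() would run  */
}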
*/ @@@ -6441,14 -6296,12 +6441,14 @@@ __init int nested_vmx_hardware_setup(st exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - ops->check_nested_events = vmx_check_nested_events; - ops->get_nested_state = vmx_get_nested_state; - ops->set_nested_state = vmx_set_nested_state; - ops->get_vmcs12_pages = nested_get_vmcs12_pages; - ops->nested_enable_evmcs = nested_enable_evmcs; - ops->nested_get_evmcs_version = nested_get_evmcs_version; - return 0; } + +struct kvm_x86_nested_ops vmx_nested_ops = { + .check_events = vmx_check_nested_events, + .get_state = vmx_get_nested_state, + .set_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .enable_evmcs = nested_enable_evmcs, + .get_evmcs_version = nested_get_evmcs_version, +}; diff --combined arch/x86/kvm/vmx/vmx.c index 455cd2c8dbce,89c766fad889..46aa3ca01929 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@@ -437,11 -437,6 +437,11 @@@ static const struct kvm_vmx_segment_fie VMX_SEGMENT_FIELD(LDTR), }; +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + static unsigned long host_idt_base; /* @@@ -1343,10 -1338,6 +1343,10 @@@ void vmx_vcpu_load_vmcs(struct kvm_vcp void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. + */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* @@@ -1381,7 -1372,6 +1381,6 @@@ void vmx_vcpu_load(struct kvm_vcpu *vcp vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); vmx->host_debugctlmsr = get_debugctlmsr(); } @@@ -2847,64 -2837,18 +2846,64 @@@ static void exit_lmode(struct kvm_vcpu #endif -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * INVEPT must be issued when EPT is enabled, irrespective of VPID, as + * the CPU is not required to invalidate guest-physical mappings on + * VM-Entry, even if VPID is disabled. Guest-physical mappings are + * associated with the root EPT structure and not any particular VPID + * (INVVPID also isn't required to invalidate guest-physical mappings). + */ + if (enable_ept) { + ept_sync_global(); + } else if (enable_vpid) { + if (cpu_has_vmx_invvpid_global()) { + vpid_sync_vcpu_global(); + } else { + vpid_sync_vcpu_single(vmx->vpid); + vpid_sync_vcpu_single(vmx->nested.vpid02); + } + } +} + +static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { - int vpid = to_vmx(vcpu)->vpid; + u64 root_hpa = vcpu->arch.mmu->root_hpa; + + /* No flush required if the current context is invalid. */ + if (!VALID_PAGE(root_hpa)) + return; + + if (enable_ept) + ept_sync_context(construct_eptp(vcpu, root_hpa)); + else if (!is_guest_mode(vcpu)) + vpid_sync_context(to_vmx(vcpu)->vpid); + else + vpid_sync_context(nested_get_vpid02(vcpu)); +} - if (!vpid_sync_vcpu_addr(vpid, addr)) - vpid_sync_context(vpid); +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + /* + * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vmx_flush_tlb_guest() for an explanation of why this is ok. + */ + vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); +} +static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ /* - * If VPIDs are not supported or enabled, then the above is a no-op. 
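/*
 * Illustrative sketch (not part of the patch): how the four flush hooks
 * added above map onto hardware invalidations.  The nested vpid02 case and
 * the global-vs-single INVVPID fallback are deliberately omitted; the
 * *_stub() helpers stand in for the real ept_sync_*()/vpid_sync_*() wrappers.
 */
#include <stdbool.h>
#include <stdio.h>

enum flush_scope { FLUSH_ALL, FLUSH_CURRENT, FLUSH_GVA, FLUSH_GUEST };

static void ept_sync_global_stub(void)             { puts("INVEPT, all contexts"); }
static void ept_sync_context_stub(unsigned long e) { printf("INVEPT, eptp %#lx\n", e); }
static void vpid_sync_context_stub(int v)          { printf("INVVPID, vpid %d\n", v); }
static void vpid_sync_addr_stub(int v, unsigned long gva)
{
	printf("INVVPID, vpid %d, gva %#lx\n", v, gva);
}

void flush(enum flush_scope scope, bool ept, unsigned long eptp,
	   int vpid, unsigned long gva)
{
	switch (scope) {
	case FLUSH_ALL:			/* ~ vmx_flush_tlb_all()     */
		if (ept)
			ept_sync_global_stub();
		else
			vpid_sync_context_stub(vpid);
		break;
	case FLUSH_CURRENT:		/* ~ vmx_flush_tlb_current() */
		if (ept)
			ept_sync_context_stub(eptp);
		else
			vpid_sync_context_stub(vpid);
		break;
	case FLUSH_GVA:			/* ~ vmx_flush_tlb_gva()     */
		vpid_sync_addr_stub(vpid, gva);
		break;
	case FLUSH_GUEST:		/* ~ vmx_flush_tlb_guest()   */
		vpid_sync_context_stub(vpid);
		break;
	}
}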
- * But we don't really need a TLB flush in that case anyway, because - * each VM entry/exit includes an implicit flush when VPID is 0. + * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 + * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit + * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), + * i.e. no explicit INVVPID is necessary. */ + vpid_sync_context(to_vmx(vcpu)->vpid); } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@@ -2942,13 -2886,12 +2941,13 @@@ void ept_save_pdptrs(struct kvm_vcpu *v { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (is_pae_paging(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - } + if (WARN_ON_ONCE(!is_pae_paging(vcpu))) + return; + + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } @@@ -3040,15 -2983,16 +3039,15 @@@ u64 construct_eptp(struct kvm_vcpu *vcp return eptp; } -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; - guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(vcpu, cr3); + eptp = construct_eptp(vcpu, pgd); vmcs_write64(EPT_POINTER, eptp); if (kvm_x86_ops.tlb_remote_flush) { @@@ -3069,8 -3013,6 +3068,8 @@@ else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; ept_load_pdptrs(vcpu); + } else { + guest_cr3 = pgd; } if (update_guest_cr3) @@@ -4645,6 -4587,26 +4644,26 @@@ static int handle_machine_check(struct return 1; } + /* + * If the host has split lock detection disabled, then #AC is + * unconditionally injected into the guest, which is the pre split lock + * detection behaviour. 
+ * + * If the host has split lock detection enabled then #AC is + * only injected into the guest when: + * - Guest CPL == 3 (user mode) + * - Guest has #AC detection enabled in CR0 + * - Guest EFLAGS has AC bit set + */ + static inline bool guest_inject_ac(struct kvm_vcpu *vcpu) + { + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return true; + + return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); + } + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@@ -4698,7 -4660,7 +4717,7 @@@ } if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); + cr2 = vmx_get_exit_qual(vcpu); /* EPT won't cause page fault directly */ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); @@@ -4710,22 -4672,17 +4729,17 @@@ return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { - case AC_VECTOR: - kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); - return 1; case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); + dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: @@@ -4741,6 -4698,20 +4755,20 @@@ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; + case AC_VECTOR: + if (guest_inject_ac(vcpu)) { + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; + } + + /* + * Handle split lock. Depending on detection mode this will + * either warn and disable split lock detection for this + * task or force SIGBUS on it. + */ + if (handle_guest_split_lock(kvm_rip_read(vcpu))) + return 1; + fallthrough; default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; @@@ -4769,7 -4740,7 +4797,7 @@@ static int handle_io(struct kvm_vcpu *v int size, in, string; unsigned port; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); string = (exit_qualification & 16) != 0; ++vcpu->stat.io_exits; @@@ -4860,7 -4831,7 +4888,7 @@@ static int handle_cr(struct kvm_vcpu *v int err; int ret; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { @@@ -4937,7 -4908,7 +4965,7 @@@ static int handle_dr(struct kvm_vcpu *v unsigned long exit_qualification; int dr, dr7, reg; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ @@@ -4955,16 -4926,14 +4983,14 @@@ * guest debugging itself. 
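/*
 * Illustrative sketch (not part of the patch): the policy implemented by
 * guest_inject_ac() above, restated as a standalone helper.  Without host
 * split-lock detection every #AC is forwarded; with it, only a "legacy"
 * alignment-check #AC (CPL 3, CR0.AM and EFLAGS.AC all set) is injected,
 * anything else is handled as a host split-lock event.
 */
#include <stdbool.h>

#define CR0_AM		(1UL << 18)	/* alignment mask       */
#define EFLAGS_AC	(1UL << 18)	/* alignment check flag */

bool should_inject_ac(bool host_split_lock_detect, int cpl,
		      unsigned long cr0, unsigned long rflags)
{
	if (!host_split_lock_detect)
		return true;

	return cpl == 3 && (cr0 & CR0_AM) && (rflags & EFLAGS_AC);
}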
*/ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); return 1; } } @@@ -4995,15 -4964,6 +5021,6 @@@ return kvm_skip_emulated_instruction(vcpu); } - static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) - { - return vcpu->arch.dr6; - } - - static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) - { - } - static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); @@@ -5050,7 -5010,7 +5067,7 @@@ static int handle_invd(struct kvm_vcpu static int handle_invlpg(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); kvm_mmu_invlpg(vcpu, exit_qualification); return kvm_skip_emulated_instruction(vcpu); @@@ -5082,7 -5042,7 +5099,7 @@@ static int handle_xsetbv(struct kvm_vcp static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; @@@ -5103,7 -5063,7 +5120,7 @@@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ @@@ -5113,7 -5073,7 +5130,7 @@@ static int handle_apic_write(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ @@@ -5134,7 -5094,7 +5151,7 @@@ static int handle_task_switch(struct kv idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { @@@ -5184,7 -5144,7 +5201,7 @@@ static int handle_ept_violation(struct gpa_t gpa; u64 error_code; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@@ -5444,13 -5404,13 +5461,13 @@@ static int handle_invpcid(struct kvm_vc /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } @@@ -5479,11 -5439,11 +5496,11 @@@ if (kvm_get_active_pcid(vcpu) == operand.pcid) { kvm_mmu_sync_roots(vcpu); - 
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) == operand.pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); @@@ -5520,7 -5480,7 +5537,7 @@@ static int handle_pml_full(struct kvm_v trace_kvm_pml_full(vcpu->vcpu_id); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * PML buffer FULL happened while executing iret from NMI, @@@ -5634,8 -5594,8 +5651,8 @@@ static const int kvm_vmx_max_exit_handl static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); + *info1 = vmx_get_exit_qual(vcpu); + *info2 = vmx_get_intr_info(vcpu); } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@@ -5717,6 -5677,7 +5734,6 @@@ void dump_vmcs(void u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; unsigned long cr4; u64 efer; - int i, n; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); @@@ -5853,6 -5814,14 +5870,6 @@@ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); - for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); - if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); @@@ -5902,8 -5871,8 +5919,8 @@@ static int vmx_handle_exit(struct kvm_v */ nested_mark_vmcs12_pages_dirty(vcpu); - if (nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); + if (nested_vmx_reflect_vmexit(vcpu)) + return 1; } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { @@@ -6124,15 -6093,7 +6141,15 @@@ void vmx_set_virtual_apic_mode(struct k if (flexpriority_enabled) { sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb(vcpu, true); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + /* + * Flush the TLB, reloading the APIC access page will + * only do so if its physical address has changed, but + * the guest may have inserted a non-APIC mapping into + * the TLB while the APIC access page was disabled. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } break; case LAPIC_MODE_X2APIC: @@@ -6146,32 -6107,12 +6163,32 @@@ vmx_update_msr_bitmap(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu)) { - vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb(vcpu, true); + struct page *page; + + /* Defer reload until vmcs01 is the current VMCS. 
*/ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; + return; } + + if (!(secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return; + + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; + + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + vmx_flush_tlb_current(vcpu); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. + */ + put_page(page); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@@ -6289,16 -6230,16 +6306,16 @@@ static void vmx_apicv_post_state_restor static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) { + if (is_page_fault(intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(vmx->exit_intr_info)) { + } else if (is_machine_check(intr_info)) { kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@@ -6313,8 -6254,9 +6330,8 @@@ static void handle_external_interrupt_i unsigned long tmp; #endif gate_desc *desc; - u32 intr_info; + u32 intr_info = vmx_get_intr_info(vcpu); - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@@ -6350,7 -6292,8 +6367,7 @@@ } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@@ -6358,6 -6301,9 +6375,6 @@@ handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); - else if (!is_guest_mode(vcpu) && - vmx->exit_reason == EXIT_REASON_MSR_WRITE) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static bool vmx_has_emulated_msr(int index) @@@ -6391,8 -6337,11 +6408,8 @@@ static void vmx_recover_nmi_blocking(st if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. 
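/*
 * Illustrative sketch (not part of the patch) of the reload flow introduced
 * above: while L2 is active the reload is merely recorded (vmcs01 is not the
 * current VMCS), otherwise the page is looked up, programmed into the VMCS,
 * the current context is flushed and the page is immediately unpinned so the
 * MMU notifier can migrate it later.  The real code also bails out when the
 * APIC-access control is not enabled; the helpers below are stand-ins for
 * gfn_to_page()/is_error_page()/page_to_phys()/put_page()/vmcs_write64().
 */
#include <stdbool.h>
#include <stdint.h>

struct page;
struct page *lookup_apic_page(void);
bool page_is_error(struct page *page);
uint64_t page_phys(struct page *page);
void unpin_page(struct page *page);
void write_apic_access_addr(uint64_t hpa);
void flush_current_context(void);

void reload_apic_access_page(bool in_guest_mode, bool *reload_when_vmcs01)
{
	struct page *page;

	if (in_guest_mode) {
		/* ~ nested.reload_vmcs01_apic_access_page = true */
		*reload_when_vmcs01 = true;
		return;
	}

	page = lookup_apic_page();
	if (page_is_error(page))
		return;

	write_apic_access_addr(page_phys(page));
	flush_current_context();
	unpin_page(page);	/* do not keep the page pinned */
}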
- */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + exit_intr_info = vmx_get_intr_info(&vmx->vcpu); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@@ -6561,9 -6510,8 +6578,9 @@@ void vmx_update_host_rsp(struct vcpu_vm bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) { + enum exit_fastpath_completion exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@@ -6575,7 -6523,7 +6592,7 @@@ /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) - return; + return EXIT_FASTPATH_NONE; if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@@ -6615,11 -6563,6 +6632,6 @@@ kvm_load_guest_xsave_state(vcpu); - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && - vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vcpu->arch.pkru); - pt_guest_enter(vmx); if (vcpu_to_pmu(vcpu)->version) @@@ -6700,51 -6643,32 +6712,39 @@@ loadsegment(es, __USER_DS); #endif - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); - vcpu->arch.regs_dirty = 0; + vmx_register_cache_reset(vcpu); pt_guest_exit(vmx); - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = rdpkru(); - if (vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vmx->host_pkru); - } - kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; - vmx->exit_reason = vmx->fail ? 
0xdead : vmcs_read32(VM_EXIT_REASON); - if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + if (unlikely(vmx->fail)) { + vmx->exit_reason = 0xdead; + return EXIT_FASTPATH_NONE; + } + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; + if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return EXIT_FASTPATH_NONE; + + if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE) + exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); + else + exit_fastpath = EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); + + return exit_fastpath; } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@@ -7785,18 -7709,14 +7785,16 @@@ static struct kvm_x86_ops vmx_x86_ops _ .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .get_dr6 = vmx_get_dr6, - .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .tlb_flush = vmx_flush_tlb, + .tlb_flush_all = vmx_flush_tlb_all, + .tlb_flush_current = vmx_flush_tlb_current, .tlb_flush_gva = vmx_flush_tlb_gva, + .tlb_flush_guest = vmx_flush_tlb_guest, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, @@@ -7862,7 -7782,6 +7860,7 @@@ .post_block = vmx_post_block, .pmu_ops = &intel_pmu_ops, + .nested_ops = &vmx_nested_ops, .update_pi_irte = vmx_update_pi_irte, @@@ -7878,6 -7797,12 +7876,6 @@@ .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .check_nested_events = NULL, - .get_nested_state = NULL, - .set_nested_state = NULL, - .get_vmcs12_pages = NULL, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, }; diff --combined arch/x86/kvm/x86.c index 8c0b77ac8dc6,d11eba8b85c6..542a00008caa --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@@ -97,6 -97,9 +97,6 @@@ static u64 __read_mostly efer_reserved_ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) 
offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ - #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@@ -191,44 -194,45 +191,44 @@@ u64 __read_mostly supported_xss EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "req_event", VCPU_STAT(req_event) }, - { "l1d_flush", VCPU_STAT(l1d_flush) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages, .mode = 0444) }, - { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, - { "max_mmu_page_hash_collisions", - VM_STAT(max_mmu_page_hash_collisions) }, + VCPU_STAT("pf_fixed", pf_fixed), + VCPU_STAT("pf_guest", pf_guest), + VCPU_STAT("tlb_flush", tlb_flush), + VCPU_STAT("invlpg", invlpg), + VCPU_STAT("exits", exits), + VCPU_STAT("io_exits", io_exits), + VCPU_STAT("mmio_exits", mmio_exits), + VCPU_STAT("signal_exits", signal_exits), + VCPU_STAT("irq_window", irq_window_exits), + VCPU_STAT("nmi_window", nmi_window_exits), + VCPU_STAT("halt_exits", halt_exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hypercalls", hypercalls), + VCPU_STAT("request_irq", request_irq_exits), + VCPU_STAT("irq_exits", irq_exits), + VCPU_STAT("host_state_reload", host_state_reload), + VCPU_STAT("fpu_reload", fpu_reload), + VCPU_STAT("insn_emulation", insn_emulation), + VCPU_STAT("insn_emulation_fail", insn_emulation_fail), + VCPU_STAT("irq_injections", irq_injections), + VCPU_STAT("nmi_injections", nmi_injections), + VCPU_STAT("req_event", req_event), + VCPU_STAT("l1d_flush", l1d_flush), + VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), + VM_STAT("mmu_pte_write", mmu_pte_write), + VM_STAT("mmu_pte_updated", mmu_pte_updated), + VM_STAT("mmu_pde_zapped", mmu_pde_zapped), + VM_STAT("mmu_flooded", mmu_flooded), + VM_STAT("mmu_recycled", mmu_recycled), + 
VM_STAT("mmu_cache_miss", mmu_cache_miss), + VM_STAT("mmu_unsync", mmu_unsync), + VM_STAT("remote_tlb_flush", remote_tlb_flush), + VM_STAT("largepages", lpages, .mode = 0444), + VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), + VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), { NULL } }; @@@ -568,11 -572,12 +568,12 @@@ void kvm_requeue_exception(struct kvm_v } EXPORT_SYMBOL_GPL(kvm_requeue_exception); - static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, - unsigned long payload) + void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, + unsigned long payload) { kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); } + EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) @@@ -607,28 -612,15 +608,28 @@@ void kvm_inject_page_fault(struct kvm_v } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu->inject_page_fault(vcpu, fault); + struct kvm_mmu *fault_mmu; + WARN_ON_ONCE(fault->vector != PF_VECTOR); + + fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : + vcpu->arch.walk_mmu; + + /* + * Invalidate the TLB entry for the faulting address, if it exists, + * else the access will fault indefinitely (and to emulate hardware). + */ + if ((fault->error_code & PFERR_PRESENT_MASK) && + !(fault->error_code & PFERR_RSVD_MASK)) + kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, + fault_mmu->root_hpa); + fault_mmu->inject_page_fault(vcpu, fault); return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@@ -845,11 -837,25 +846,25 @@@ void kvm_load_guest_xsave_state(struct vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } + + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && + vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + vcpu->arch.pkru = rdpkru(); + if (vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.host_pkru); + } + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@@ -935,19 -941,6 +950,6 @@@ EXPORT_SYMBOL_GPL(kvm_set_xcr) __reserved_bits; \ }) - static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) - { - u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - - if (kvm_cpu_cap_has(X86_FEATURE_LA57)) - reserved_bits &= ~X86_CR4_LA57; - - if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) - reserved_bits &= ~X86_CR4_UMIP; - - return reserved_bits; - } - static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) @@@ -1015,7 -1008,7 +1017,7 @@@ int kvm_set_cr3(struct kvm_vcpu *vcpu, if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { if (!skip_tlb_flush) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } return 0; } @@@ 
-1027,7 -1020,7 +1029,7 @@@ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); + kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@@ -1067,12 -1060,6 +1069,6 @@@ static void kvm_update_dr0123(struct kv } } - static void kvm_update_dr6(struct kvm_vcpu *vcpu) - { - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); - } - static void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@@ -1112,7 -1099,6 +1108,6 @@@ static int __kvm_set_dr(struct kvm_vcp if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); - kvm_update_dr6(vcpu); break; case 5: /* fall through */ @@@ -1148,10 -1134,7 +1143,7 @@@ int kvm_get_dr(struct kvm_vcpu *vcpu, i case 4: /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *val = vcpu->arch.dr6; - else - *val = kvm_x86_ops.get_dr6(vcpu); + *val = vcpu->arch.dr6; break; case 5: /* fall through */ @@@ -2686,16 -2669,10 +2678,16 @@@ static void kvmclock_reset(struct kvm_v vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush_all(vcpu); +} + +static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_guest(vcpu); } static void record_steal_time(struct kvm_vcpu *vcpu) @@@ -2721,7 -2698,7 +2713,7 @@@ trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb(vcpu, false); + kvm_vcpu_flush_tlb_guest(vcpu); vcpu->arch.st.preempted = 0; @@@ -3075,6 -3052,17 +3067,17 @@@ int kvm_get_msr_common(struct kvm_vcpu case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: case MSR_F15H_EX_CFG: + /* + * Intel Sandy Bridge CPUs must support the RAPL (running average power + * limit) MSRs. Just return 0, as we do not want to expose the host + * data here. Do not conditionalize this on CPUID, as KVM does not do + * so for existing CPU-specific MSRs. + */ + case MSR_RAPL_POWER_UNIT: + case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ + case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ + case MSR_PKG_ENERGY_STATUS: /* Total package */ + case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: @@@ -3389,6 -3377,7 +3392,7 @@@ int kvm_vm_ioctl_check_extension(struc case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_SYNC_REGS: @@@ -3442,14 -3431,14 +3446,14 @@@ r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops.get_nested_state ? - kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.nested_ops->get_state ? 
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops.nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; default: break; @@@ -3574,6 -3563,9 +3578,9 @@@ void kvm_arch_vcpu_load(struct kvm_vcp kvm_x86_ops.vcpu_load(vcpu, cpu); + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); + /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@@ -4025,7 -4017,6 +4032,6 @@@ static int kvm_vcpu_ioctl_x86_set_debug memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); @@@ -4235,9 -4226,9 +4241,9 @@@ static int kvm_vcpu_ioctl_enable_cap(st return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_enable_evmcs) + if (!kvm_x86_ops.nested_ops->enable_evmcs) return -ENOTTY; - r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@@ -4552,7 -4543,7 +4558,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops.get_nested_state) + if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@@ -4560,8 -4551,8 +4566,8 @@@ if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, - user_data_size); + r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, + user_data_size); if (r < 0) break; @@@ -4582,7 -4573,7 +4588,7 @@@ int idx; r = -EINVAL; - if (!kvm_x86_ops.set_nested_state) + if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; @@@ -4604,7 -4595,7 +4610,7 @@@ break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@@ -5064,10 -5055,13 +5070,13 @@@ set_identity_unlock r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(u.ps))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); + set_pit_out: + mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { @@@ -5087,10 -5081,13 +5096,13 @@@ r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); + set_pit2_out: + mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { @@@ -5854,6 -5851,7 +5866,7 @@@ static int emulator_cmpxchg_emulated(st { struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + u64 page_line_mask; gpa_t gpa; char *kaddr; bool exchanged; @@@ -5868,7 -5866,16 +5881,16 @@@ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + /* + * Emulate the atomic as a straight write to avoid #AC if SLD is + * enabled in 
the host and the access splits a cache line. + */ + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + page_line_mask = ~(cache_line_size() - 1); + else + page_line_mask = PAGE_MASK; + + if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) @@@ -6396,7 -6403,7 +6418,7 @@@ static bool inject_emulated_exception(s { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, @@@ -6659,7 -6666,7 +6681,7 @@@ static int kvm_vcpu_do_singlestep(struc if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; @@@ -6719,9 -6726,7 +6741,7 @@@ static bool kvm_vcpu_check_breakpoint(s vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); *r = 1; return true; } @@@ -7699,8 -7704,8 +7719,8 @@@ static int inject_pending_event(struct * from L2 to L1 due to pending L1 events which require exit * from L2 to L1. */ - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { - r = kvm_x86_ops.check_nested_events(vcpu); + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); if (r != 0) return r; } @@@ -7761,8 -7766,8 +7781,8 @@@ * proposal and current concerns. Perhaps we should be setting * KVM_REQ_EVENT only on certain events and not unconditionally? */ - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { - r = kvm_x86_ops.check_nested_events(vcpu); + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); if (r != 0) return r; } @@@ -8042,7 -8047,7 +8062,7 @@@ void kvm_make_scan_ioapic_request_mask( zalloc_cpumask_var(&cpus, GFP_ATOMIC); kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - vcpu_bitmap, cpus); + NULL, vcpu_bitmap, cpus); free_cpumask_var(cpus); } @@@ -8072,6 -8077,7 +8092,7 @@@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + struct kvm_vcpu *except; unsigned long old, new, expected; if (!kvm_x86_ops.check_apicv_inhibit_reasons || @@@ -8096,7 -8102,17 +8117,17 @@@ trace_kvm_apicv_update_request(activate, bit); if (kvm_x86_ops.pre_update_apicv_exec_ctrl) kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate); - kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + + /* + * Sending request to update APICV for all other vcpus, + * while update the calling vcpu immediately instead of + * waiting for another #VMEXIT to handle the request. 
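/*
 * Illustrative sketch (not part of the patch): the emulator_cmpxchg_emulated()
 * change above falls back to a plain write when the access would cross a
 * cache line while split-lock detection is on.  The crossing test reduced to
 * a standalone helper; 64 bytes is an assumed cache-line size and 4096 an
 * assumed page size.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK_SKETCH	(~((uint64_t)4096 - 1))

static bool crosses_boundary(uint64_t gpa, unsigned int bytes,
			     bool split_lock_detect, unsigned int line_size)
{
	uint64_t mask = split_lock_detect ? ~((uint64_t)line_size - 1)
					  : PAGE_MASK_SKETCH;

	/* True when the first and last byte land in different units. */
	return ((gpa + bytes - 1) & mask) != (gpa & mask);
}

int main(void)
{
	/* An 8-byte cmpxchg at offset 60 of a 64-byte line crosses it. */
	printf("%d %d\n",
	       crosses_boundary(60, 8, true, 64),	/* 1: emulate as write  */
	       crosses_boundary(60, 8, false, 64));	/* 0: page-bounded only */
	return 0;
}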
+ */ + except = kvm_get_running_vcpu(); + kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, + except); + if (except) + kvm_vcpu_update_apicv(except); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@@ -8153,13 -8169,24 +8184,13 @@@ int kvm_arch_mmu_notifier_invalidate_ra void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; - if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops.set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) - return; - kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page)); - - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - put_page(page); + kvm_x86_ops.set_apic_access_page_addr(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@@ -8179,13 -8206,13 +8210,13 @@@ static int vcpu_enter_guest(struct kvm_ bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); - enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; + enum exit_fastpath_completion exit_fastpath; bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) { + if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { r = 0; goto out; } @@@ -8207,17 -8234,8 +8238,17 @@@ kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu, true); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + kvm_vcpu_flush_tlb_all(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@@ -8406,7 -8424,7 +8437,7 @@@ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops.run(vcpu); + exit_fastpath = kvm_x86_ops.run(vcpu); /* * Do this here before restoring debug registers on the host. 
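/*
 * Illustrative sketch (not part of the patch) of the request handling added
 * to vcpu_enter_guest() above: three flush requests with different scopes,
 * where a full flush subsumes (clears) a pending current-context flush.
 * Requests are modelled as plain bits here.
 */
#include <stdio.h>

#define REQ_TLB_FLUSH		(1U << 0)	/* all contexts             */
#define REQ_TLB_FLUSH_CURRENT	(1U << 1)	/* current EPTP/VPID only   */
#define REQ_HV_TLB_FLUSH	(1U << 2)	/* guest-visible (Hyper-V)  */

static unsigned int service_flush_requests(unsigned int pending)
{
	if (pending & REQ_TLB_FLUSH) {
		puts("flush all");
		/* Flushing all ASIDs flushes the current ASID too. */
		pending &= ~(REQ_TLB_FLUSH | REQ_TLB_FLUSH_CURRENT);
	}
	if (pending & REQ_TLB_FLUSH_CURRENT) {
		puts("flush current");
		pending &= ~REQ_TLB_FLUSH_CURRENT;
	}
	if (pending & REQ_HV_TLB_FLUSH) {
		puts("flush guest mappings");
		pending &= ~REQ_HV_TLB_FLUSH;
	}
	return pending;
}

int main(void)
{
	/* With both ALL and CURRENT pending, only one hardware flush runs. */
	return service_flush_requests(REQ_TLB_FLUSH | REQ_TLB_FLUSH_CURRENT);
}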
And @@@ -8418,7 -8436,6 +8449,6 @@@ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops.sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); - kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } @@@ -8438,7 -8455,7 +8468,7 @@@ vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath); + kvm_x86_ops.handle_exit_irqoff(vcpu); /* * Consume any pending interrupts, including the possible source of @@@ -8527,8 -8544,8 +8557,8 @@@ static inline int vcpu_block(struct kv static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) - kvm_x86_ops.check_nested_events(vcpu); + if (is_guest_mode(vcpu)) + kvm_x86_ops.nested_ops->check_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@@ -8710,9 -8727,8 +8740,9 @@@ static void kvm_put_guest_fpu(struct kv trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); @@@ -8730,18 -8746,18 +8760,18 @@@ r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@@ -8771,7 -8787,7 +8801,7 @@@ out: kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@@ -9480,7 -9496,6 +9510,6 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = DR6_INIT; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@@ -9662,7 -9677,9 +9691,9 @@@ int kvm_arch_hardware_setup(void *opaqu if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; - cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); + #undef __kvm_cpu_cap_has if (kvm_has_tsc_control) { /* @@@ -9694,7 -9711,8 +9725,8 @@@ int kvm_arch_check_processor_compat(voi WARN_ON(!irqs_disabled()); - if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; return ops->check_processor_compatibility(); diff --combined include/linux/kvm_host.h index 3cc6ccbb1183,131cc1527d68..abfa71cb5d2d --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@@ -503,7 -503,6 +503,7 @@@ struct kvm struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; + unsigned int max_halt_poll_ns; }; #define kvm_err(fmt, ...) 
\ @@@ -814,8 -813,11 +814,11 @@@ void kvm_flush_remote_tlbs(struct kvm * void kvm_reload_remote_mmus(struct kvm *kvm); bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); + bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap); @@@ -867,7 -869,7 +870,7 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); int kvm_arch_init(void *opaque); void kvm_arch_exit(void); @@@ -1131,11 -1133,6 +1134,11 @@@ struct kvm_stats_debugfs_item #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) +#define VM_STAT(n, x, ...) \ + { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ } +#define VCPU_STAT(n, x, ...) \ + { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ } + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; @@@ -1358,12 -1355,6 +1361,12 @@@ static inline void kvm_vcpu_set_dy_elig } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) +{ + return (memslot && memslot->id < KVM_USER_MEM_SLOTS && + !(memslot->flags & KVM_MEMSLOT_INVALID)); +} + struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); diff --combined tools/testing/selftests/kvm/.gitignore index 5947cc119abc,a9b2b48947ff..222e50104296 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@@ -1,3 -1,4 +1,4 @@@ + # SPDX-License-Identifier: GPL-2.0-only /s390x/memop /s390x/resets /s390x/sync_regs_test @@@ -6,6 -7,7 +7,6 @@@ /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test -/x86_64/set_memory_region_test /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test @@@ -20,5 -22,4 +21,5 @@@ /demand_paging_test /dirty_log_test /kvm_create_max_vcpus +/set_memory_region_test /steal_time diff --combined tools/testing/selftests/kvm/Makefile index 7af62030c12f,44b6ef513164..c66f4eec3411 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@@ -17,6 -17,7 +17,6 @@@ TEST_GEN_PROGS_x86_64 += x86_64/evmcs_t TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test -TEST_GEN_PROGS_x86_64 += x86_64/set_memory_region_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test @@@ -27,18 -28,17 +27,19 @@@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dir TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test + TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_aarch64 += clear_dirty_log_test 
TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time TEST_GEN_PROGS_s390x = s390x/memop @@@ -47,7 -47,6 +48,7 @@@ TEST_GEN_PROGS_s390x += s390x/sync_regs TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --combined tools/testing/selftests/kvm/include/kvm_util.h index 53b11d725d81,92e184a422ee..e244c6ecfc1d --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@@ -10,7 -10,6 +10,7 @@@ #include "test_util.h" #include "asm/kvm.h" +#include "linux/list.h" #include "linux/kvm.h" #include @@@ -114,7 -113,6 +114,7 @@@ int _vcpu_ioctl(struct kvm_vm *vm, uint void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint32_t data_memslot, uint32_t pgd_memslot); @@@ -145,6 -143,8 +145,8 @@@ struct kvm_run *vcpu_state(struct kvm_v void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); + void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug); void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_mp_state *mp_state); void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); @@@ -256,7 -256,6 +258,7 @@@ bool vm_is_unrestricted_guest(struct kv unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned int vm_get_max_gfn(struct kvm_vm *vm); +int vm_get_fd(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); @@@ -314,26 -313,11 +316,26 @@@ uint64_t get_ucall(struct kvm_vm *vm, u #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) #define GUEST_DONE() ucall(UCALL_DONE, 0) -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2, \ - "Failed guest assert: " \ - #_condition, __LINE__); \ +#define __GUEST_ASSERT(_condition, _nargs, _args...) 
do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2 + _nargs, \ + "Failed guest assert: " \ + #_condition, __LINE__, _args); \ } while (0) +#define GUEST_ASSERT(_condition) \ + __GUEST_ASSERT((_condition), 0, 0) + +#define GUEST_ASSERT_1(_condition, arg1) \ + __GUEST_ASSERT((_condition), 1, (arg1)) + +#define GUEST_ASSERT_2(_condition, arg1, arg2) \ + __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) + +#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ + __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) + +#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ + __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) + #endif /* SELFTEST_KVM_UTIL_H */ diff --combined tools/testing/selftests/kvm/lib/kvm_util.c index 33ab0a36d230,9622431069bc..c9cede5c7d0d --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@@ -161,9 -161,6 +161,9 @@@ struct kvm_vm *_vm_create(enum vm_guest vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); + INIT_LIST_HEAD(&vm->vcpus); + INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->mode = mode; vm->type = 0; @@@ -261,7 -258,8 +261,7 @@@ void kvm_vm_restart(struct kvm_vm *vmp if (vmp->has_irqchip) vm_create_irqchip(vmp); - for (region = vmp->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vmp->userspace_mem_regions, list) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" @@@ -321,7 -319,8 +321,7 @@@ userspace_mem_region_find(struct kvm_v { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; @@@ -379,11 -378,11 +379,11 @@@ kvm_userspace_memory_region_find(struc */ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpup; + struct vcpu *vcpu; - for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) { - if (vcpup->id == vcpuid) - return vcpup; + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpuid) + return vcpu; } return NULL; @@@ -393,16 -392,18 +393,16 @@@ * VM VCPU Remove * * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID + * vcpu - VCPU to remove * * Output Args: None * * Return: None, TEST_ASSERT failures for all error conditions * - * Within the VM specified by vm, removes the VCPU given by vcpuid. + * Removes a vCPU from a VM and frees its resources. 
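/*
 * Usage sketch (illustration only, not part of the patch) for the new
 * GUEST_ASSERT_1..4 variants defined above.  Assumes the selftest headers;
 * guest_code(), EXPECTED_VAL and read_some_state() are made-up names.  The
 * extra values arrive on the host after the message and line number in the
 * UCALL_ABORT arguments.
 */
#include "kvm_util.h"

#define EXPECTED_VAL	42
extern uint64_t read_some_state(void);

/* Would be handed to the VM factory in a real test. */
static void guest_code(void)
{
	uint64_t stage = 0;
	uint64_t val = read_some_state();

	/* Report both the stage and the offending value if the check fails. */
	GUEST_ASSERT_2(val == EXPECTED_VAL, stage, val);
	GUEST_SYNC(stage);
	GUEST_DONE();
}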
*/ -static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) +static void vm_vcpu_rm(struct vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; ret = munmap(vcpu->state, sizeof(*vcpu->state)); @@@ -412,17 -413,21 +412,17 @@@ TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " "errno: %i", ret, errno); - if (vcpu->next) - vcpu->next->prev = vcpu->prev; - if (vcpu->prev) - vcpu->prev->next = vcpu->next; - else - vm->vcpu_head = vcpu->next; + list_del(&vcpu->list); free(vcpu); } void kvm_vm_release(struct kvm_vm *vmp) { + struct vcpu *vcpu, *tmp; int ret; - while (vmp->vcpu_head) - vm_vcpu_rm(vmp, vmp->vcpu_head->id); + list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) + vm_vcpu_rm(vcpu); ret = close(vmp->fd); TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" @@@ -433,38 -438,35 +433,38 @@@ " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); } +static void __vm_mem_region_delete(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + int ret; + + list_del(®ion->list); + + region->region.memory_size = 0; + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + sparsebit_free(®ion->unused_phy_pages); + ret = munmap(region->mmap_start, region->mmap_size); + TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno); + + free(region); +} + /* * Destroys and frees the VM pointed to by vmp. */ void kvm_vm_free(struct kvm_vm *vmp) { - int ret; + struct userspace_mem_region *region, *tmp; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - while (vmp->userspace_mem_region_head) { - struct userspace_mem_region *region - = vmp->userspace_mem_region_head; - - region->region.memory_size = 0; - ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, - ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " - "rc: %i errno: %i", ret, errno); - - vmp->userspace_mem_region_head = region->next; - sparsebit_free(®ion->unused_phy_pages); - ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", - ret, errno); - - free(region); - } + list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) + __vm_mem_region_delete(vmp, region); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@@ -610,10 -612,12 +610,10 @@@ void vm_userspace_mem_region_add(struc (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - for (region = vm->userspace_mem_region_head; region; - region = region->next) { - if (region->region.slot == slot) - break; - } - if (region != NULL) + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if (region->region.slot != slot) + continue; + TEST_FAIL("A mem region with the requested slot " "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" @@@ -622,7 -626,6 +622,7 @@@ region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); + } /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); @@@ -683,7 -686,10 +683,7 @@@ guest_paddr, (uint64_t) region->region.memory_size); /* Add to linked-list of memory regions. 
*/ - if (vm->userspace_mem_region_head) - vm->userspace_mem_region_head->prev = region; - region->next = vm->userspace_mem_region_head; - vm->userspace_mem_region_head = region; + list_add(®ion->list, &vm->userspace_mem_regions); } /* @@@ -706,17 -712,20 +706,17 @@@ memslot2region(struct kvm_vm *vm, uint3 { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if (region->region.slot == memslot) - break; - } - if (region == NULL) { - fprintf(stderr, "No mem region with the requested slot found,\n" - " requested slot: %u\n", memslot); - fputs("---- vm dump ----\n", stderr); - vm_dump(stderr, vm, 2); - TEST_FAIL("Mem region not found"); + return region; } - return region; + fprintf(stderr, "No mem region with the requested slot found,\n" + " requested slot: %u\n", memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + TEST_FAIL("Mem region not found"); + return NULL; } /* @@@ -779,24 -788,6 +779,24 @@@ void vm_mem_region_move(struct kvm_vm * ret, errno, slot, new_gpa); } +/* + * VM Memory Region Delete + * + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to delete + * + * Output Args: None + * + * Return: None + * + * Delete a memory region. + */ +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) +{ + __vm_mem_region_delete(vm, memslot2region(vm, slot)); +} + /* * VCPU mmap Size * @@@ -872,7 -863,10 +872,7 @@@ void vm_vcpu_add(struct kvm_vm *vm, uin "vcpu id: %u errno: %i", vcpuid, errno); /* Add to linked-list of VCPUs. */ - if (vm->vcpu_head) - vm->vcpu_head->prev = vcpu; - vcpu->next = vm->vcpu_head; - vm->vcpu_head = vcpu; + list_add(&vcpu->list, &vm->vcpus); } /* @@@ -1065,8 -1059,8 +1065,8 @@@ void virt_map(struct kvm_vm *vm, uint64 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if ((gpa >= region->region.guest_phys_addr) && (gpa <= (region->region.guest_phys_addr + region->region.memory_size - 1))) @@@ -1098,8 -1092,8 +1098,8 @@@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if ((hva >= region->host_mem) && (hva <= (region->host_mem + region->region.memory_size - 1))) @@@ -1207,6 -1201,15 +1207,15 @@@ void vcpu_run_complete_io(struct kvm_v ret, errno); } + void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug) + { + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug); + + TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret); + } + /* * VM VCPU Set MP State * @@@ -1526,7 -1529,8 +1535,7 @@@ void vm_dump(FILE *stream, struct kvm_v fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@@ -1545,7 -1549,7 +1554,7 @@@ virt_dump(stream, 
vm, indent + 4); } fprintf(stream, "%*sVCPUs:\n", indent, ""); - for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next) + list_for_each_entry(vcpu, &vm->vcpus, list) vcpu_dump(stream, vm, vcpu->id, indent + 2); } @@@ -1739,11 -1743,6 +1748,11 @@@ unsigned int vm_get_max_gfn(struct kvm_ return vm->max_gfn; } +int vm_get_fd(struct kvm_vm *vm) +{ + return vm->fd; +} + static unsigned int vm_calc_num_pages(unsigned int num_pages, unsigned int page_shift, unsigned int new_page_shift, diff --combined virt/kvm/kvm_main.c index 33e1eee96f75,731c1e517716..7525f3838160 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@@ -259,6 -259,7 +259,7 @@@ static inline bool kvm_kick_many_cpus(c } bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp) { int i, cpu, me; @@@ -268,7 -269,8 +269,8 @@@ me = get_cpu(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) || + vcpu == except) continue; kvm_make_request(req, vcpu); @@@ -288,19 -290,25 +290,25 @@@ return called; } - bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) + bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except) { cpumask_var_t cpus; bool called; zalloc_cpumask_var(&cpus, GFP_ATOMIC); - called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); + called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus); free_cpumask_var(cpus); return called; } + bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) + { + return kvm_make_all_cpus_request_except(kvm, req, NULL); + } + #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { @@@ -710,8 -718,6 +718,8 @@@ static struct kvm *kvm_create_vm(unsign goto out_err_no_arch_destroy_vm; } + kvm->max_halt_poll_ns = halt_poll_ns; + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; @@@ -1604,13 -1610,16 +1612,13 @@@ struct kvm_memory_slot *kvm_vcpu_gfn_to { return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); } +EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || - memslot->flags & KVM_MEMSLOT_INVALID) - return false; - - return true; + return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); @@@ -2715,16 -2724,15 +2723,16 @@@ out if (!kvm_arch_no_poll(vcpu)) { if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); - } else if (halt_poll_ns) { + } else if (vcpu->kvm->max_halt_poll_ns) { if (block_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + else if (vcpu->halt_poll_ns && + block_ns > vcpu->kvm->max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) + else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && + block_ns < vcpu->kvm->max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; @@@ -3031,6 -3039,8 +3039,6 @@@ static int kvm_vm_ioctl_create_vcpu(str if (r) goto vcpu_free_run_page; - kvm_create_vcpu_debugfs(vcpu); - mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; @@@ -3059,11 -3069,11 +3067,11 @@@ mutex_unlock(&kvm->lock); kvm_arch_vcpu_postcreate(vcpu); + 
kvm_create_vcpu_debugfs(vcpu); return r; unlock_vcpu_destroy: mutex_unlock(&kvm->lock); - debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_arch_vcpu_destroy(vcpu); vcpu_free_run_page: free_page((unsigned long)vcpu->run); @@@ -3133,7 -3143,7 +3141,7 @@@ static long kvm_vcpu_ioctl(struct file synchronize_rcu(); put_pid(oldpid); } - r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + r = kvm_arch_vcpu_ioctl_run(vcpu); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } @@@ -3158,6 -3168,7 +3166,6 @@@ out_free1 case KVM_SET_REGS: { struct kvm_regs *kvm_regs; - r = -ENOMEM; kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); if (IS_ERR(kvm_regs)) { r = PTR_ERR(kvm_regs); @@@ -3513,7 -3524,6 +3521,7 @@@ static long kvm_vm_ioctl_check_extensio case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@@ -3564,13 -3574,6 +3572,13 @@@ static int kvm_vm_ioctl_enable_cap_gene return 0; } #endif + case KVM_CAP_HALT_POLL: { + if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) + return -EINVAL; + + kvm->max_halt_poll_ns = cap->args[0]; + return 0; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); }
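
The GUEST_ASSERT_[1-4] variants added to kvm_util.h above let guest code hand up to four diagnostic values back through the abort ucall: args[0] carries the condition string, args[1] the guest __LINE__, and args[2] onwards the extra values. A minimal sketch of the pattern; guest_code(), check_guest() and VCPU_ID are illustrative assumptions, not names from this series:

#include "kvm_util.h"

#define VCPU_ID 0	/* assumption: single-vCPU test */

/* Runs inside the guest; the values are made unequal so the assert fires. */
static void guest_code(void)
{
	uint64_t actual = 41, expected = 42;

	GUEST_ASSERT_2(actual == expected, actual, expected);
	GUEST_DONE();
}

/* Host side, after vcpu_run(vm, VCPU_ID). */
static void check_guest(struct kvm_vm *vm)
{
	struct ucall uc;

	if (get_ucall(vm, VCPU_ID, &uc) == UCALL_ABORT)
		TEST_FAIL("%s at guest line %lu: actual=%lu expected=%lu",
			  (const char *)uc.args[0], uc.args[1],
			  uc.args[2], uc.args[3]);
}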
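
The kvm_util.c hunks above replace the hand-rolled vcpu and memory-region lists with the list.h helpers, and the teardown paths (kvm_vm_release(), kvm_vm_free()) switch to the _safe iterator because they unlink and free the node they are standing on. A stripped-down sketch of that idiom, independent of the selftest types and assuming the tools/include copy of linux/list.h:

#include <stdlib.h>
#include "linux/list.h"

struct node {
	int id;
	struct list_head link;
};

static void free_all(struct list_head *head)
{
	struct node *n, *tmp;

	/*
	 * Plain list_for_each_entry() would read n->link.next after
	 * free(n); the _safe variant caches the next entry in tmp first.
	 */
	list_for_each_entry_safe(n, tmp, head, link) {
		list_del(&n->link);
		free(n);
	}
}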
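
vm_mem_region_delete() above complements the existing vm_mem_region_move() and is what the new set_memory_region_test builds on. A hedged sketch of exercising both; the slot number, base GPA and page count are arbitrary illustrations rather than values from the test:

#include "kvm_util.h"

#define TEST_SLOT	10
#define TEST_GPA	0xc0000000UL
#define TEST_NPAGES	64

static void exercise_slot(struct kvm_vm *vm)
{
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, TEST_GPA,
				    TEST_SLOT, TEST_NPAGES, 0);

	/* KVM_SET_USER_MEMORY_REGION with an updated guest_phys_addr. */
	vm_mem_region_move(vm, TEST_SLOT, TEST_GPA + 0x10000000UL);

	/* Zero-size KVM_SET_USER_MEMORY_REGION, then munmap() and free(). */
	vm_mem_region_delete(vm, TEST_SLOT);
}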
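
vcpu_set_guest_debug() above is a thin wrapper around KVM_SET_GUEST_DEBUG, matching the KVM_CAP_SET_GUEST_DEBUG now reported by the powerpc hunk earlier in this merge. A minimal sketch of enabling single-step from a test, assuming an architecture that supports KVM_GUESTDBG_SINGLESTEP and the usual VCPU_ID convention:

#include "kvm_util.h"

#define VCPU_ID 0	/* assumption: single-vCPU test */

static void enable_single_step(struct kvm_vm *vm)
{
	struct kvm_guest_debug debug = {
		.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
	};

	/* Asserts on failure via TEST_ASSERT(), see the wrapper above. */
	vcpu_set_guest_debug(vm, VCPU_ID, &debug);
}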
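
Finally, the KVM_CAP_HALT_POLL plumbing in kvm_main.c above makes the halt-polling ceiling per-VM: once the cap is enabled, the grow/shrink heuristics compare against kvm->max_halt_poll_ns instead of the global halt_poll_ns module parameter. A sketch of the userspace side, assuming a linux/kvm.h new enough to define KVM_CAP_HALT_POLL; the 200us figure is purely illustrative:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* A value of 0 disables halt polling for this VM entirely. */
static int vm_set_max_halt_poll_ns(int vm_fd, uint64_t ns)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_HALT_POLL,
		.args = { ns },	/* must fit in an unsigned int, per the check above */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* e.g. vm_set_max_halt_poll_ns(vm_fd, 200000) caps polling at 200us. */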