Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <[email protected]>

Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)

committer Linus Torvalds <[email protected]>

Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
author Linus Torvalds <[email protected]>
Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
committer Linus Torvalds <[email protected]>
Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
diff --combined arch/s390/include/asm/processor.h

index f3e0aabfc6bcb003a367b9b0e5d9d2087989e28e,eac4fb5fb826b1edb7ff38697a38f3a8c94b5443..56831dfa9198c6e2192605fd5ee9d68c538d8b17
--- 1/arch/s390/include/asm/processor.h
--- 2/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@@ -11,15 -11,12 +11,15 @@@
   #ifndef __ASM_S390_PROCESSOR_H
   #define __ASM_S390_PROCESSOR_H
   
+ +#ifndef __ASSEMBLY__
+ +
   #include <linux/linkage.h>
   #include <linux/irqflags.h>
   #include <asm/cpu.h>
   #include <asm/page.h>
   #include <asm/ptrace.h>
   #include <asm/setup.h>
+ +#include <asm/runtime_instr.h>
   
   /*
    * Default implementation of macro that returns current
@@@ -78,20 -75,11 +78,20 @@@ struct thread_struct 
         unsigned long gmap_addr;        /* address of last gmap fault. */
         struct per_regs per_user;       /* User specified PER registers */
         struct per_event per_event;     /* Cause of the last PER trap */
+ +      unsigned long per_flags;        /* Flags to control debug behavior */
           /* pfault_wait is used to block the process on a pfault event */
         unsigned long pfault_wait;
         struct list_head list;
+ +      /* cpu runtime instrumentation */
+ +      struct runtime_instr_cb *ri_cb;
+ +      int ri_signum;
+ +#ifdef CONFIG_64BIT
+ +      unsigned char trap_tdb[256];    /* Transaction abort diagnose block */
+ +#endif
   };
   
+ +#define PER_FLAG_NO_TE                1UL     /* Flag to disable transactions. */
+ +
   typedef struct thread_struct thread_struct;
   
   /*
@@@ -142,12 -130,6 +142,12 @@@ struct task_struct
   struct mm_struct;
   struct seq_file;
   
+ +#ifdef CONFIG_64BIT
+ +extern void show_cacheinfo(struct seq_file *m);
+ +#else
+ +static inline void show_cacheinfo(struct seq_file *m) { }
+ +#endif
+ +
   /* Free all resources held by a thread. */
   extern void release_thread(struct task_struct *);
   extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
@@@ -158,7 -140,7 +158,8 @@@
   extern unsigned long thread_saved_pc(struct task_struct *t);
   
   extern void show_code(struct pt_regs *regs);
+ +extern void print_fn_code(unsigned char *code, unsigned long len);
+ extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
   
   unsigned long get_wchan(struct task_struct *p);
   #define task_pt_regs(tsk) ((struct pt_regs *) \
@@@ -350,6 -332,23 +351,6 @@@ extern void (*s390_base_ext_handler_fn)
   
   #define ARCH_LOW_ADDRESS_LIMIT        0x7fffffffUL
   
- -/*
- - * Helper macro for exception table entries
- - */
- -#ifndef CONFIG_64BIT
- -#define EX_TABLE(_fault,_target)                      \
- -      ".section __ex_table,\"a\"\n"                   \
- -      "       .align 4\n"                             \
- -      "       .long  " #_fault "," #_target "\n"      \
- -      ".previous\n"
- -#else
- -#define EX_TABLE(_fault,_target)                      \
- -      ".section __ex_table,\"a\"\n"                   \
- -      "       .align 8\n"                             \
- -      "       .quad  " #_fault "," #_target "\n"      \
- -      ".previous\n"
- -#endif
- -
   extern int memcpy_real(void *, void *, size_t);
   extern void memcpy_absolute(void *, void *, size_t);
   
@@@ -360,25 -359,4 +361,25 @@@
         memcpy_absolute(&(dest), &__tmp, sizeof(__tmp));        \
   }
   
- -#endif                                 /* __ASM_S390_PROCESSOR_H           */
+ +/*
+ + * Helper macro for exception table entries
+ + */
+ +#define EX_TABLE(_fault, _target)     \
+ +      ".section __ex_table,\"a\"\n"   \
+ +      ".align 4\n"                    \
+ +      ".long  (" #_fault ") - .\n"    \
+ +      ".long  (" #_target ") - .\n"   \
+ +      ".previous\n"
+ +
+ +#else /* __ASSEMBLY__ */
+ +
+ +#define EX_TABLE(_fault, _target)     \
+ +      .section __ex_table,"a" ;       \
+ +      .align  4 ;                     \
+ +      .long   (_fault) - . ;          \
+ +      .long   (_target) - . ;         \
+ +      .previous
+ +
+ +#endif /* __ASSEMBLY__ */
+ +
+ +#endif /* __ASM_S390_PROCESSOR_H */
diff --combined arch/s390/kernel/dis.c

index cc84a24c023ff8b2de506e4957861dbd2c9a3e71,ffb622b16ab5251f2b718ad0be832d03e02b4a4e..f00286bd2ef9050ad0cb166415272915eb7c5f5f
--- 1/arch/s390/kernel/dis.c
--- 2/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@@ -315,11 -315,6 +315,11 @@@ enum 
         LONG_INSN_POPCNT,
         LONG_INSN_RISBHG,
         LONG_INSN_RISBLG,
+ +      LONG_INSN_RINEXT,
+ +      LONG_INSN_RIEMIT,
+ +      LONG_INSN_TABORT,
+ +      LONG_INSN_TBEGIN,
+ +      LONG_INSN_TBEGINC,
   };
   
   static char *long_insn_name[] = {
@@@ -334,12 -329,7 +334,12 @@@
         [LONG_INSN_LLGHRL] = "llghrl",
         [LONG_INSN_POPCNT] = "popcnt",
         [LONG_INSN_RISBHG] = "risbhg",
- -      [LONG_INSN_RISBLG] = "risblk",
+ +      [LONG_INSN_RISBLG] = "risblg",
+ +      [LONG_INSN_RINEXT] = "rinext",
+ +      [LONG_INSN_RIEMIT] = "riemit",
+ +      [LONG_INSN_TABORT] = "tabort",
+ +      [LONG_INSN_TBEGIN] = "tbegin",
+ +      [LONG_INSN_TBEGINC] = "tbeginc",
   };
   
   static struct insn opcode[] = {
@@@ -592,17 -582,6 +592,17 @@@ static struct insn opcode_a7[] = 
         { "", 0, INSTR_INVALID }
   };
   
+ +static struct insn opcode_aa[] = {
+ +#ifdef CONFIG_64BIT
+ +      { { 0, LONG_INSN_RINEXT }, 0x00, INSTR_RI_RI },
+ +      { "rion", 0x01, INSTR_RI_RI },
+ +      { "tric", 0x02, INSTR_RI_RI },
+ +      { "rioff", 0x03, INSTR_RI_RI },
+ +      { { 0, LONG_INSN_RIEMIT }, 0x04, INSTR_RI_RI },
+ +#endif
+ +      { "", 0, INSTR_INVALID }
+ +};
+ +
   static struct insn opcode_b2[] = {
   #ifdef CONFIG_64BIT
         { "sske", 0x2b, INSTR_RRF_M0RR },
@@@ -615,9 -594,6 +615,9 @@@
         { "lpswe", 0xb2, INSTR_S_RD },
         { "srnmt", 0xb9, INSTR_S_RD },
         { "lfas", 0xbd, INSTR_S_RD },
+ +      { "etndg", 0xec, INSTR_RRE_R0 },
+ +      { { 0, LONG_INSN_TABORT }, 0xfc, INSTR_S_RD },
+ +      { "tend", 0xf8, INSTR_S_RD },
   #endif
         { "stidp", 0x02, INSTR_S_RD },
         { "sck", 0x04, INSTR_S_RD },
@@@ -1174,7 -1150,6 +1174,7 @@@ static struct insn opcode_e3[] = 
         { "stfh", 0xcb, INSTR_RXY_RRRD },
         { "chf", 0xcd, INSTR_RXY_RRRD },
         { "clhf", 0xcf, INSTR_RXY_RRRD },
+ +      { "ntstg", 0x25, INSTR_RXY_RRRD },
   #endif
         { "lrv", 0x1e, INSTR_RXY_RRRD },
         { "lrvh", 0x1f, INSTR_RXY_RRRD },
@@@ -1198,8 -1173,6 +1198,8 @@@ static struct insn opcode_e5[] = 
         { "mvhhi", 0x44, INSTR_SIL_RDI },
         { "mvhi", 0x4c, INSTR_SIL_RDI },
         { "mvghi", 0x48, INSTR_SIL_RDI },
+ +      { { 0, LONG_INSN_TBEGIN }, 0x60, INSTR_SIL_RDU },
+ +      { { 0, LONG_INSN_TBEGINC }, 0x61, INSTR_SIL_RDU },
   #endif
         { "lasp", 0x00, INSTR_SSE_RDRD },
         { "tprot", 0x01, INSTR_SSE_RDRD },
@@@ -1237,9 -1210,6 +1237,9 @@@ static struct insn opcode_eb[] = 
         { "cliy", 0x55, INSTR_SIY_URD },
         { "oiy", 0x56, INSTR_SIY_URD },
         { "xiy", 0x57, INSTR_SIY_URD },
+ +      { "lric", 0x60, INSTR_RSY_RDRM },
+ +      { "stric", 0x61, INSTR_RSY_RDRM },
+ +      { "mric", 0x62, INSTR_RSY_RDRM },
         { "icmh", 0x80, INSTR_RSE_RURD },
         { "icmh", 0x80, INSTR_RSY_RURD },
         { "icmy", 0x81, INSTR_RSY_RURD },
@@@ -1438,9 -1408,6 +1438,9 @@@ static struct insn *find_insn(unsigned 
         case 0xa7:
                 table = opcode_a7;
                 break;
+ +      case 0xaa:
+ +              table = opcode_aa;
+ +              break;
         case 0xb2:
                 table = opcode_b2;
                 break;
@@@ -1501,6 -1468,33 +1501,33 @@@
         return NULL;
   }
   
+ /**
+  * insn_to_mnemonic - decode an s390 instruction
+  * @instruction: instruction to decode
+  * @buf: buffer to fill with mnemonic (must hold at least 8 bytes)
+  *
+  * Decode the instruction at @instruction and store the corresponding
+  * mnemonic into @buf (at most 8 bytes including the trailing NUL).
+  * @buf is left unchanged if the instruction could not be decoded.
+  * Returns:
+  *  %0 on success, %-ENOENT if the instruction was not found.
+  */
+ int insn_to_mnemonic(unsigned char *instruction, char buf[8])
+ {
+       struct insn *insn;
+ 
+       insn = find_insn(instruction);
+       if (!insn)
+               return -ENOENT;
+       if (insn->name[0] == '\0')      /* empty name: index into long names */
+               snprintf(buf, 8, "%s",  /* buf is a pointer: no sizeof(buf) */
+                        long_insn_name[(int) insn->name[1]]);
+       else
+               snprintf(buf, 8, "%.5s", insn->name);
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(insn_to_mnemonic);
+ 
   static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
   {
         struct insn *insn;
@@@ -1634,26 -1628,3 +1661,26 @@@ void show_code(struct pt_regs *regs
         }
         printk("\n");
   }
+ +
+ +void print_fn_code(unsigned char *code, unsigned long len)
+ +{
+ +      char buffer[64], *ptr;  /* one formatted output line per instruction */
+ +      int opsize, i;
+ +
+ +      while (len) {
+ +              ptr = buffer;
+ +              opsize = insn_length(*code);
+ +              ptr += sprintf(ptr, "%p: ", code);
+ +              for (i = 0; i < opsize; i++)
+ +                      ptr += sprintf(ptr, "%02x", code[i]);
+ +              *ptr++ = '\t';
+ +              if (i < 4)      /* short opcode: extra tab for alignment */
+ +                      *ptr++ = '\t';
+ +              ptr += print_insn(ptr, code, (unsigned long) code);
+ +              *ptr++ = '\n';
+ +              *ptr++ = 0;
+ +              printk("%s", buffer);   /* never use built text as a format */
+ +              code += opsize;
+ +              len = len > (unsigned long) opsize ? len - opsize : 0; /* no wrap */
+ +      }
+ +}
diff --combined arch/s390/kvm/Kconfig

index 9b04a32e56958f2cf5d4397a3721e94ccb2e44d5,a6e2677724e169238a8dfa0a6b97200a764734fb..b58dd869cb320ffeedafe7bcb6489db59bbb2cba
--- 1/arch/s390/kvm/Kconfig
--- 2/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@@ -5,7 -5,7 +5,7 @@@ source "virt/kvm/Kconfig
   
   menuconfig VIRTUALIZATION
         def_bool y
- -      prompt "Virtualization"
+ +      prompt "KVM"
         ---help---
           Say Y here to get to see options for using your Linux host to run other
           operating systems inside virtual machines (guests).
@@@ -21,6 -21,7 +21,7 @@@ config KV
         depends on HAVE_KVM && EXPERIMENTAL
         select PREEMPT_NOTIFIERS
         select ANON_INODES
+       select HAVE_KVM_CPU_RELAX_INTERCEPT
         ---help---
           Support hosting paravirtualized guest machines using the SIE
           virtualization capability on the mainframe. This should work
diff --combined arch/s390/kvm/priv.c

index 310be61bead74bd3baf5fcb1d3e50cc132604408,ed256fdd7b58040ac9208ea9aa8ff01979bb8334..d768906f15c81b27b82698ae49368579a2169ef9
--- 1/arch/s390/kvm/priv.c
--- 2/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@@ -20,6 -20,7 +20,7 @@@
   #include <asm/sysinfo.h>
   #include "gaccess.h"
   #include "kvm-s390.h"
+ #include "trace.h"
   
   static int handle_set_prefix(struct kvm_vcpu *vcpu)
   {
@@@ -59,6 -60,7 +60,7 @@@
         kvm_s390_set_prefix(vcpu, address);
   
         VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
+       trace_kvm_s390_handle_prefix(vcpu, 1, address);
   out:
         return 0;
   }
@@@ -91,6 -93,7 +93,7 @@@ static int handle_store_prefix(struct k
         }
   
         VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
+       trace_kvm_s390_handle_prefix(vcpu, 0, address);
   out:
         return 0;
   }
@@@ -119,6 -122,7 +122,7 @@@ static int handle_store_cpu_address(str
         }
   
         VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
+       trace_kvm_s390_handle_stap(vcpu, useraddr);
   out:
         return 0;
   }
@@@ -164,9 -168,11 +168,11 @@@ static int handle_stfl(struct kvm_vcpu 
                            &facility_list, sizeof(facility_list));
         if (rc == -EFAULT)
                 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       else
+       else {
                 VCPU_EVENT(vcpu, 5, "store facility list value %x",
                            facility_list);
+               trace_kvm_s390_handle_stfl(vcpu, facility_list);
+       }
         return 0;
   }
   
@@@ -211,7 -217,7 +217,7 @@@ static void handle_stsi_3_2_2(struct kv
         spin_unlock(&fi->lock);
   
         /* deal with other level 3 hypervisors */
- -      if (stsi(mem, 3, 2, 2) == -ENOSYS)
+ +      if (stsi(mem, 3, 2, 2))
                 mem->count = 0;
         if (mem->count < 8)
                 mem->count++;
@@@ -259,7 -265,7 +265,7 @@@ static int handle_stsi(struct kvm_vcpu 
                 mem = get_zeroed_page(GFP_KERNEL);
                 if (!mem)
                         goto out_fail;
- -              if (stsi((void *) mem, fc, sel1, sel2) == -ENOSYS)
+ +              if (stsi((void *) mem, fc, sel1, sel2))
                         goto out_mem;
                 break;
         case 3:
@@@ -278,6 -284,7 +284,7 @@@
                 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                 goto out_mem;
         }
+       trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
         free_page(mem);
         vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
         vcpu->run->s.regs.gprs[0] = 0;
diff --combined arch/x86/Kconfig

index 7f9a395c52548e021253dbe78511427700f05fb2,a42e2e99caae813f7d12644b367286881a5f3363..b72777ff32a9439ad401779b3bdce873f0d0b744
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -7,13 -7,11 +7,13 @@@ config 64BI
           Say no to build a 32-bit kernel - formerly known as i386
   
   config X86_32
- -      def_bool !64BIT
+ +      def_bool y
+ +      depends on !64BIT
         select CLKSRC_I8253
   
   config X86_64
- -      def_bool 64BIT
+ +      def_bool y
+ +      depends on 64BIT
         select X86_DEV_DMA_OPS
   
   ### Arch settings
@@@ -38,7 -36,6 +38,7 @@@ config X8
         select HAVE_KRETPROBES
         select HAVE_OPTPROBES
         select HAVE_FTRACE_MCOUNT_RECORD
+ +      select HAVE_FENTRY if X86_64
         select HAVE_C_RECORDMCOUNT
         select HAVE_DYNAMIC_FTRACE
         select HAVE_FUNCTION_TRACER
@@@ -63,8 -60,6 +63,8 @@@
         select HAVE_MIXED_BREAKPOINTS_REGS
         select PERF_EVENTS
         select HAVE_PERF_EVENTS_NMI
+ +      select HAVE_PERF_REGS
+ +      select HAVE_PERF_USER_STACK_DUMP
         select ANON_INODES
         select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
         select HAVE_CMPXCHG_LOCAL if !M386
@@@ -102,12 -97,9 +102,12 @@@
         select KTIME_SCALAR if X86_32
         select GENERIC_STRNCPY_FROM_USER
         select GENERIC_STRNLEN_USER
+ +      select HAVE_RCU_USER_QS if X86_64
+ +      select HAVE_IRQ_TIME_ACCOUNTING
   
   config INSTRUCTION_DECODER
- -      def_bool (KPROBES || PERF_EVENTS || UPROBES)
+ +      def_bool y
+ +      depends on KPROBES || PERF_EVENTS || UPROBES
   
   config OUTPUT_FORMAT
         string
@@@ -135,15 -127,13 +135,15 @@@ config SBU
         bool
   
   config NEED_DMA_MAP_STATE
- -       def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG)
+ +      def_bool y
+ +      depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
   
   config NEED_SG_DMA_LENGTH
         def_bool y
   
   config GENERIC_ISA_DMA
- -      def_bool ISA_DMA_API
+ +      def_bool y
+ +      depends on ISA_DMA_API
   
   config GENERIC_BUG
         def_bool y
@@@ -160,16 -150,13 +160,16 @@@ config GENERIC_GPI
         bool
   
   config ARCH_MAY_HAVE_PC_FDC
- -      def_bool ISA_DMA_API
+ +      def_bool y
+ +      depends on ISA_DMA_API
   
   config RWSEM_GENERIC_SPINLOCK
- -      def_bool !X86_XADD
+ +      def_bool y
+ +      depends on !X86_XADD
   
   config RWSEM_XCHGADD_ALGORITHM
- -      def_bool X86_XADD
+ +      def_bool y
+ +      depends on X86_XADD
   
   config GENERIC_CALIBRATE_DELAY
         def_bool y
@@@ -586,23 -573,18 +586,18 @@@ config PARAVIRT_TIME_ACCOUNTIN
   
   source "arch/x86/xen/Kconfig"
   
- config KVM_CLOCK
-       bool "KVM paravirtualized clock"
-       select PARAVIRT
-       select PARAVIRT_CLOCK
-       ---help---
-         Turning on this option will allow you to run a paravirtualized clock
-         when running over the KVM hypervisor. Instead of relying on a PIT
-         (or probably other) emulation by the underlying device model, the host
-         provides the guest with timing infrastructure such as time of day, and
-         system time
- 
   config KVM_GUEST
-       bool "KVM Guest support"
+       bool "KVM Guest support (including kvmclock)"
+       select PARAVIRT
         select PARAVIRT
+       select PARAVIRT_CLOCK
+       default y if PARAVIRT_GUEST
         ---help---
           This option enables various optimizations for running under the KVM
-         hypervisor.
+         hypervisor. It includes a paravirtualized clock, so that instead
+         of relying on a PIT (or probably other) emulation by the
+         underlying device model, the host provides the guest with
+         timing infrastructure such as time of day, and system time
   
   source "arch/x86/lguest/Kconfig"
   
@@@ -759,14 -741,13 +754,14 @@@ config SWIOTL
         def_bool y if X86_64
         ---help---
           Support for software bounce buffers used on x86-64 systems
- -        which don't have a hardware IOMMU (e.g. the current generation
- -        of Intel's x86-64 CPUs). Using this PCI devices which can only
- -        access 32-bits of memory can be used on systems with more than
- -        3 GB of memory. If unsure, say Y.
+ +        which don't have a hardware IOMMU. Using this, PCI devices
+ +        which can only access 32-bits of memory can be used on systems
+ +        with more than 3 GB of memory.
+ +        If unsure, say Y.
   
   config IOMMU_HELPER
- -      def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+ +      def_bool y
+ +      depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
   
   config MAXSMP
         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
@@@ -810,6 -791,17 +805,6 @@@ config SCHED_M
           making when dealing with multi-core CPU chips at a cost of slightly
           increased overhead in some places. If unsure say N here.
   
- -config IRQ_TIME_ACCOUNTING
- -      bool "Fine granularity task level IRQ time accounting"
- -      default n
- -      ---help---
- -        Select this option to enable fine granularity task irq time
- -        accounting. This is done by reading a timestamp on each
- -        transitions between softirq and hardirq state, so there can be a
- -        small performance impact.
- -
- -        If in doubt, say N here.
- -
   source "kernel/Kconfig.preempt"
   
   config X86_UP_APIC
@@@ -874,7 -866,6 +869,7 @@@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQ
   
   config X86_MCE
         bool "Machine Check / overheating reporting"
+ +      default y
         ---help---
           Machine Check support allows the processor to notify the
           kernel if it detects a problem (e.g. overheating, data corruption).
@@@ -986,25 -977,25 +981,25 @@@ config X86_REBOOTFIXUP
           Say N otherwise.
   
   config MICROCODE
- -      tristate "/dev/cpu/microcode - microcode support"
+ +      tristate "CPU microcode loading support"
         select FW_LOADER
         ---help---
+ +
           If you say Y here, you will be able to update the microcode on
           certain Intel and AMD processors. The Intel support is for the
- -        IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
- -        Pentium 4, Xeon etc. The AMD support is for family 0x10 and
- -        0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
- -        You will obviously need the actual microcode binary data itself
- -        which is not shipped with the Linux kernel.
+ +        IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4,
+ +        Xeon etc. The AMD support is for families 0x10 and later. You will
+ +        obviously need the actual microcode binary data itself which is not
+ +        shipped with the Linux kernel.
   
           This option selects the general module only, you need to select
           at least one vendor specific module as well.
   
- -        To compile this driver as a module, choose M here: the
- -        module will be called microcode.
+ +        To compile this driver as a module, choose M here: the module
+ +        will be called microcode.
   
   config MICROCODE_INTEL
- -      bool "Intel microcode patch loading support"
+ +      bool "Intel microcode loading support"
         depends on MICROCODE
         default MICROCODE
         select FW_LOADER
@@@ -1017,7 -1008,7 +1012,7 @@@
           <http://www.urbanmyth.org/microcode/>.
   
   config MICROCODE_AMD
- -      bool "AMD microcode patch loading support"
+ +      bool "AMD microcode loading support"
         depends on MICROCODE
         select FW_LOADER
         ---help---
@@@ -1163,12 -1154,10 +1158,12 @@@ config X86_PA
           consumes more pagetable space per process.
   
   config ARCH_PHYS_ADDR_T_64BIT
- -      def_bool X86_64 || X86_PAE
+ +      def_bool y
+ +      depends on X86_64 || X86_PAE
   
   config ARCH_DMA_ADDR_T_64BIT
- -      def_bool X86_64 || HIGHMEM64G
+ +      def_bool y
+ +      depends on X86_64 || HIGHMEM64G
   
   config DIRECT_GBPAGES
         bool "Enable 1GB pages for kernel pagetables" if EXPERT
@@@ -1291,8 -1280,8 +1286,8 @@@ config ARCH_SELECT_MEMORY_MODE
         depends on ARCH_SPARSEMEM_ENABLE
   
   config ARCH_MEMORY_PROBE
- -      def_bool X86_64
- -      depends on MEMORY_HOTPLUG
+ +      def_bool y
+ +      depends on X86_64 && MEMORY_HOTPLUG
   
   config ARCH_PROC_KCORE_TEXT
         def_bool y
@@@ -1493,17 -1482,6 +1488,17 @@@ config ARCH_RANDO
           If supported, this is a high bandwidth, cryptographically
           secure hardware random number generator.
   
+ +config X86_SMAP
+ +      def_bool y
+ +      prompt "Supervisor Mode Access Prevention" if EXPERT
+ +      ---help---
+ +        Supervisor Mode Access Prevention (SMAP) is a security
+ +        feature in newer Intel processors.  There is a small
+ +        performance cost if this is enabled and turned on; there is
+ +        also a small increase in the kernel size if this is enabled.
+ +
+ +        If unsure, say Y.
+ +
   config EFI
         bool "EFI runtime service support"
         depends on ACPI
@@@ -1992,6 -1970,7 +1987,6 @@@ config PCI_MMCONFI
   
   config PCI_CNB20LE_QUIRK
         bool "Read CNB20LE Host Bridge Windows" if EXPERT
- -      default n
         depends on PCI && EXPERIMENTAL
         help
           Read the PCI windows out of the CNB20LE host bridge. This allows
@@@ -2202,18 -2181,18 +2197,18 @@@ config COMPA
         depends on IA32_EMULATION || X86_X32
         select ARCH_WANT_OLD_COMPAT_IPC
   
+ +if COMPAT
   config COMPAT_FOR_U64_ALIGNMENT
- -      def_bool COMPAT
- -      depends on X86_64
+ +      def_bool y
   
   config SYSVIPC_COMPAT
         def_bool y
- -      depends on COMPAT && SYSVIPC
+ +      depends on SYSVIPC
   
   config KEYS_COMPAT
- -      bool
- -      depends on COMPAT && KEYS
- -      default y
+ +      def_bool y
+ +      depends on KEYS
+ +endif
   
   endmenu
   
diff --combined arch/x86/include/asm/kvm.h

index 41e08cb6a0924bbbc9b92c4ea8a2de7b4af206ef,521bf252e34b0854f55f439dbf22a33b95e9619e..a65ec29e6ffb0e4cbc69bdbaf353f207f0495c26
--- 1/arch/x86/include/asm/kvm.h
--- 2/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@@ -9,22 -9,6 +9,22 @@@
   #include <linux/types.h>
   #include <linux/ioctl.h>
   
+ +#define DE_VECTOR 0
+ +#define DB_VECTOR 1
+ +#define BP_VECTOR 3
+ +#define OF_VECTOR 4
+ +#define BR_VECTOR 5
+ +#define UD_VECTOR 6
+ +#define NM_VECTOR 7
+ +#define DF_VECTOR 8
+ +#define TS_VECTOR 10
+ +#define NP_VECTOR 11
+ +#define SS_VECTOR 12
+ +#define GP_VECTOR 13
+ +#define PF_VECTOR 14
+ +#define MF_VECTOR 16
+ +#define MC_VECTOR 18
+ +
   /* Select x86 specific features in <linux/kvm.h> */
   #define __KVM_HAVE_PIT
   #define __KVM_HAVE_IOAPIC
@@@ -41,6 -25,7 +41,7 @@@
   #define __KVM_HAVE_DEBUGREGS
   #define __KVM_HAVE_XSAVE
   #define __KVM_HAVE_XCRS
+ #define __KVM_HAVE_READONLY_MEM
   
   /* Architectural interrupt line count. */
   #define KVM_NR_INTERRUPTS 256
diff --combined arch/x86/include/asm/kvm_host.h

index 1eaa6b056670d19d965bbf46050e0a9266ddef7f,c9a91368fc5e0dd77597059f9e803613e4b24ebf..b2e11f4524354db111134fb28ef9dfd99f3edaee
--- 1/arch/x86/include/asm/kvm_host.h
--- 2/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -75,6 -75,22 +75,6 @@@
   #define KVM_HPAGE_MASK(x)     (~(KVM_HPAGE_SIZE(x) - 1))
   #define KVM_PAGES_PER_HPAGE(x)        (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
   
- -#define DE_VECTOR 0
- -#define DB_VECTOR 1
- -#define BP_VECTOR 3
- -#define OF_VECTOR 4
- -#define BR_VECTOR 5
- -#define UD_VECTOR 6
- -#define NM_VECTOR 7
- -#define DF_VECTOR 8
- -#define TS_VECTOR 10
- -#define NP_VECTOR 11
- -#define SS_VECTOR 12
- -#define GP_VECTOR 13
- -#define PF_VECTOR 14
- -#define MF_VECTOR 16
- -#define MC_VECTOR 18
- -
   #define SELECTOR_TI_MASK (1 << 2)
   #define SELECTOR_RPL_MASK 0x03
   
@@@ -271,10 -287,24 +271,24 @@@ struct kvm_mmu 
         union kvm_mmu_page_role base_role;
         bool direct_map;
   
+       /*
+        * Bitmap; bit set = permission fault
+        * Byte index: page fault error code [4:1]
+        * Bit index: pte permissions in ACC_* format
+        */
+       u8 permissions[16];
+ 
         u64 *pae_root;
         u64 *lm_root;
         u64 rsvd_bits_mask[2][4];
   
+       /*
+        * Bitmap: bit set = last pte in walk
+        * index[0:1]: level (zero-based)
+        * index[2]: pte.ps
+        */
+       u8 last_pte_bitmap;
+ 
         bool nx;
   
         u64 pdptrs[4]; /* pae */
@@@ -398,12 -428,15 +412,15 @@@ struct kvm_vcpu_arch 
         struct x86_emulate_ctxt emulate_ctxt;
         bool emulate_regs_need_sync_to_vcpu;
         bool emulate_regs_need_sync_from_vcpu;
+       int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
   
         gpa_t time;
         struct pvclock_vcpu_time_info hv_clock;
         unsigned int hw_tsc_khz;
         unsigned int time_offset;
         struct page *time_page;
+       /* set guest stopped flag in pvclock flags field */
+       bool pvclock_set_guest_stopped_request;
   
         struct {
                 u64 msr_val;
@@@ -438,6 -471,7 +455,7 @@@
         unsigned long dr6;
         unsigned long dr7;
         unsigned long eff_db[KVM_NR_DB_REGS];
+       unsigned long guest_debug_dr7;
   
         u64 mcg_cap;
         u64 mcg_status;
@@@ -484,14 -518,24 +502,24 @@@
   };
   
   struct kvm_lpage_info {
-       unsigned long rmap_pde;
         int write_count;
   };
   
   struct kvm_arch_memory_slot {
+       unsigned long *rmap[KVM_NR_PAGE_SIZES];
         struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
   };
   
+ struct kvm_apic_map {
+       struct rcu_head rcu;
+       u8 ldr_bits;
+       /* fields below are used to decode ldr values in different modes */
+       u32 cid_shift, cid_mask, lid_mask;
+       struct kvm_lapic *phys_map[256];
+       /* first index is cluster id, second is cpu id in a cluster */
+       struct kvm_lapic *logical_map[16][16];
+ };
+ 
   struct kvm_arch {
         unsigned int n_used_mmu_pages;
         unsigned int n_requested_mmu_pages;
@@@ -509,6 -553,8 +537,8 @@@
         struct kvm_ioapic *vioapic;
         struct kvm_pit *vpit;
         int vapics_in_nmi_mode;
+       struct mutex apic_map_lock;
+       struct kvm_apic_map *apic_map;
   
         unsigned int tss_addr;
         struct page *apic_access_page;
@@@ -602,8 -648,7 +632,7 @@@ struct kvm_x86_ops 
         void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
         void (*vcpu_put)(struct kvm_vcpu *vcpu);
   
-       void (*set_guest_debug)(struct kvm_vcpu *vcpu,
-                               struct kvm_guest_debug *dbg);
+       void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
         int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
         int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
         u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@@ -941,6 -986,7 +970,7 @@@ extern bool kvm_rebooting
   
   #define KVM_ARCH_WANT_MMU_NOTIFIER
   int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
   int kvm_age_hva(struct kvm *kvm, unsigned long hva);
   int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
   void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --combined arch/x86/kernel/Makefile

index 8d7a619718b5fac1b245985cbc185c108c5a4a94,7203298e0b832757eb858df88390342c24d57eff..a48ea05157d3bbcb8cb56c4a2f92bb45cad446e8
--- 1/arch/x86/kernel/Makefile
--- 2/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@@ -81,8 -81,7 +81,7 @@@ obj-$(CONFIG_DEBUG_RODATA_TEST)       += test
   obj-$(CONFIG_DEBUG_NX_TEST)   += test_nx.o
   obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
   
- obj-$(CONFIG_KVM_GUEST)               += kvm.o
- obj-$(CONFIG_KVM_CLOCK)               += kvmclock.o
+ obj-$(CONFIG_KVM_GUEST)               += kvm.o kvmclock.o
   obj-$(CONFIG_PARAVIRT)                += paravirt.o paravirt_patch_$(BITS).o
   obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
   obj-$(CONFIG_PARAVIRT_CLOCK)  += pvclock.o
@@@ -100,8 -99,6 +99,8 @@@ obj-$(CONFIG_SWIOTLB)                        += pci-swiotlb.
   obj-$(CONFIG_OF)                      += devicetree.o
   obj-$(CONFIG_UPROBES)                 += uprobes.o
   
+ +obj-$(CONFIG_PERF_EVENTS)             += perf_regs.o
+ +
   ###
   # 64 bit specific files
   ifeq ($(CONFIG_X86_64),y)
diff --combined arch/x86/kernel/setup.c

index 4f165479c4537cee4badf71a4862826936d7a228,b3386ae3438bef82283774db6c734704038adea6..d609be046b5749991c01919561055072d717f970
--- 1/arch/x86/kernel/setup.c
--- 2/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@@ -957,11 -957,13 +957,11 @@@ void __init setup_arch(char **cmdline_p
         initmem_init();
         memblock_find_dma_reserve();
   
- #ifdef CONFIG_KVM_CLOCK
+ #ifdef CONFIG_KVM_GUEST
         kvmclock_init();
   #endif
   
- -      x86_init.paging.pagetable_setup_start(swapper_pg_dir);
- -      paging_init();
- -      x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ +      x86_init.paging.pagetable_init();
   
         if (boot_cpu_data.cpuid_level >= 0) {
                 /* A CPU has %cr4 if and only if it has CPUID */
diff --combined arch/x86/kvm/vmx.c

index 851aa7c3b890f511fc350f2275a33801f58e8350,5d46c905e06fb01c12f66f0255c27b72fbf15b1c..ad6b1dd06f8b967356d4f081f4cde119f2a2bfe8
--- 1/arch/x86/kvm/vmx.c
--- 2/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@@ -127,6 -127,8 +127,8 @@@ module_param(ple_gap, int, S_IRUGO)
   static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
   module_param(ple_window, int, S_IRUGO);
   
+ extern const ulong vmx_return;
+ 
   #define NR_AUTOLOAD_MSRS 8
   #define VMCS02_POOL_SIZE 1
   
@@@ -405,16 -407,16 +407,16 @@@ struct vcpu_vmx 
         struct {
                 int vm86_active;
                 ulong save_rflags;
+               struct kvm_segment segs[8];
+       } rmode;
+       struct {
+               u32 bitmask; /* 4 bits per segment (1 bit per field) */
                 struct kvm_save_segment {
                         u16 selector;
                         unsigned long base;
                         u32 limit;
                         u32 ar;
-               } tr, es, ds, fs, gs;
-       } rmode;
-       struct {
-               u32 bitmask; /* 4 bits per segment (1 bit per field) */
-               struct kvm_save_segment seg[8];
+               } seg[8];
         } segment_cache;
         int vpid;
         bool emulation_required;
@@@ -450,7 -452,7 +452,7 @@@ static inline struct vcpu_vmx *to_vmx(s
   #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
                                 [number##_HIGH] = VMCS12_OFFSET(name)+4
   
- static unsigned short vmcs_field_to_offset_table[] = {
+ static const unsigned short vmcs_field_to_offset_table[] = {
         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
         FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
@@@ -596,10 -598,9 +598,9 @@@ static inline struct vmcs12 *get_vmcs12
   static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
   {
         struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+       if (is_error_page(page))
                 return NULL;
-       }
+ 
         return page;
   }
   
@@@ -667,7 -668,7 +668,7 @@@ static struct vmx_capability 
                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
         }
   
- static struct kvm_vmx_segment_field {
+ static const struct kvm_vmx_segment_field {
         unsigned selector;
         unsigned base;
         unsigned limit;
@@@ -1343,7 -1344,7 +1344,7 @@@ static bool update_transition_efer(stru
         guest_efer = vmx->vcpu.arch.efer;
   
         /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
          * outside long mode
          */
         ignore_bits = EFER_NX | EFER_SCE;
@@@ -1493,12 -1494,8 +1494,12 @@@ static void __vmx_load_host_state(struc
   #ifdef CONFIG_X86_64
         wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
   #endif
- -      if (user_has_fpu())
- -              clts();
+ +      /*
+ +       * If the FPU is not active (through the host task or
+ +       * the guest vcpu), then restore the cr0.TS bit.
+ +       */
+ +      if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
+ +              stts();
         load_gdt(&__get_cpu_var(host_gdt));
   }
   
@@@ -1995,7 -1992,7 +1996,7 @@@ static __init void nested_vmx_setup_ctl
   #endif
                 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
-               CPU_BASED_RDPMC_EXITING |
+               CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
         /*
          * We can allow some features even when not supported by the
@@@ -2291,16 -2288,6 +2292,6 @@@ static void vmx_cache_reg(struct kvm_vc
         }
   }
   
- static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
- {
-       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-               vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
-       else
-               vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- 
-       update_exception_bitmap(vcpu);
- }
- 
   static __init int cpu_has_kvm_support(void)
   {
         return cpu_has_vmx();
@@@ -2698,20 -2685,17 +2689,17 @@@ static __exit void hardware_unsetup(voi
         free_kvm_area();
   }
   
- static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
   {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       struct kvm_segment tmp = *save;
   
-       if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
-               vmcs_write16(sf->selector, save->selector);
-               vmcs_writel(sf->base, save->base);
-               vmcs_write32(sf->limit, save->limit);
-               vmcs_write32(sf->ar_bytes, save->ar);
-       } else {
-               u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
-                       << AR_DPL_SHIFT;
-               vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+       if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
+               tmp.base = vmcs_readl(sf->base);
+               tmp.selector = vmcs_read16(sf->selector);
+               tmp.s = 1;
         }
+       vmx_set_segment(vcpu, &tmp, seg);
   }
   
   static void enter_pmode(struct kvm_vcpu *vcpu)
@@@ -2724,10 -2708,7 +2712,7 @@@
   
         vmx_segment_cache_clear(vmx);
   
-       vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
-       vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
-       vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
-       vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
+       vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
   
         flags = vmcs_readl(GUEST_RFLAGS);
         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@@ -2742,10 -2723,10 +2727,10 @@@
         if (emulate_invalid_guest_state)
                 return;
   
-       fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
-       fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
-       fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
-       fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+       fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
   
         vmx_segment_cache_clear(vmx);
   
@@@ -2773,14 -2754,10 +2758,10 @@@ static gva_t rmode_tss_base(struct kvm 
         return kvm->arch.tss_addr;
   }
   
- static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+ static void fix_rmode_seg(int seg, struct kvm_segment *save)
   {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
   
-       save->selector = vmcs_read16(sf->selector);
-       save->base = vmcs_readl(sf->base);
-       save->limit = vmcs_read32(sf->limit);
-       save->ar = vmcs_read32(sf->ar_bytes);
         vmcs_write16(sf->selector, save->base >> 4);
         vmcs_write32(sf->base, save->base & 0xffff0);
         vmcs_write32(sf->limit, 0xffff);
@@@ -2800,9 -2777,16 +2781,16 @@@ static void enter_rmode(struct kvm_vcp
         if (enable_unrestricted_guest)
                 return;
   
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+       vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ 
         vmx->emulation_required = 1;
         vmx->rmode.vm86_active = 1;
   
+ 
         /*
          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
          * vcpu. Call it here with phys address pointing 16M below 4G.
@@@ -2817,14 -2801,8 +2805,8 @@@
   
         vmx_segment_cache_clear(vmx);
   
-       vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
-       vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
- 
-       vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
- 
-       vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
   
         flags = vmcs_readl(GUEST_RFLAGS);
@@@ -3117,35 -3095,24 +3099,24 @@@ static void vmx_get_segment(struct kvm_
                             struct kvm_segment *var, int seg)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_save_segment *save;
         u32 ar;
   
         if (vmx->rmode.vm86_active
             && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
                 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
-               || seg == VCPU_SREG_GS)
-           && !emulate_invalid_guest_state) {
-               switch (seg) {
-               case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
-               case VCPU_SREG_ES: save = &vmx->rmode.es; break;
-               case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
-               case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
-               case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
-               default: BUG();
-               }
-               var->selector = save->selector;
-               var->base = save->base;
-               var->limit = save->limit;
-               ar = save->ar;
+               || seg == VCPU_SREG_GS)) {
+               *var = vmx->rmode.segs[seg];
                 if (seg == VCPU_SREG_TR
                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
-                       goto use_saved_rmode_seg;
+                       return;
+               var->base = vmx_read_guest_seg_base(vmx, seg);
+               var->selector = vmx_read_guest_seg_selector(vmx, seg);
+               return;
         }
         var->base = vmx_read_guest_seg_base(vmx, seg);
         var->limit = vmx_read_guest_seg_limit(vmx, seg);
         var->selector = vmx_read_guest_seg_selector(vmx, seg);
         ar = vmx_read_guest_seg_ar(vmx, seg);
- use_saved_rmode_seg:
         if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
                 ar = 0;
         var->type = ar & 15;
@@@ -3227,23 -3194,21 +3198,21 @@@ static void vmx_set_segment(struct kvm_
                             struct kvm_segment *var, int seg)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
         u32 ar;
   
         vmx_segment_cache_clear(vmx);
   
         if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
                 vmcs_write16(sf->selector, var->selector);
-               vmx->rmode.tr.selector = var->selector;
-               vmx->rmode.tr.base = var->base;
-               vmx->rmode.tr.limit = var->limit;
-               vmx->rmode.tr.ar = vmx_segment_access_rights(var);
+               vmx->rmode.segs[VCPU_SREG_TR] = *var;
                 return;
         }
         vmcs_writel(sf->base, var->base);
         vmcs_write32(sf->limit, var->limit);
         vmcs_write16(sf->selector, var->selector);
         if (vmx->rmode.vm86_active && var->s) {
+               vmx->rmode.segs[seg] = *var;
                 /*
                  * Hack real-mode segments into vm86 compatibility.
                  */
@@@ -3258,7 -3223,7 +3227,7 @@@
          * qemu binaries.
          *   IA32 arch specifies that at the time of processor reset the
          * "Accessed" bit in the AR field of segment registers is 1. And qemu
-        * is setting it to 0 in the usedland code. This causes invalid guest
+        * is setting it to 0 in the userland code. This causes invalid guest
          * state vmexit when "unrestricted guest" mode is turned on.
          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
          * tree. Newer qemu binaries with that qemu fix would not need this
@@@ -3288,16 -3253,10 +3257,10 @@@
                                      vmcs_readl(GUEST_CS_BASE) >> 4);
                         break;
                 case VCPU_SREG_ES:
-                       fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
-                       break;
                 case VCPU_SREG_DS:
-                       fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
-                       break;
                 case VCPU_SREG_GS:
-                       fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
-                       break;
                 case VCPU_SREG_FS:
-                       fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+                       fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
                         break;
                 case VCPU_SREG_SS:
                         vmcs_write16(GUEST_SS_SELECTOR,
@@@ -3351,9 -3310,9 +3314,9 @@@ static bool rmode_segment_valid(struct 
   
         if (var.base != (var.selector << 4))
                 return false;
-       if (var.limit != 0xffff)
+       if (var.limit < 0xffff)
                 return false;
-       if (ar != 0xf3)
+       if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
                 return false;
   
         return true;
@@@ -3605,7 -3564,7 +3568,7 @@@ out
   
   static void seg_setup(int seg)
   {
-       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+       const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
         unsigned int ar;
   
         vmcs_write16(sf->selector, 0);
@@@ -3623,7 -3582,6 +3586,7 @@@
   
   static int alloc_apic_access_page(struct kvm *kvm)
   {
+ +      struct page *page;
         struct kvm_userspace_memory_region kvm_userspace_mem;
         int r = 0;
   
@@@ -3638,13 -3596,7 +3601,13 @@@
         if (r)
                 goto out;
   
- -      kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+ +      page = gfn_to_page(kvm, 0xfee00);
+ +      if (is_error_page(page)) {
+ +              r = -EFAULT;
+ +              goto out;
+ +      }
+ +
+ +      kvm->arch.apic_access_page = page;
   out:
         mutex_unlock(&kvm->slots_lock);
         return r;
@@@ -3652,7 -3604,6 +3615,7 @@@
   
   static int alloc_identity_pagetable(struct kvm *kvm)
   {
+ +      struct page *page;
         struct kvm_userspace_memory_region kvm_userspace_mem;
         int r = 0;
   
@@@ -3668,13 -3619,8 +3631,13 @@@
         if (r)
                 goto out;
   
- -      kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
- -                      kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+ +      page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+ +      if (is_error_page(page)) {
+ +              r = -EFAULT;
+ +              goto out;
+ +      }
+ +
+ +      kvm->arch.ept_identity_pagetable = page;
   out:
         mutex_unlock(&kvm->slots_lock);
         return r;
@@@ -3747,7 -3693,7 +3710,7 @@@ static void vmx_set_constant_host_state
         unsigned long tmpl;
         struct desc_ptr dt;
   
- -      vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
+ +      vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
         vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
   
@@@ -3770,8 -3716,7 +3733,7 @@@
         native_store_idt(&dt);
         vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
   
-       asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
-       vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+       vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
   
         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
@@@ -4005,8 -3950,6 +3967,6 @@@ static int vmx_vcpu_reset(struct kvm_vc
                 kvm_rip_write(vcpu, 0);
         kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
   
-       vmcs_writel(GUEST_DR7, 0x400);
- 
         vmcs_writel(GUEST_GDTR_BASE, 0);
         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
   
@@@ -4456,7 -4399,7 +4416,7 @@@ vmx_patch_hypercall(struct kvm_vcpu *vc
         hypercall[2] = 0xc1;
   }
   
- /* called to set cr0 as approriate for a mov-to-cr0 exit. */
+ /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
   static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
   {
         if (to_vmx(vcpu)->nested.vmxon &&
@@@ -4547,7 -4490,7 +4507,7 @@@ static int handle_cr(struct kvm_vcpu *v
                                 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
                                 return 0;
                         }
- -              };
+ +              }
                 break;
         case 2: /* clts */
                 handle_clts(vcpu);
@@@ -5701,7 -5644,7 +5661,7 @@@ static int handle_vmptrst(struct kvm_vc
    * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
    * to be done to userspace and return 0.
    */
- static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
@@@ -6229,17 -6172,10 +6189,10 @@@ static void atomic_switch_perf_msrs(str
                                         msrs[i].host);
   }
   
- #ifdef CONFIG_X86_64
- #define R "r"
- #define Q "q"
- #else
- #define R "e"
- #define Q "l"
- #endif
- 
   static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long debugctlmsr;
   
         if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@@ -6279,34 -6215,35 +6232,35 @@@
                 vmx_set_interrupt_shadow(vcpu, 0);
   
         atomic_switch_perf_msrs(vmx);
+       debugctlmsr = get_debugctlmsr();
   
         vmx->__launched = vmx->loaded_vmcs->launched;
         asm(
                 /* Store host registers */
-               "push %%"R"dx; push %%"R"bp;"
-               "push %%"R"cx \n\t" /* placeholder for guest rcx */
-               "push %%"R"cx \n\t"
-               "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+               "push %%" _ASM_DX "; push %%" _ASM_BP ";"
+               "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
+               "push %%" _ASM_CX " \n\t"
+               "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
                 "je 1f \n\t"
-               "mov %%"R"sp, %c[host_rsp](%0) \n\t"
+               "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
                 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
                 "1: \n\t"
                 /* Reload cr2 if changed */
-               "mov %c[cr2](%0), %%"R"ax \n\t"
-               "mov %%cr2, %%"R"dx \n\t"
-               "cmp %%"R"ax, %%"R"dx \n\t"
+               "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
+               "mov %%cr2, %%" _ASM_DX " \n\t"
+               "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
                 "je 2f \n\t"
-               "mov %%"R"ax, %%cr2 \n\t"
+               "mov %%" _ASM_AX", %%cr2 \n\t"
                 "2: \n\t"
                 /* Check if vmlaunch of vmresume is needed */
                 "cmpl $0, %c[launched](%0) \n\t"
                 /* Load guest registers.  Don't clobber flags. */
-               "mov %c[rax](%0), %%"R"ax \n\t"
-               "mov %c[rbx](%0), %%"R"bx \n\t"
-               "mov %c[rdx](%0), %%"R"dx \n\t"
-               "mov %c[rsi](%0), %%"R"si \n\t"
-               "mov %c[rdi](%0), %%"R"di \n\t"
-               "mov %c[rbp](%0), %%"R"bp \n\t"
+               "mov %c[rax](%0), %%" _ASM_AX " \n\t"
+               "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
+               "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
+               "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
+               "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
+               "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
   #ifdef CONFIG_X86_64
                 "mov %c[r8](%0),  %%r8  \n\t"
                 "mov %c[r9](%0),  %%r9  \n\t"
@@@ -6317,24 -6254,24 +6271,24 @@@
                 "mov %c[r14](%0), %%r14 \n\t"
                 "mov %c[r15](%0), %%r15 \n\t"
   #endif
-               "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+               "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
   
                 /* Enter guest mode */
-               "jne .Llaunched \n\t"
+               "jne 1f \n\t"
                 __ex(ASM_VMX_VMLAUNCH) "\n\t"
-               "jmp .Lkvm_vmx_return \n\t"
-               ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
-               ".Lkvm_vmx_return: "
+               "jmp 2f \n\t"
+               "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+               "2: "
                 /* Save guest registers, load host registers, keep flags */
-               "mov %0, %c[wordsize](%%"R"sp) \n\t"
+               "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
                 "pop %0 \n\t"
-               "mov %%"R"ax, %c[rax](%0) \n\t"
-               "mov %%"R"bx, %c[rbx](%0) \n\t"
-               "pop"Q" %c[rcx](%0) \n\t"
-               "mov %%"R"dx, %c[rdx](%0) \n\t"
-               "mov %%"R"si, %c[rsi](%0) \n\t"
-               "mov %%"R"di, %c[rdi](%0) \n\t"
-               "mov %%"R"bp, %c[rbp](%0) \n\t"
+               "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
+               "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
+               __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
+               "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
+               "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
+               "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
+               "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
   #ifdef CONFIG_X86_64
                 "mov %%r8,  %c[r8](%0) \n\t"
                 "mov %%r9,  %c[r9](%0) \n\t"
@@@ -6345,11 -6282,15 +6299,15 @@@
                 "mov %%r14, %c[r14](%0) \n\t"
                 "mov %%r15, %c[r15](%0) \n\t"
   #endif
-               "mov %%cr2, %%"R"ax   \n\t"
-               "mov %%"R"ax, %c[cr2](%0) \n\t"
+               "mov %%cr2, %%" _ASM_AX "   \n\t"
+               "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
   
-               "pop  %%"R"bp; pop  %%"R"dx \n\t"
+               "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
                 "setbe %c[fail](%0) \n\t"
+               ".pushsection .rodata \n\t"
+               ".global vmx_return \n\t"
+               "vmx_return: " _ASM_PTR " 2b \n\t"
+               ".popsection"
               : : "c"(vmx), "d"((unsigned long)HOST_RSP),
                 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
                 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
@@@ -6374,12 -6315,18 +6332,18 @@@
                 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
                 [wordsize]"i"(sizeof(ulong))
               : "cc", "memory"
-               , R"ax", R"bx", R"di", R"si"
   #ifdef CONFIG_X86_64
+               , "rax", "rbx", "rdi", "rsi"
                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+ #else
+               , "eax", "ebx", "edi", "esi"
   #endif
               );
   
+       /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+       if (debugctlmsr)
+               update_debugctlmsr(debugctlmsr);
+ 
   #ifndef CONFIG_X86_64
         /*
          * The sysexit path does not restore ds/es, so we must set them to
@@@ -6424,9 -6371,6 +6388,6 @@@
         vmx_complete_interrupts(vmx);
   }
   
- #undef R
- #undef Q
- 
   static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
   {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@@ -6592,7 -6536,7 +6553,7 @@@ static void vmx_cpuid_update(struct kvm
         /* Exposing INVPCID only when PCID is exposed */
         best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
         if (vmx_invpcid_supported() &&
- -          best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+ +          best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
             guest_cpuid_has_pcid(vcpu)) {
                 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
@@@ -6602,7 -6546,7 +6563,7 @@@
                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                              exec_control);
                 if (best)
- -                      best->ecx &= ~bit(X86_FEATURE_INVPCID);
+ +                      best->ebx &= ~bit(X86_FEATURE_INVPCID);
         }
   }
   
@@@ -7281,7 -7225,7 +7242,7 @@@ static struct kvm_x86_ops vmx_x86_ops 
         .vcpu_load = vmx_vcpu_load,
         .vcpu_put = vmx_vcpu_put,
   
-       .set_guest_debug = set_guest_debug,
+       .update_db_bp_intercept = update_exception_bitmap,
         .get_msr = vmx_get_msr,
         .set_msr = vmx_set_msr,
         .get_segment_base = vmx_get_segment_base,
diff --combined arch/x86/kvm/x86.c

index 1f09552572fa15709b51022fdbaa8044b3fdb73d,b16d4a5bfa41610f10a292468c87a4be6ee1c50b..1eefebe5d72758873df0d13e4ae8c686a7d016ee
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -246,20 -246,14 +246,14 @@@ static void drop_user_return_notifiers(
   
   u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
   {
-       if (irqchip_in_kernel(vcpu->kvm))
-               return vcpu->arch.apic_base;
-       else
-               return vcpu->arch.apic_base;
+       return vcpu->arch.apic_base;
   }
   EXPORT_SYMBOL_GPL(kvm_get_apic_base);
   
   void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
   {
         /* TODO: reserve bits check */
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_base(vcpu, data);
-       else
-               vcpu->arch.apic_base = data;
+       kvm_lapic_set_base(vcpu, data);
   }
   EXPORT_SYMBOL_GPL(kvm_set_apic_base);
   
@@@ -698,6 -692,18 +692,18 @@@ unsigned long kvm_get_cr8(struct kvm_vc
   }
   EXPORT_SYMBOL_GPL(kvm_get_cr8);
   
+ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+ {
+       unsigned long dr7;
+ 
+       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+               dr7 = vcpu->arch.guest_debug_dr7;
+       else
+               dr7 = vcpu->arch.dr7;
+       kvm_x86_ops->set_dr7(vcpu, dr7);
+       vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+ }
+ 
   static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
   {
         switch (dr) {
@@@ -723,10 -729,7 +729,7 @@@
                 if (val & 0xffffffff00000000ULL)
                         return -1; /* #GP */
                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-               if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-                       kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
-                       vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
-               }
+               kvm_update_dr7(vcpu);
                 break;
         }
   
@@@ -823,7 -826,7 +826,7 @@@ static u32 msrs_to_save[] = 
   
   static unsigned num_msrs_to_save;
   
- static u32 emulated_msrs[] = {
+ static const u32 emulated_msrs[] = {
         MSR_IA32_TSCDEADLINE,
         MSR_IA32_MISC_ENABLE,
         MSR_IA32_MCG_STATUS,
@@@ -1097,7 -1100,7 +1100,7 @@@ void kvm_write_tsc(struct kvm_vcpu *vcp
                  * For each generation, we track the original measured
                  * nanosecond time, offset, and write, so if TSCs are in
                  * sync, we can match exact offset, and if not, we can match
-                * exact software computaion in compute_guest_tsc()
+                * exact software computation in compute_guest_tsc()
                  *
                  * These values are tracked in kvm->arch.cur_xxx variables.
                  */
@@@ -1140,6 -1143,7 +1143,7 @@@ static int kvm_guest_time_update(struc
         unsigned long this_tsc_khz;
         s64 kernel_ns, max_kernel_ns;
         u64 tsc_timestamp;
+       u8 pvclock_flags;
   
         /* Keep irq disabled to prevent changes to the clock */
         local_irq_save(flags);
@@@ -1221,7 -1225,14 +1225,14 @@@
         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
         vcpu->last_kernel_ns = kernel_ns;
         vcpu->last_guest_tsc = tsc_timestamp;
-       vcpu->hv_clock.flags = 0;
+ 
+       pvclock_flags = 0;
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+ 
+       vcpu->hv_clock.flags = pvclock_flags;
   
         /*
          * The interface expects us to write an even number signaling that the
@@@ -1504,7 -1515,7 +1515,7 @@@ static int kvm_pv_enable_async_pf(struc
   {
         gpa_t gpa = data & ~0x3f;
   
-       /* Bits 2:5 are resrved, Should be zero */
+       /* Bits 2:5 are reserved, Should be zero */
         if (data & 0x3c)
                 return 1;
   
@@@ -1639,10 -1650,9 +1650,9 @@@ int kvm_set_msr_common(struct kvm_vcpu 
                 vcpu->arch.time_page =
                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
   
-               if (is_error_page(vcpu->arch.time_page)) {
-                       kvm_release_page_clean(vcpu->arch.time_page);
+               if (is_error_page(vcpu->arch.time_page))
                         vcpu->arch.time_page = NULL;
-               }
+ 
                 break;
         }
         case MSR_KVM_ASYNC_PF_EN:
@@@ -1727,7 -1737,7 +1737,7 @@@
                  * Ignore all writes to this no longer documented MSR.
                  * Writes are only relevant for old K7 processors,
                  * all pre-dating SVM, but a recommended workaround from
-                * AMD for these chips. It is possible to speicify the
+                * AMD for these chips. It is possible to specify the
                  * affected processor models on the command line, hence
                  * the need to ignore the workaround.
                  */
@@@ -2000,9 -2010,6 +2010,9 @@@ int kvm_get_msr_common(struct kvm_vcpu 
         case MSR_KVM_STEAL_TIME:
                 data = vcpu->arch.st.msr_val;
                 break;
+ +      case MSR_KVM_PV_EOI_EN:
+ +              data = vcpu->arch.pv_eoi.msr_val;
+ +              break;
         case MSR_IA32_P5_MC_ADDR:
         case MSR_IA32_P5_MC_TYPE:
         case MSR_IA32_MCG_CAP:
@@@ -2177,6 -2184,8 +2187,8 @@@ int kvm_dev_ioctl_check_extension(long 
         case KVM_CAP_GET_TSC_KHZ:
         case KVM_CAP_PCI_2_3:
         case KVM_CAP_KVMCLOCK_CTRL:
+       case KVM_CAP_READONLY_MEM:
+       case KVM_CAP_IRQFD_RESAMPLE:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@@ -2358,8 -2367,7 +2370,7 @@@ static int kvm_vcpu_ioctl_get_lapic(str
   static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                     struct kvm_lapic_state *s)
   {
-       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       kvm_apic_post_state_restore(vcpu);
+       kvm_apic_post_state_restore(vcpu, s);
         update_cr8_intercept(vcpu);
   
         return 0;
@@@ -2368,7 -2376,7 +2379,7 @@@
   static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                                     struct kvm_interrupt *irq)
   {
-       if (irq->irq < 0 || irq->irq >= 256)
+       if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
                 return -EINVAL;
         if (irqchip_in_kernel(vcpu->kvm))
                 return -ENXIO;
@@@ -2635,11 -2643,9 +2646,9 @@@ static int kvm_vcpu_ioctl_x86_set_xcrs(
    */
   static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
   {
-       struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
         if (!vcpu->arch.time_page)
                 return -EINVAL;
-       src->flags |= PVCLOCK_GUEST_STOPPED;
-       mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+       vcpu->arch.pvclock_set_guest_stopped_request = true;
         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
         return 0;
   }
@@@ -3090,7 -3096,7 +3099,7 @@@ static int kvm_vm_ioctl_reinject(struc
         if (!kvm->arch.vpit)
                 return -ENXIO;
         mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
         return 0;
   }
@@@ -3173,6 -3179,16 +3182,16 @@@ out
         return r;
   }
   
+ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+ {
+       if (!irqchip_in_kernel(kvm))
+               return -ENXIO;
+ 
+       irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+                                       irq_event->irq, irq_event->level);
+       return 0;
+ }
+ 
   long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
   {
@@@ -3279,29 -3295,6 +3298,6 @@@
         create_pit_unlock:
                 mutex_unlock(&kvm->slots_lock);
                 break;
-       case KVM_IRQ_LINE_STATUS:
-       case KVM_IRQ_LINE: {
-               struct kvm_irq_level irq_event;
- 
-               r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
-                       goto out;
-               r = -ENXIO;
-               if (irqchip_in_kernel(kvm)) {
-                       __s32 status;
-                       status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                       irq_event.irq, irq_event.level);
-                       if (ioctl == KVM_IRQ_LINE_STATUS) {
-                               r = -EFAULT;
-                               irq_event.status = status;
-                               if (copy_to_user(argp, &irq_event,
-                                                       sizeof irq_event))
-                                       goto out;
-                       }
-                       r = 0;
-               }
-               break;
-       }
         case KVM_GET_IRQCHIP: {
                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
                 struct kvm_irqchip *chip;
@@@ -3689,20 -3682,17 +3685,17 @@@ static int vcpu_mmio_gva_to_gpa(struct 
                                 gpa_t *gpa, struct x86_exception *exception,
                                 bool write)
   {
-       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+               | (write ? PFERR_WRITE_MASK : 0);
   
-       if (vcpu_match_mmio_gva(vcpu, gva) &&
-                 check_write_user_access(vcpu, write, access,
-                 vcpu->arch.access)) {
+       if (vcpu_match_mmio_gva(vcpu, gva)
+           && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
                 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                         (gva & (PAGE_SIZE - 1));
                 trace_vcpu_match_mmio(gva, *gpa, write, false);
                 return 1;
         }
   
-       if (write)
-               access |= PFERR_WRITE_MASK;
- 
         *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
   
         if (*gpa == UNMAPPED_GVA)
@@@ -3790,14 -3780,14 +3783,14 @@@ static int write_exit_mmio(struct kvm_v
         return X86EMUL_CONTINUE;
   }
   
- static struct read_write_emulator_ops read_emultor = {
+ static const struct read_write_emulator_ops read_emultor = {
         .read_write_prepare = read_prepare,
         .read_write_emulate = read_emulate,
         .read_write_mmio = vcpu_mmio_read,
         .read_write_exit_mmio = read_exit_mmio,
   };
   
- static struct read_write_emulator_ops write_emultor = {
+ static const struct read_write_emulator_ops write_emultor = {
         .read_write_emulate = write_emulate,
         .read_write_mmio = write_mmio,
         .read_write_exit_mmio = write_exit_mmio,
@@@ -3808,7 -3798,7 +3801,7 @@@ static int emulator_read_write_onepage(
                                        unsigned int bytes,
                                        struct x86_exception *exception,
                                        struct kvm_vcpu *vcpu,
-                                      struct read_write_emulator_ops *ops)
+                                      const struct read_write_emulator_ops *ops)
   {
         gpa_t gpa;
         int handled, ret;
@@@ -3857,7 -3847,7 +3850,7 @@@ mmio
   int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
                         void *val, unsigned int bytes,
                         struct x86_exception *exception,
-                       struct read_write_emulator_ops *ops)
+                       const struct read_write_emulator_ops *ops)
   {
         struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
         gpa_t gpa;
@@@ -3962,10 -3952,8 +3955,8 @@@ static int emulator_cmpxchg_emulated(st
                 goto emul_write;
   
         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+       if (is_error_page(page))
                 goto emul_write;
-       }
   
         kaddr = kmap_atomic(page);
         kaddr += offset_in_page(gpa);
@@@ -4332,7 -4320,19 +4323,19 @@@ static void emulator_get_cpuid(struct x
         kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
   }
   
- static struct x86_emulate_ops emulate_ops = {
+ static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+ {
+       return kvm_register_read(emul_to_vcpu(ctxt), reg);
+ }
+ 
+ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+ {
+       kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+ }
+ 
+ static const struct x86_emulate_ops emulate_ops = {
+       .read_gpr            = emulator_read_gpr,
+       .write_gpr           = emulator_write_gpr,
         .read_std            = kvm_read_guest_virt_system,
         .write_std           = kvm_write_guest_virt_system,
         .fetch               = kvm_fetch_guest_virt,
@@@ -4367,14 -4367,6 +4370,6 @@@
         .get_cpuid           = emulator_get_cpuid,
   };
   
- static void cache_all_regs(struct kvm_vcpu *vcpu)
- {
-       kvm_register_read(vcpu, VCPU_REGS_RAX);
-       kvm_register_read(vcpu, VCPU_REGS_RSP);
-       kvm_register_read(vcpu, VCPU_REGS_RIP);
-       vcpu->arch.regs_dirty = ~0;
- }
- 
   static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
   {
         u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
@@@ -4401,12 -4393,10 +4396,10 @@@ static void inject_emulated_exception(s
                 kvm_queue_exception(vcpu, ctxt->exception.vector);
   }
   
- static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
-                             const unsigned long *regs)
+ static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
   {
         memset(&ctxt->twobyte, 0,
-              (void *)&ctxt->regs - (void *)&ctxt->twobyte);
-       memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+              (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
   
         ctxt->fetch.start = 0;
         ctxt->fetch.end = 0;
@@@ -4421,14 -4411,6 +4414,6 @@@ static void init_emulate_ctxt(struct kv
         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
         int cs_db, cs_l;
   
-       /*
-        * TODO: fix emulate.c to use guest_read/write_register
-        * instead of direct ->regs accesses, can save hundred cycles
-        * on Intel for instructions that don't read/change RSP, for
-        * for example.
-        */
-       cache_all_regs(vcpu);
- 
         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
   
         ctxt->eflags = kvm_get_rflags(vcpu);
@@@ -4440,7 -4422,7 +4425,7 @@@
                                                           X86EMUL_MODE_PROT16;
         ctxt->guest_mode = is_guest_mode(vcpu);
   
-       init_decode_cache(ctxt, vcpu->arch.regs);
+       init_decode_cache(ctxt);
         vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
   }
   
@@@ -4460,7 -4442,6 +4445,6 @@@ int kvm_inject_realmode_interrupt(struc
                 return EMULATE_FAIL;
   
         ctxt->eip = ctxt->_eip;
-       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
         kvm_rip_write(vcpu, ctxt->eip);
         kvm_set_rflags(vcpu, ctxt->eflags);
   
@@@ -4493,13 -4474,14 +4477,14 @@@ static int handle_emulation_failure(str
   static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
   {
         gpa_t gpa;
+       pfn_t pfn;
   
         if (tdp_enabled)
                 return false;
   
         /*
          * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-entetr the
+        * and it failed try to unshadow page and re-enter the
          * guest to let CPU execute the instruction.
          */
         if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@@ -4510,8 -4492,17 +4495,17 @@@
         if (gpa == UNMAPPED_GVA)
                 return true; /* let cpu generate fault */
   
-       if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+       /*
+        * Do not retry the unhandleable instruction if it faults on the
+        * readonly host memory, otherwise it will go into an infinite loop:
+        * retry instruction -> write #PF -> emulation fail -> retry
+        * instruction -> ...
+        */
+       pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+       if (!is_error_pfn(pfn)) {
+               kvm_release_pfn_clean(pfn);
                 return true;
+       }
   
         return false;
   }
@@@ -4560,6 -4551,9 +4554,9 @@@ static bool retry_instruction(struct x8
         return true;
   }
   
+ static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+ static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+ 
   int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                             unsigned long cr2,
                             int emulation_type,
@@@ -4608,7 -4602,7 +4605,7 @@@
            changes registers values  during IO operation */
         if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
                 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-               memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
+               emulator_invalidate_register_cache(ctxt);
         }
   
   restart:
@@@ -4630,13 -4624,16 +4627,16 @@@
         } else if (vcpu->arch.pio.count) {
                 if (!vcpu->arch.pio.in)
                         vcpu->arch.pio.count = 0;
-               else
+               else {
                         writeback = false;
+                       vcpu->arch.complete_userspace_io = complete_emulated_pio;
+               }
                 r = EMULATE_DO_MMIO;
         } else if (vcpu->mmio_needed) {
                 if (!vcpu->mmio_is_write)
                         writeback = false;
                 r = EMULATE_DO_MMIO;
+               vcpu->arch.complete_userspace_io = complete_emulated_mmio;
         } else if (r == EMULATION_RESTART)
                 goto restart;
         else
@@@ -4646,7 -4643,6 +4646,6 @@@
                 toggle_interruptibility(vcpu, ctxt->interruptibility);
                 kvm_set_rflags(vcpu, ctxt->eflags);
                 kvm_make_request(KVM_REQ_EVENT, vcpu);
-               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
                 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                 kvm_rip_write(vcpu, ctxt->eip);
         } else
@@@ -4929,6 -4925,7 +4928,7 @@@ int kvm_arch_init(void *opaque
         if (cpu_has_xsave)
                 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
   
+       kvm_lapic_init();
         return 0;
   
   out:
@@@ -5113,20 -5110,17 +5113,20 @@@ static void post_kvm_run_save(struct kv
                         !kvm_event_needs_reinjection(vcpu);
   }
   
- -static void vapic_enter(struct kvm_vcpu *vcpu)
+ +static int vapic_enter(struct kvm_vcpu *vcpu)
   {
         struct kvm_lapic *apic = vcpu->arch.apic;
         struct page *page;
   
         if (!apic || !apic->vapic_addr)
- -              return;
+ +              return 0;
   
         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+ +      if (is_error_page(page))
+ +              return -EFAULT;
   
         vcpu->arch.apic->vapic_page = page;
+ +      return 0;
   }
   
   static void vapic_exit(struct kvm_vcpu *vcpu)
@@@ -5433,11 -5427,7 +5433,11 @@@ static int __vcpu_run(struct kvm_vcpu *
         }
   
         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- -      vapic_enter(vcpu);
+ +      r = vapic_enter(vcpu);
+ +      if (r) {
+ +              srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ +              return r;
+ +      }
   
         r = 1;
         while (r > 0) {
@@@ -5499,6 -5489,24 +5499,24 @@@
         return r;
   }
   
+ static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+ {
+       int r;
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+       r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       if (r != EMULATE_DONE)
+               return 0;
+       return 1;
+ }
+ 
+ static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+ {
+       BUG_ON(!vcpu->arch.pio.count);
+ 
+       return complete_emulated_io(vcpu);
+ }
+ 
   /*
    * Implements the following, as a state machine:
    *
@@@ -5515,47 -5523,37 +5533,37 @@@
    *      copy data
    *      exit
    */
- static int complete_mmio(struct kvm_vcpu *vcpu)
+ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
   {
         struct kvm_run *run = vcpu->run;
         struct kvm_mmio_fragment *frag;
-       int r;
   
-       if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
-               return 1;
+       BUG_ON(!vcpu->mmio_needed);
   
-       if (vcpu->mmio_needed) {
-               /* Complete previous fragment */
-               frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
-               if (!vcpu->mmio_is_write)
-                       memcpy(frag->data, run->mmio.data, frag->len);
-               if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
-                       vcpu->mmio_needed = 0;
-                       if (vcpu->mmio_is_write)
-                               return 1;
-                       vcpu->mmio_read_completed = 1;
-                       goto done;
-               }
-               /* Initiate next fragment */
-               ++frag;
-               run->exit_reason = KVM_EXIT_MMIO;
-               run->mmio.phys_addr = frag->gpa;
+       /* Complete previous fragment */
+       frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+       if (!vcpu->mmio_is_write)
+               memcpy(frag->data, run->mmio.data, frag->len);
+       if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+               vcpu->mmio_needed = 0;
                 if (vcpu->mmio_is_write)
-                       memcpy(run->mmio.data, frag->data, frag->len);
-               run->mmio.len = frag->len;
-               run->mmio.is_write = vcpu->mmio_is_write;
-               return 0;
- 
-       }
- done:
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-       if (r != EMULATE_DONE)
-               return 0;
-       return 1;
+                       return 1;
+               vcpu->mmio_read_completed = 1;
+               return complete_emulated_io(vcpu);
+       }
+       /* Initiate next fragment */
+       ++frag;
+       run->exit_reason = KVM_EXIT_MMIO;
+       run->mmio.phys_addr = frag->gpa;
+       if (vcpu->mmio_is_write)
+               memcpy(run->mmio.data, frag->data, frag->len);
+       run->mmio.len = frag->len;
+       run->mmio.is_write = vcpu->mmio_is_write;
+       vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+       return 0;
   }
   
+ 
   int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
   {
         int r;
@@@ -5582,9 -5580,14 +5590,14 @@@
                 }
         }
   
-       r = complete_mmio(vcpu);
-       if (r <= 0)
-               goto out;
+       if (unlikely(vcpu->arch.complete_userspace_io)) {
+               int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+               vcpu->arch.complete_userspace_io = NULL;
+               r = cui(vcpu);
+               if (r <= 0)
+                       goto out;
+       } else
+               WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
   
         r = __vcpu_run(vcpu);
   
@@@ -5602,12 -5605,11 +5615,11 @@@ int kvm_arch_vcpu_ioctl_get_regs(struc
                 /*
                  * We are here if userspace calls get_regs() in the middle of
                  * instruction emulation. Registers state needs to be copied
-                * back from emulation context to vcpu. Usrapace shouldn't do
+                * back from emulation context to vcpu. Userspace shouldn't do
                  * that usually, but some bad designed PV devices (vmware
                  * backdoor interface) need this to work
                  */
-               struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+               emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
                 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
         }
         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@@ -5747,7 -5749,6 +5759,6 @@@ int kvm_task_switch(struct kvm_vcpu *vc
         if (ret)
                 return EMULATE_FAIL;
   
-       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
         kvm_rip_write(vcpu, ctxt->eip);
         kvm_set_rflags(vcpu, ctxt->eflags);
         kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -5799,7 -5800,7 +5810,7 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
         if (mmu_reset_needed)
                 kvm_mmu_reset_context(vcpu);
   
-       max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+       max_bits = KVM_NR_INTERRUPTS;
         pending_vec = find_first_bit(
                 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
         if (pending_vec < max_bits) {
@@@ -5859,13 -5860,12 +5870,12 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
-               vcpu->arch.switch_db_regs =
-                       (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+               vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
         } else {
                 for (i = 0; i < KVM_NR_DB_REGS; i++)
                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-               vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
         }
+       kvm_update_dr7(vcpu);
   
         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
@@@ -5877,7 -5877,7 +5887,7 @@@
          */
         kvm_set_rflags(vcpu, rflags);
   
-       kvm_x86_ops->set_guest_debug(vcpu, dbg);
+       kvm_x86_ops->update_db_bp_intercept(vcpu);
   
         r = 0;
   
@@@ -5979,7 -5979,7 +5989,7 @@@ void kvm_load_guest_fpu(struct kvm_vcp
          */
         kvm_put_guest_xcr0(vcpu);
         vcpu->guest_fpu_loaded = 1;
- -      unlazy_fpu(current);
+ +      __kernel_fpu_begin();
         fpu_restore_checking(&vcpu->arch.guest_fpu);
         trace_kvm_fpu(1);
   }
@@@ -5993,7 -5993,6 +6003,7 @@@ void kvm_put_guest_fpu(struct kvm_vcpu 
   
         vcpu->guest_fpu_loaded = 0;
         fpu_save_init(&vcpu->arch.guest_fpu);
+ +      __kernel_fpu_end();
         ++vcpu->stat.fpu_reload;
         kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
         trace_kvm_fpu(0);
@@@ -6023,7 -6022,9 +6033,9 @@@ int kvm_arch_vcpu_setup(struct kvm_vcp
         int r;
   
         vcpu->arch.mtrr_state.have_fixed = 1;
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       if (r)
+               return r;
         r = kvm_arch_vcpu_reset(vcpu);
         if (r == 0)
                 r = kvm_mmu_setup(vcpu);
@@@ -6034,9 -6035,11 +6046,11 @@@
   
   void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
   {
+       int r;
         vcpu->arch.apf.msr_val = 0;
   
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
         kvm_mmu_unload(vcpu);
         vcpu_put(vcpu);
   
@@@ -6050,10 -6053,10 +6064,10 @@@ int kvm_arch_vcpu_reset(struct kvm_vcp
         vcpu->arch.nmi_pending = 0;
         vcpu->arch.nmi_injected = false;
   
-       vcpu->arch.switch_db_regs = 0;
         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
         vcpu->arch.dr6 = DR6_FIXED_1;
         vcpu->arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(vcpu);
   
         kvm_make_request(KVM_REQ_EVENT, vcpu);
         vcpu->arch.apf.msr_val = 0;
@@@ -6132,7 -6135,7 +6146,7 @@@ int kvm_arch_hardware_enable(void *garb
          * as we reset last_host_tsc on all VCPUs to stop this from being
          * called multiple times (one for each physical CPU bringup).
          *
-        * Platforms with unnreliable TSCs don't have to deal with this, they
+        * Platforms with unreliable TSCs don't have to deal with this, they
          * will be compensated by the logic in vcpu_load, which sets the TSC to
          * catchup mode.  This will catchup all VCPUs to real time, but cannot
          * guarantee that they stay in perfect synchronization.
@@@ -6185,6 -6188,8 +6199,8 @@@ bool kvm_vcpu_compatible(struct kvm_vcp
         return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
   }
   
+ struct static_key kvm_no_apic_vcpu __read_mostly;
+ 
   int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
   {
         struct page *page;
@@@ -6217,7 -6222,8 +6233,8 @@@
                 r = kvm_create_lapic(vcpu);
                 if (r < 0)
                         goto fail_mmu_destroy;
-       }
+       } else
+               static_key_slow_inc(&kvm_no_apic_vcpu);
   
         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
                                        GFP_KERNEL);
@@@ -6257,6 -6263,8 +6274,8 @@@ void kvm_arch_vcpu_uninit(struct kvm_vc
         kvm_mmu_destroy(vcpu);
         srcu_read_unlock(&vcpu->kvm->srcu, idx);
         free_page((unsigned long)vcpu->arch.pio_data);
+       if (!irqchip_in_kernel(vcpu->kvm))
+               static_key_slow_dec(&kvm_no_apic_vcpu);
   }
   
   int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@@ -6269,15 -6277,21 +6288,21 @@@
   
         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+       /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+       set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+               &kvm->arch.irq_sources_bitmap);
   
         raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+       mutex_init(&kvm->arch.apic_map_lock);
   
         return 0;
   }
   
   static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
   {
-       vcpu_load(vcpu);
+       int r;
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
         kvm_mmu_unload(vcpu);
         vcpu_put(vcpu);
   }
@@@ -6321,6 -6335,7 +6346,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
                 put_page(kvm->arch.apic_access_page);
         if (kvm->arch.ept_identity_pagetable)
                 put_page(kvm->arch.ept_identity_pagetable);
+       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
   }
   
   void kvm_arch_free_memslot(struct kvm_memory_slot *free,
@@@ -6328,10 -6343,18 +6354,18 @@@
   {
         int i;
   
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-               if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
-                       kvm_kvfree(free->arch.lpage_info[i]);
-                       free->arch.lpage_info[i] = NULL;
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+                       kvm_kvfree(free->arch.rmap[i]);
+                       free->arch.rmap[i] = NULL;
+               }
+               if (i == 0)
+                       continue;
+ 
+               if (!dont || free->arch.lpage_info[i - 1] !=
+                            dont->arch.lpage_info[i - 1]) {
+                       kvm_kvfree(free->arch.lpage_info[i - 1]);
+                       free->arch.lpage_info[i - 1] = NULL;
                 }
         }
   }
@@@ -6340,23 -6363,30 +6374,30 @@@ int kvm_arch_create_memslot(struct kvm_
   {
         int i;
   
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                 unsigned long ugfn;
                 int lpages;
-               int level = i + 2;
+               int level = i + 1;
   
                 lpages = gfn_to_index(slot->base_gfn + npages - 1,
                                       slot->base_gfn, level) + 1;
   
-               slot->arch.lpage_info[i] =
-                       kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
-               if (!slot->arch.lpage_info[i])
+               slot->arch.rmap[i] =
+                       kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+               if (!slot->arch.rmap[i])
+                       goto out_free;
+               if (i == 0)
+                       continue;
+ 
+               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+                                       sizeof(*slot->arch.lpage_info[i - 1]));
+               if (!slot->arch.lpage_info[i - 1])
                         goto out_free;
   
                 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i][0].write_count = 1;
+                       slot->arch.lpage_info[i - 1][0].write_count = 1;
                 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
                 ugfn = slot->userspace_addr >> PAGE_SHIFT;
                 /*
                  * If the gfn and userspace address are not aligned wrt each
@@@ -6368,16 -6398,21 +6409,21 @@@
                         unsigned long j;
   
                         for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i][j].write_count = 1;
+                               slot->arch.lpage_info[i - 1][j].write_count = 1;
                 }
         }
   
         return 0;
   
   out_free:
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-               kvm_kvfree(slot->arch.lpage_info[i]);
-               slot->arch.lpage_info[i] = NULL;
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               kvm_kvfree(slot->arch.rmap[i]);
+               slot->arch.rmap[i] = NULL;
+               if (i == 0)
+                       continue;
+ 
+               kvm_kvfree(slot->arch.lpage_info[i - 1]);
+               slot->arch.lpage_info[i - 1] = NULL;
         }
         return -ENOMEM;
   }
@@@ -6396,10 -6431,10 +6442,10 @@@ int kvm_arch_prepare_memory_region(stru
                 map_flags = MAP_SHARED | MAP_ANONYMOUS;
   
         /*To keep backward compatibility with older userspace,
-        *x86 needs to hanlde !user_alloc case.
+        *x86 needs to handle !user_alloc case.
          */
         if (!user_alloc) {
-               if (npages && !old.rmap) {
+               if (npages && !old.npages) {
                         unsigned long userspace_addr;
   
                         userspace_addr = vm_mmap(NULL, 0,
@@@ -6427,7 -6462,7 +6473,7 @@@ void kvm_arch_commit_memory_region(stru
   
         int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
   
-       if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+       if (!user_alloc && !old.user_alloc && old.npages && !npages) {
                 int ret;
   
                 ret = vm_munmap(old.userspace_addr,
@@@ -6446,14 -6481,28 +6492,28 @@@
                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
         spin_unlock(&kvm->mmu_lock);
+       /*
+        * If memory slot is created, or moved, we need to clear all
+        * mmio sptes.
+        */
+       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+               kvm_mmu_zap_all(kvm);
+               kvm_reload_remote_mmus(kvm);
+       }
   }
   
- void kvm_arch_flush_shadow(struct kvm *kvm)
+ void kvm_arch_flush_shadow_all(struct kvm *kvm)
   {
         kvm_mmu_zap_all(kvm);
         kvm_reload_remote_mmus(kvm);
   }
   
+ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
+ {
+       kvm_arch_flush_shadow_all(kvm);
+ }
+ 
   int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
   {
         return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
diff --combined include/linux/kvm_host.h

index 8a59e0abe5faf6c6c3ba2069b9c1b32b65017cf1,2850656e2e96a18b464bab0a0de3410dd6b17867..93bfc9f9815c7fa178ad70b555e7a2afb86b3f02
--- 1/include/linux/kvm_host.h
--- 2/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@@ -21,6 -21,7 +21,7 @@@
   #include <linux/slab.h>
   #include <linux/rcupdate.h>
   #include <linux/ratelimit.h>
+ #include <linux/err.h>
   #include <asm/signal.h>
   
   #include <linux/kvm.h>
@@@ -34,6 -35,13 +35,13 @@@
   #define KVM_MMIO_SIZE 8
   #endif
   
+ /*
+  * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
+  * in kvm, other bits are visible for userspace which are defined in
+  * include/linux/kvm.h.
+  */
+ #define KVM_MEMSLOT_INVALID   (1UL << 16)
+ 
   /*
    * If we support unaligned MMIO, at most one fragment will be split into two:
    */
@@@ -48,6 -56,47 +56,47 @@@
   #define KVM_MAX_MMIO_FRAGMENTS \
         (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
   
+ /*
+  * For the normal pfn, the highest 12 bits should be zero,
+  * so we can mask these bits to indicate the error.
+  */
+ #define KVM_PFN_ERR_MASK      (0xfffULL << 52)
+ 
+ #define KVM_PFN_ERR_FAULT     (KVM_PFN_ERR_MASK)
+ #define KVM_PFN_ERR_HWPOISON  (KVM_PFN_ERR_MASK + 1)
+ #define KVM_PFN_ERR_BAD               (KVM_PFN_ERR_MASK + 2)
+ #define KVM_PFN_ERR_RO_FAULT  (KVM_PFN_ERR_MASK + 3)
+ 
+ static inline bool is_error_pfn(pfn_t pfn)
+ {
+       return !!(pfn & KVM_PFN_ERR_MASK);
+ }
+ 
+ static inline bool is_noslot_pfn(pfn_t pfn)
+ {
+       return pfn == KVM_PFN_ERR_BAD;
+ }
+ 
+ static inline bool is_invalid_pfn(pfn_t pfn)
+ {
+       return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
+ }
+ 
+ #define KVM_HVA_ERR_BAD               (PAGE_OFFSET)
+ #define KVM_HVA_ERR_RO_BAD    (PAGE_OFFSET + PAGE_SIZE)
+ 
+ static inline bool kvm_is_error_hva(unsigned long addr)
+ {
+       return addr >= PAGE_OFFSET;
+ }
+ 
+ #define KVM_ERR_PTR_BAD_PAGE  (ERR_PTR(-ENOENT))
+ 
+ static inline bool is_error_page(struct page *page)
+ {
+       return IS_ERR(page);
+ }
+ 
   /*
    * vcpu->requests bit members
    */
@@@ -70,7 -119,8 +119,8 @@@
   #define KVM_REQ_PMU               16
   #define KVM_REQ_PMI               17
   
- #define KVM_USERSPACE_IRQ_SOURCE_ID   0
+ #define KVM_USERSPACE_IRQ_SOURCE_ID           0
+ #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID      1
   
   struct kvm;
   struct kvm_vcpu;
@@@ -183,6 -233,18 +233,18 @@@ struct kvm_vcpu 
         } async_pf;
   #endif
   
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+       /*
+        * Cpu relax intercept or pause loop exit optimization
+        * in_spin_loop: set when a vcpu does a pause loop exit
+        *  or cpu relax intercepted.
+        * dy_eligible: indicates whether vcpu is eligible for directed yield.
+        */
+       struct {
+               bool in_spin_loop;
+               bool dy_eligible;
+       } spin_loop;
+ #endif
         struct kvm_vcpu_arch arch;
   };
   
@@@ -201,7 -263,6 +263,6 @@@ struct kvm_memory_slot 
         gfn_t base_gfn;
         unsigned long npages;
         unsigned long flags;
-       unsigned long *rmap;
         unsigned long *dirty_bitmap;
         struct kvm_arch_memory_slot arch;
         unsigned long userspace_addr;
@@@ -283,6 -344,8 +344,8 @@@ struct kvm 
         struct {
                 spinlock_t        lock;
                 struct list_head  items;
+               struct list_head  resampler_list;
+               struct mutex      resampler_lock;
         } irqfds;
         struct list_head ioeventfds;
   #endif
@@@ -348,7 -411,7 +411,7 @@@ static inline struct kvm_vcpu *kvm_get_
   int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
   void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
   
- void vcpu_load(struct kvm_vcpu *vcpu);
+ int __must_check vcpu_load(struct kvm_vcpu *vcpu);
   void vcpu_put(struct kvm_vcpu *vcpu);
   
   int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
@@@ -378,23 -441,6 +441,6 @@@ id_to_memslot(struct kvm_memslots *slot
         return slot;
   }
   
- #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
- #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
- static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
- 
- extern struct page *bad_page;
- extern struct page *fault_page;
- 
- extern pfn_t bad_pfn;
- extern pfn_t fault_pfn;
- 
- int is_error_page(struct page *page);
- int is_error_pfn(pfn_t pfn);
- int is_hwpoison_pfn(pfn_t pfn);
- int is_fault_pfn(pfn_t pfn);
- int is_noslot_pfn(pfn_t pfn);
- int is_invalid_pfn(pfn_t pfn);
- int kvm_is_error_hva(unsigned long addr);
   int kvm_set_memory_region(struct kvm *kvm,
                           struct kvm_userspace_memory_region *mem,
                           int user_alloc);
@@@ -415,28 -461,33 +461,33 @@@ void kvm_arch_commit_memory_region(stru
                                 int user_alloc);
   bool kvm_largepages_enabled(void);
   void kvm_disable_largepages(void);
- void kvm_arch_flush_shadow(struct kvm *kvm);
+ /* flush all memory translations */
+ void kvm_arch_flush_shadow_all(struct kvm *kvm);
+ /* flush memory translations pointing to 'slot' */
+ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot);
   
   int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
                             int nr_pages);
   
   struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
   unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
   void kvm_release_page_clean(struct page *page);
   void kvm_release_page_dirty(struct page *page);
   void kvm_set_page_dirty(struct page *page);
   void kvm_set_page_accessed(struct page *page);
   
- pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
   pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
   pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
                        bool write_fault, bool *writable);
   pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
   pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
                       bool *writable);
- pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-                        struct kvm_memory_slot *slot, gfn_t gfn);
- void kvm_release_pfn_dirty(pfn_t);
+ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+ pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+ 
+ void kvm_release_pfn_dirty(pfn_t pfn);
   void kvm_release_pfn_clean(pfn_t pfn);
   void kvm_set_pfn_dirty(pfn_t pfn);
   void kvm_set_pfn_accessed(pfn_t pfn);
@@@ -494,6 -545,7 +545,7 @@@ int kvm_vm_ioctl_set_memory_region(stru
                                    struct
                                    kvm_userspace_memory_region *mem,
                                    int user_alloc);
+ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
   long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
   
@@@ -573,7 -625,7 +625,7 @@@ void kvm_arch_sync_events(struct kvm *k
   int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
   void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
   
- int kvm_is_mmio_pfn(pfn_t pfn);
+ bool kvm_is_mmio_pfn(pfn_t pfn);
   
   struct kvm_irq_ack_notifier {
         struct hlist_node link;
@@@ -685,7 -737,7 +737,7 @@@ static inline int kvm_deassign_device(s
   static inline void kvm_guest_enter(void)
   {
         BUG_ON(preemptible());
- -      account_system_vtime(current);
+ +      vtime_account(current);
         current->flags |= PF_VCPU;
         /* KVM does not hold any references to rcu protected data when it
          * switches CPU into a guest mode. In fact switching to a guest mode
@@@ -699,7 -751,7 +751,7 @@@
   
   static inline void kvm_guest_exit(void)
   {
- -      account_system_vtime(current);
+ +      vtime_account(current);
         current->flags &= ~PF_VCPU;
   }
   
@@@ -728,6 -780,12 +780,12 @@@ __gfn_to_memslot(struct kvm_memslots *s
         return search_memslots(slots, gfn);
   }
   
+ static inline unsigned long
+ __gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+ }
+ 
   static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
   {
         return gfn_to_memslot(kvm, gfn)->id;
@@@ -740,10 -798,12 +798,12 @@@ static inline gfn_t gfn_to_index(gfn_t 
                 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
   }
   
- static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
-                                              gfn_t gfn)
+ static inline gfn_t
+ hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
   {
-       return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+       gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+ 
+       return slot->base_gfn + gfn_offset;
   }
   
   static inline gpa_t gfn_to_gpa(gfn_t gfn)
@@@ -899,5 -959,32 +959,32 @@@ static inline bool kvm_check_request(in
         }
   }
   
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ 
+ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+ {
+       vcpu->spin_loop.in_spin_loop = val;
+ }
+ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+ {
+       vcpu->spin_loop.dy_eligible = val;
+ }
+ 
+ #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+ 
+ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+ {
+ }
+ 
+ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+ {
+ }
+ 
+ static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+ {
+       return true;
+ }
+ 
+ #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
   #endif
   
diff --combined virt/kvm/eventfd.c

index 67a35e90384c0ae665923bc707c4ebd5fe3c8c22,356965c9d107b694c994c00e40200aa42bb0ee99..9718e98d6d2a84956d113bd43f55d804bd30d049
--- 1/virt/kvm/eventfd.c
--- 2/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@@ -43,6 -43,31 +43,31 @@@
    * --------------------------------------------------------------------
    */
   
+ /*
+  * Resampling irqfds are a special variety of irqfds used to emulate
+  * level triggered interrupts.  The interrupt is asserted on eventfd
+  * trigger.  On acknowledgement through the irq ack notifier, the
+  * interrupt is de-asserted and userspace is notified through the
+  * resamplefd.  All resamplers on the same gsi are de-asserted
+  * together, so we don't need to track the state of each individual
+  * user.  We can also therefore share the same irq source ID.
+  */
+ struct _irqfd_resampler {
+       struct kvm *kvm;
+       /*
+        * List of resampling struct _irqfd objects sharing this gsi.
+        * RCU list modified under kvm->irqfds.resampler_lock
+        */
+       struct list_head list;
+       struct kvm_irq_ack_notifier notifier;
+       /*
+        * Entry in list of kvm->irqfds.resampler_list.  Used for sharing
+        * resamplers among irqfds on the same gsi.
+        * Accessed and modified under kvm->irqfds.resampler_lock
+        */
+       struct list_head link;
+ };
+ 
   struct _irqfd {
         /* Used for MSI fast-path */
         struct kvm *kvm;
@@@ -52,6 -77,12 +77,12 @@@
         /* Used for level IRQ fast-path */
         int gsi;
         struct work_struct inject;
+       /* The resampler used by this irqfd (resampler-only) */
+       struct _irqfd_resampler *resampler;
+       /* Eventfd notified on resample (resampler-only) */
+       struct eventfd_ctx *resamplefd;
+       /* Entry in list of irqfds for a resampler (resampler-only) */
+       struct list_head resampler_link;
         /* Used for setup/shutdown */
         struct eventfd_ctx *eventfd;
         struct list_head list;
@@@ -67,8 -98,58 +98,58 @@@ irqfd_inject(struct work_struct *work
         struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
         struct kvm *kvm = irqfd->kvm;
   
-       kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-       kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+       if (!irqfd->resampler) {
+               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+       } else
+               kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+                           irqfd->gsi, 1);
+ }
+ 
+ /*
+  * Since resampler irqfds share an IRQ source ID, we de-assert once
+  * then notify all of the resampler irqfds using this GSI.  We can't
+  * do multiple de-asserts or we risk racing with incoming re-asserts.
+  */
+ static void
+ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+ {
+       struct _irqfd_resampler *resampler;
+       struct _irqfd *irqfd;
+ 
+       resampler = container_of(kian, struct _irqfd_resampler, notifier);
+ 
+       kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+                   resampler->notifier.gsi, 0);
+ 
+       rcu_read_lock();
+ 
+       list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+               eventfd_signal(irqfd->resamplefd, 1);
+ 
+       rcu_read_unlock();
+ }
+ 
+ static void
+ irqfd_resampler_shutdown(struct _irqfd *irqfd)
+ {
+       struct _irqfd_resampler *resampler = irqfd->resampler;
+       struct kvm *kvm = resampler->kvm;
+ 
+       mutex_lock(&kvm->irqfds.resampler_lock);
+ 
+       list_del_rcu(&irqfd->resampler_link);
+       synchronize_rcu();
+ 
+       if (list_empty(&resampler->list)) {
+               list_del(&resampler->link);
+               kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+               kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+                           resampler->notifier.gsi, 0);
+               kfree(resampler);
+       }
+ 
+       mutex_unlock(&kvm->irqfds.resampler_lock);
   }
   
   /*
@@@ -90,8 -171,13 +171,13 @@@ irqfd_shutdown(struct work_struct *work
          * We know no new events will be scheduled at this point, so block
          * until all previously outstanding events have completed
          */
- -      flush_work_sync(&irqfd->inject);
+ +      flush_work(&irqfd->inject);
   
+       if (irqfd->resampler) {
+               irqfd_resampler_shutdown(irqfd);
+               eventfd_ctx_put(irqfd->resamplefd);
+       }
+ 
         /*
          * It is now safe to release the object's resources
          */
@@@ -203,7 -289,7 +289,7 @@@ kvm_irqfd_assign(struct kvm *kvm, struc
         struct kvm_irq_routing_table *irq_rt;
         struct _irqfd *irqfd, *tmp;
         struct file *file = NULL;
-       struct eventfd_ctx *eventfd = NULL;
+       struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
         int ret;
         unsigned int events;
   
@@@ -231,6 -317,54 +317,54 @@@
   
         irqfd->eventfd = eventfd;
   
+       if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+               struct _irqfd_resampler *resampler;
+ 
+               resamplefd = eventfd_ctx_fdget(args->resamplefd);
+               if (IS_ERR(resamplefd)) {
+                       ret = PTR_ERR(resamplefd);
+                       goto fail;
+               }
+ 
+               irqfd->resamplefd = resamplefd;
+               INIT_LIST_HEAD(&irqfd->resampler_link);
+ 
+               mutex_lock(&kvm->irqfds.resampler_lock);
+ 
+               list_for_each_entry(resampler,
+                                   &kvm->irqfds.resampler_list, list) {
+                       if (resampler->notifier.gsi == irqfd->gsi) {
+                               irqfd->resampler = resampler;
+                               break;
+                       }
+               }
+ 
+               if (!irqfd->resampler) {
+                       resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+                       if (!resampler) {
+                               ret = -ENOMEM;
+                               mutex_unlock(&kvm->irqfds.resampler_lock);
+                               goto fail;
+                       }
+ 
+                       resampler->kvm = kvm;
+                       INIT_LIST_HEAD(&resampler->list);
+                       resampler->notifier.gsi = irqfd->gsi;
+                       resampler->notifier.irq_acked = irqfd_resampler_ack;
+                       INIT_LIST_HEAD(&resampler->link);
+ 
+                       list_add(&resampler->link, &kvm->irqfds.resampler_list);
+                       kvm_register_irq_ack_notifier(kvm,
+                                                     &resampler->notifier);
+                       irqfd->resampler = resampler;
+               }
+ 
+               list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+               synchronize_rcu();
+ 
+               mutex_unlock(&kvm->irqfds.resampler_lock);
+       }
+ 
         /*
          * Install our own custom wake-up handling so we are notified via
          * a callback whenever someone signals the underlying eventfd
@@@ -276,6 -410,12 +410,12 @@@
         return 0;
   
   fail:
+       if (irqfd->resampler)
+               irqfd_resampler_shutdown(irqfd);
+ 
+       if (resamplefd && !IS_ERR(resamplefd))
+               eventfd_ctx_put(resamplefd);
+ 
         if (eventfd && !IS_ERR(eventfd))
                 eventfd_ctx_put(eventfd);
   
@@@ -291,6 -431,8 +431,8 @@@ kvm_eventfd_init(struct kvm *kvm
   {
         spin_lock_init(&kvm->irqfds.lock);
         INIT_LIST_HEAD(&kvm->irqfds.items);
+       INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+       mutex_init(&kvm->irqfds.resampler_lock);
         INIT_LIST_HEAD(&kvm->ioeventfds);
   }
   
@@@ -340,7 -482,7 +482,7 @@@ kvm_irqfd_deassign(struct kvm *kvm, str
   int
   kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
   {
-       if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+       if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                 return -EINVAL;
   
         if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
diff --combined virt/kvm/kvm_main.c

index d617f69131d7667d3847c7e0fc040a717625bf38,cc3f6dc506e43fea81b79e7c5fcb427c8be083da..c353b4599cecdb4db0d3b276e1c12b81974cd2e7
--- 1/virt/kvm/kvm_main.c
--- 2/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -100,13 -100,7 +100,7 @@@ EXPORT_SYMBOL_GPL(kvm_rebooting)
   
   static bool largepages_enabled = true;
   
- static struct page *hwpoison_page;
- static pfn_t hwpoison_pfn;
- 
- struct page *fault_page;
- pfn_t fault_pfn;
- 
- inline int kvm_is_mmio_pfn(pfn_t pfn)
+ bool kvm_is_mmio_pfn(pfn_t pfn)
   {
         if (pfn_valid(pfn)) {
                 int reserved;
@@@ -137,11 -131,12 +131,12 @@@
   /*
    * Switches to specified vcpu, until a matching vcpu_put()
    */
- void vcpu_load(struct kvm_vcpu *vcpu)
+ int vcpu_load(struct kvm_vcpu *vcpu)
   {
         int cpu;
   
-       mutex_lock(&vcpu->mutex);
+       if (mutex_lock_killable(&vcpu->mutex))
+               return -EINTR;
         if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
                 /* The thread running this VCPU changed. */
                 struct pid *oldpid = vcpu->pid;
@@@ -154,6 -149,7 +149,7 @@@
         preempt_notifier_register(&vcpu->preempt_notifier);
         kvm_arch_vcpu_load(vcpu, cpu);
         put_cpu();
+       return 0;
   }
   
   void vcpu_put(struct kvm_vcpu *vcpu)
@@@ -236,6 -232,9 +232,9 @@@ int kvm_vcpu_init(struct kvm_vcpu *vcpu
         }
         vcpu->run = page_address(page);
   
+       kvm_vcpu_set_in_spin_loop(vcpu, false);
+       kvm_vcpu_set_dy_eligible(vcpu, false);
+ 
         r = kvm_arch_vcpu_init(vcpu);
         if (r < 0)
                 goto fail_free_run;
@@@ -332,8 -331,7 +331,7 @@@ static void kvm_mmu_notifier_invalidate
          * count is also read inside the mmu_lock critical section.
          */
         kvm->mmu_notifier_count++;
-       for (; start < end; start += PAGE_SIZE)
-               need_tlb_flush |= kvm_unmap_hva(kvm, start);
+       need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
         need_tlb_flush |= kvm->tlbs_dirty;
         /* we've to flush the tlb before the pages can be freed */
         if (need_tlb_flush)
@@@ -412,7 -410,7 +410,7 @@@ static void kvm_mmu_notifier_release(st
         int idx;
   
         idx = srcu_read_lock(&kvm->srcu);
-       kvm_arch_flush_shadow(kvm);
+       kvm_arch_flush_shadow_all(kvm);
         srcu_read_unlock(&kvm->srcu, idx);
   }
   
@@@ -551,16 -549,12 +549,12 @@@ static void kvm_destroy_dirty_bitmap(st
   static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
                                   struct kvm_memory_slot *dont)
   {
-       if (!dont || free->rmap != dont->rmap)
-               vfree(free->rmap);
- 
         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                 kvm_destroy_dirty_bitmap(free);
   
         kvm_arch_free_memslot(free, dont);
   
         free->npages = 0;
-       free->rmap = NULL;
   }
   
   void kvm_free_physmem(struct kvm *kvm)
@@@ -590,7 -584,7 +584,7 @@@ static void kvm_destroy_vm(struct kvm *
   #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
   #else
-       kvm_arch_flush_shadow(kvm);
+       kvm_arch_flush_shadow_all(kvm);
   #endif
         kvm_arch_destroy_vm(kvm);
         kvm_free_physmem(kvm);
@@@ -686,6 -680,20 +680,20 @@@ void update_memslots(struct kvm_memslot
         slots->generation++;
   }
   
+ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+ {
+       u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+ 
+ #ifdef KVM_CAP_READONLY_MEM
+       valid_flags |= KVM_MEM_READONLY;
+ #endif
+ 
+       if (mem->flags & ~valid_flags)
+               return -EINVAL;
+ 
+       return 0;
+ }
+ 
   /*
    * Allocate some memory and give it an address in the guest physical address
    * space.
@@@ -706,6 -714,10 +714,10 @@@ int __kvm_set_memory_region(struct kvm 
         struct kvm_memory_slot old, new;
         struct kvm_memslots *slots, *old_memslots;
   
+       r = check_memory_region_flags(mem);
+       if (r)
+               goto out;
+ 
         r = -EINVAL;
         /* General sanity checks */
         if (mem->memory_size & (PAGE_SIZE - 1))
@@@ -769,11 -781,7 +781,7 @@@
         if (npages && !old.npages) {
                 new.user_alloc = user_alloc;
                 new.userspace_addr = mem->userspace_addr;
- #ifndef CONFIG_S390
-               new.rmap = vzalloc(npages * sizeof(*new.rmap));
-               if (!new.rmap)
-                       goto out_free;
- #endif /* not defined CONFIG_S390 */
+ 
                 if (kvm_arch_create_memslot(&new, npages))
                         goto out_free;
         }
@@@ -785,7 -793,7 +793,7 @@@
                 /* destroy any largepage mappings for dirty tracking */
         }
   
-       if (!npages) {
+       if (!npages || base_gfn != old.base_gfn) {
                 struct kvm_memory_slot *slot;
   
                 r = -ENOMEM;
@@@ -801,14 -809,14 +809,14 @@@
                 old_memslots = kvm->memslots;
                 rcu_assign_pointer(kvm->memslots, slots);
                 synchronize_srcu_expedited(&kvm->srcu);
-               /* From this point no new shadow pages pointing to a deleted
-                * memslot will be created.
+               /* From this point no new shadow pages pointing to a deleted,
+                * or moved, memslot will be created.
                  *
                  * validation of sp->gfn happens in:
                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
                  *      - kvm_is_visible_gfn (mmu_check_roots)
                  */
-               kvm_arch_flush_shadow(kvm);
+               kvm_arch_flush_shadow_memslot(kvm, slot);
                 kfree(old_memslots);
         }
   
@@@ -832,7 -840,6 +840,6 @@@
   
         /* actual memory is freed via old in kvm_free_physmem_slot below */
         if (!npages) {
-               new.rmap = NULL;
                 new.dirty_bitmap = NULL;
                 memset(&new.arch, 0, sizeof(new.arch));
         }
@@@ -844,13 -851,6 +851,6 @@@
   
         kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
   
-       /*
-        * If the new memory slot is created, we need to clear all
-        * mmio sptes.
-        */
-       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
-               kvm_arch_flush_shadow(kvm);
- 
         kvm_free_physmem_slot(&old, &new);
         kfree(old_memslots);
   
@@@ -932,53 -932,6 +932,6 @@@ void kvm_disable_largepages(void
   }
   EXPORT_SYMBOL_GPL(kvm_disable_largepages);
   
- int is_error_page(struct page *page)
- {
-       return page == bad_page || page == hwpoison_page || page == fault_page;
- }
- EXPORT_SYMBOL_GPL(is_error_page);
- 
- int is_error_pfn(pfn_t pfn)
- {
-       return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
- }
- EXPORT_SYMBOL_GPL(is_error_pfn);
- 
- int is_hwpoison_pfn(pfn_t pfn)
- {
-       return pfn == hwpoison_pfn;
- }
- EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
- 
- int is_fault_pfn(pfn_t pfn)
- {
-       return pfn == fault_pfn;
- }
- EXPORT_SYMBOL_GPL(is_fault_pfn);
- 
- int is_noslot_pfn(pfn_t pfn)
- {
-       return pfn == bad_pfn;
- }
- EXPORT_SYMBOL_GPL(is_noslot_pfn);
- 
- int is_invalid_pfn(pfn_t pfn)
- {
-       return pfn == hwpoison_pfn || pfn == fault_pfn;
- }
- EXPORT_SYMBOL_GPL(is_invalid_pfn);
- 
- static inline unsigned long bad_hva(void)
- {
-       return PAGE_OFFSET;
- }
- 
- int kvm_is_error_hva(unsigned long addr)
- {
-       return addr == bad_hva();
- }
- EXPORT_SYMBOL_GPL(kvm_is_error_hva);
- 
   struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
   {
         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@@ -1021,28 -974,62 +974,62 @@@ out
         return size;
   }
   
- static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
-                                    gfn_t *nr_pages)
+ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+ {
+       return slot->flags & KVM_MEM_READONLY;
+ }
+ 
+ static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+                                      gfn_t *nr_pages, bool write)
   {
         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
-               return bad_hva();
+               return KVM_HVA_ERR_BAD;
+ 
+       if (memslot_is_readonly(slot) && write)
+               return KVM_HVA_ERR_RO_BAD;
   
         if (nr_pages)
                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
   
-       return gfn_to_hva_memslot(slot, gfn);
+       return __gfn_to_hva_memslot(slot, gfn);
   }
   
+ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+                                    gfn_t *nr_pages)
+ {
+       return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+ }
+ 
+ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+                                gfn_t gfn)
+ {
+       return gfn_to_hva_many(slot, gfn, NULL);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+ 
   unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
   {
         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
   }
   EXPORT_SYMBOL_GPL(gfn_to_hva);
   
- static pfn_t get_fault_pfn(void)
+ /*
+  * The hva returned by this function is only allowed to be read.
+  * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
+  */
+ static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
+ {
+       return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+ }
+ 
+ static int kvm_read_hva(void *data, void __user *hva, int len)
   {
-       get_page(fault_page);
-       return fault_pfn;
+       return __copy_from_user(data, hva, len);
+ }
+ 
+ static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+ {
+       return __copy_from_user_inatomic(data, hva, len);
   }
   
   int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@@ -1065,108 -1052,186 +1052,186 @@@ static inline int check_user_page_hwpoi
         return rc == -EHWPOISON;
   }
   
- static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-                       bool *async, bool write_fault, bool *writable)
+ /*
+  * The atomic path to get the writable pfn, which is stored in @pfn.
+  * Returns true on success, false otherwise.
+  */
+ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+                           bool write_fault, bool *writable, pfn_t *pfn)
   {
         struct page *page[1];
-       int npages = 0;
-       pfn_t pfn;
+       int npages;
   
-       /* we can do it either atomically or asynchronously, not both */
-       BUG_ON(atomic && async);
+       if (!(async || atomic))
+               return false;
   
-       BUG_ON(!write_fault && !writable);
+       /*
+        * Fast pin a writable pfn only if it is a write fault request
+        * or the caller allows to map a writable pfn for a read fault
+        * request.
+        */
+       if (!(write_fault || writable))
+               return false;
   
-       if (writable)
-               *writable = true;
+       npages = __get_user_pages_fast(addr, 1, 1, page);
+       if (npages == 1) {
+               *pfn = page_to_pfn(page[0]);
   
-       if (atomic || async)
-               npages = __get_user_pages_fast(addr, 1, 1, page);
+               if (writable)
+                       *writable = true;
+               return true;
+       }
   
-       if (unlikely(npages != 1) && !atomic) {
-               might_sleep();
+       return false;
+ }
   
-               if (writable)
-                       *writable = write_fault;
+ /*
+  * The slow path to get the pfn of the specified host virtual address.
+  * Returns 1 on success, or -errno if an error is detected.
+  */
+ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+                          bool *writable, pfn_t *pfn)
+ {
+       struct page *page[1];
+       int npages = 0;
   
-               if (async) {
-                       down_read(&current->mm->mmap_sem);
-                       npages = get_user_page_nowait(current, current->mm,
-                                                    addr, write_fault, page);
-                       up_read(&current->mm->mmap_sem);
-               } else
-                       npages = get_user_pages_fast(addr, 1, write_fault,
-                                                    page);
- 
-               /* map read fault as writable if possible */
-               if (unlikely(!write_fault) && npages == 1) {
-                       struct page *wpage[1];
- 
-                       npages = __get_user_pages_fast(addr, 1, 1, wpage);
-                       if (npages == 1) {
-                               *writable = true;
-                               put_page(page[0]);
-                               page[0] = wpage[0];
-                       }
-                       npages = 1;
+       might_sleep();
+ 
+       if (writable)
+               *writable = write_fault;
+ 
+       if (async) {
+               down_read(&current->mm->mmap_sem);
+               npages = get_user_page_nowait(current, current->mm,
+                                             addr, write_fault, page);
+               up_read(&current->mm->mmap_sem);
+       } else
+               npages = get_user_pages_fast(addr, 1, write_fault,
+                                            page);
+       if (npages != 1)
+               return npages;
+ 
+       /* map read fault as writable if possible */
+       if (unlikely(!write_fault) && writable) {
+               struct page *wpage[1];
+ 
+               npages = __get_user_pages_fast(addr, 1, 1, wpage);
+               if (npages == 1) {
+                       *writable = true;
+                       put_page(page[0]);
+                       page[0] = wpage[0];
                 }
+ 
+               npages = 1;
         }
+       *pfn = page_to_pfn(page[0]);
+       return npages;
+ }
   
-       if (unlikely(npages != 1)) {
-               struct vm_area_struct *vma;
+ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+ {
+       if (unlikely(!(vma->vm_flags & VM_READ)))
+               return false;
   
-               if (atomic)
-                       return get_fault_pfn();
+       if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+               return false;
   
-               down_read(&current->mm->mmap_sem);
-               if (npages == -EHWPOISON ||
-                       (!async && check_user_page_hwpoison(addr))) {
-                       up_read(&current->mm->mmap_sem);
-                       get_page(hwpoison_page);
-                       return page_to_pfn(hwpoison_page);
-               }
+       return true;
+ }
   
-               vma = find_vma_intersection(current->mm, addr, addr+1);
- 
-               if (vma == NULL)
-                       pfn = get_fault_pfn();
-               else if ((vma->vm_flags & VM_PFNMAP)) {
-                       pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                               vma->vm_pgoff;
-                       BUG_ON(!kvm_is_mmio_pfn(pfn));
-               } else {
-                       if (async && (vma->vm_flags & VM_WRITE))
-                               *async = true;
-                       pfn = get_fault_pfn();
-               }
-               up_read(&current->mm->mmap_sem);
-       } else
-               pfn = page_to_pfn(page[0]);
+ /*
+  * Pin guest page in memory and return its pfn.
+  * @addr: host virtual address which maps memory to the guest
+  * @atomic: whether this function must avoid sleeping
+  * @async: whether this function needs to wait for IO to complete if the
+  *         host page is not in memory
+  * @write_fault: whether we should get a writable host page
+  * @writable: whether mapping a writable host page is allowed for !@write_fault
+  *
+  * The function will map a writable host page for these two cases:
+  * 1): @write_fault = true
+  * 2): @write_fault = false && @writable, @writable will tell the caller
+  *     whether the mapping is writable.
+  */
+ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+                       bool write_fault, bool *writable)
+ {
+       struct vm_area_struct *vma;
+       pfn_t pfn = 0;
+       int npages;
+ 
+       /* we can do it either atomically or asynchronously, not both */
+       BUG_ON(atomic && async);
   
+       if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+               return pfn;
+ 
+       if (atomic)
+               return KVM_PFN_ERR_FAULT;
+ 
+       npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+       if (npages == 1)
+               return pfn;
+ 
+       down_read(&current->mm->mmap_sem);
+       if (npages == -EHWPOISON ||
+             (!async && check_user_page_hwpoison(addr))) {
+               pfn = KVM_PFN_ERR_HWPOISON;
+               goto exit;
+       }
+ 
+       vma = find_vma_intersection(current->mm, addr, addr + 1);
+ 
+       if (vma == NULL)
+               pfn = KVM_PFN_ERR_FAULT;
+       else if ((vma->vm_flags & VM_PFNMAP)) {
+               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+               BUG_ON(!kvm_is_mmio_pfn(pfn));
+       } else {
+               if (async && vma_is_valid(vma, write_fault))
+                       *async = true;
+               pfn = KVM_PFN_ERR_FAULT;
+       }
+ exit:
+       up_read(&current->mm->mmap_sem);
         return pfn;
   }
   
- pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+ static pfn_t
+ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+                    bool *async, bool write_fault, bool *writable)
   {
-       return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+       unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+ 
+       if (addr == KVM_HVA_ERR_RO_BAD)
+               return KVM_PFN_ERR_RO_FAULT;
+ 
+       if (kvm_is_error_hva(addr))
+               return KVM_PFN_ERR_BAD;
+ 
+       /* Do not map writable pfn in the readonly memslot. */
+       if (writable && memslot_is_readonly(slot)) {
+               *writable = false;
+               writable = NULL;
+       }
+ 
+       return hva_to_pfn(addr, atomic, async, write_fault,
+                         writable);
   }
- EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
   
   static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
                           bool write_fault, bool *writable)
   {
-       unsigned long addr;
+       struct kvm_memory_slot *slot;
   
         if (async)
                 *async = false;
   
-       addr = gfn_to_hva(kvm, gfn);
-       if (kvm_is_error_hva(addr)) {
-               get_page(bad_page);
-               return page_to_pfn(bad_page);
-       }
+       slot = gfn_to_memslot(kvm, gfn);
   
-       return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+       return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+                                   writable);
   }
   
   pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@@ -1195,12 -1260,16 +1260,16 @@@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, 
   }
   EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
   
- pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-                        struct kvm_memory_slot *slot, gfn_t gfn)
+ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+       return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+ }
+ 
+ pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
   {
-       unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-       return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+       return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
   }
+ EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
   
   int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
                                                                   int nr_pages)
@@@ -1219,30 -1288,42 +1288,42 @@@
   }
   EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
   
+ static struct page *kvm_pfn_to_page(pfn_t pfn)
+ {
+       if (is_error_pfn(pfn))
+               return KVM_ERR_PTR_BAD_PAGE;
+ 
+       if (kvm_is_mmio_pfn(pfn)) {
+               WARN_ON(1);
+               return KVM_ERR_PTR_BAD_PAGE;
+       }
+ 
+       return pfn_to_page(pfn);
+ }
+ 
   struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
   {
         pfn_t pfn;
   
         pfn = gfn_to_pfn(kvm, gfn);
-       if (!kvm_is_mmio_pfn(pfn))
-               return pfn_to_page(pfn);
- 
-       WARN_ON(kvm_is_mmio_pfn(pfn));
   
-       get_page(bad_page);
-       return bad_page;
+       return kvm_pfn_to_page(pfn);
   }
   
   EXPORT_SYMBOL_GPL(gfn_to_page);
   
   void kvm_release_page_clean(struct page *page)
   {
+       WARN_ON(is_error_page(page));
+ 
         kvm_release_pfn_clean(page_to_pfn(page));
   }
   EXPORT_SYMBOL_GPL(kvm_release_page_clean);
   
   void kvm_release_pfn_clean(pfn_t pfn)
   {
+       WARN_ON(is_error_pfn(pfn));
+ 
         if (!kvm_is_mmio_pfn(pfn))
                 put_page(pfn_to_page(pfn));
   }
@@@ -1250,6 -1331,8 +1331,8 @@@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean
   
   void kvm_release_page_dirty(struct page *page)
   {
+       WARN_ON(is_error_page(page));
+ 
         kvm_release_pfn_dirty(page_to_pfn(page));
   }
   EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@@ -1305,10 -1388,10 +1388,10 @@@ int kvm_read_guest_page(struct kvm *kvm
         int r;
         unsigned long addr;
   
-       addr = gfn_to_hva(kvm, gfn);
+       addr = gfn_to_hva_read(kvm, gfn);
         if (kvm_is_error_hva(addr))
                 return -EFAULT;
-       r = __copy_from_user(data, (void __user *)addr + offset, len);
+       r = kvm_read_hva(data, (void __user *)addr + offset, len);
         if (r)
                 return -EFAULT;
         return 0;
@@@ -1343,11 -1426,11 +1426,11 @@@ int kvm_read_guest_atomic(struct kvm *k
         gfn_t gfn = gpa >> PAGE_SHIFT;
         int offset = offset_in_page(gpa);
   
-       addr = gfn_to_hva(kvm, gfn);
+       addr = gfn_to_hva_read(kvm, gfn);
         if (kvm_is_error_hva(addr))
                 return -EFAULT;
         pagefault_disable();
-       r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+       r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
         pagefault_enable();
         if (r)
                 return -EFAULT;
@@@ -1580,6 -1663,43 +1663,43 @@@ bool kvm_vcpu_yield_to(struct kvm_vcpu 
   }
   EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
   
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ /*
+  * Helper that checks whether a VCPU is eligible for directed yield.
+  * Most eligible candidate to yield is decided by following heuristics:
+  *
+  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+  *  (preempted lock holder), indicated by @in_spin_loop.
+  *  Set at the beginning and cleared at the end of interception/PLE handler.
+  *
+  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+  *  chance last time (mostly it has become eligible now since we have probably
+  *  yielded to lockholder in last iteration. This is done by toggling
+  *  @dy_eligible each time a VCPU checked for eligibility.)
+  *
+  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+  *  to preempted lock-holder could result in wrong VCPU selection and CPU
+  *  burning. Giving priority for a potential lock-holder increases lock
+  *  progress.
+  *
+  *  Since algorithm is based on heuristics, accessing another VCPU data without
+  *  locking does not harm. It may result in trying to yield to same VCPU, fail
+  *  and continue with next VCPU and so on.
+  */
+ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+ {
+       bool eligible;
+ 
+       eligible = !vcpu->spin_loop.in_spin_loop ||
+                       (vcpu->spin_loop.in_spin_loop &&
+                        vcpu->spin_loop.dy_eligible);
+ 
+       if (vcpu->spin_loop.in_spin_loop)
+               kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+ 
+       return eligible;
+ }
+ #endif
   void kvm_vcpu_on_spin(struct kvm_vcpu *me)
   {
         struct kvm *kvm = me->kvm;
@@@ -1589,6 -1709,7 +1709,7 @@@
         int pass;
         int i;
   
+       kvm_vcpu_set_in_spin_loop(me, true);
         /*
          * We boost the priority of a VCPU that is runnable but not
          * currently running, because it got preempted by something
@@@ -1607,6 -1728,8 +1728,8 @@@
                                 continue;
                         if (waitqueue_active(&vcpu->wq))
                                 continue;
+                       if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+                               continue;
                         if (kvm_vcpu_yield_to(vcpu)) {
                                 kvm->last_boosted_vcpu = i;
                                 yielded = 1;
@@@ -1614,6 -1737,10 +1737,10 @@@
                         }
                 }
         }
+       kvm_vcpu_set_in_spin_loop(me, false);
+ 
+       /* Ensure vcpu is not eligible during next spinloop */
+       kvm_vcpu_set_dy_eligible(me, false);
   }
   EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
   
@@@ -1766,7 -1893,9 +1893,9 @@@ static long kvm_vcpu_ioctl(struct file 
   #endif
   
   
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       if (r)
+               return r;
         switch (ioctl) {
         case KVM_RUN:
                 r = -EINVAL;
@@@ -1976,10 -2105,9 +2105,10 @@@ static long kvm_vcpu_compat_ioctl(struc
                         if (copy_from_user(&csigset, sigmask_arg->sigset,
                                            sizeof csigset))
                                 goto out;
- -              }
- -              sigset_from_compat(&sigset, &csigset);
- -              r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ +                      sigset_from_compat(&sigset, &csigset);
+ +                      r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ +              } else
+ +                      r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
                 break;
         }
         default:
@@@ -2093,6 -2221,29 +2222,29 @@@ static long kvm_vm_ioctl(struct file *f
                 r = kvm_send_userspace_msi(kvm, &msi);
                 break;
         }
+ #endif
+ #ifdef __KVM_HAVE_IRQ_LINE
+       case KVM_IRQ_LINE_STATUS:
+       case KVM_IRQ_LINE: {
+               struct kvm_irq_level irq_event;
+ 
+               r = -EFAULT;
+               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+                       goto out;
+ 
+               r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+               if (r)
+                       goto out;
+ 
+               r = -EFAULT;
+               if (ioctl == KVM_IRQ_LINE_STATUS) {
+                       if (copy_to_user(argp, &irq_event, sizeof irq_event))
+                               goto out;
+               }
+ 
+               r = 0;
+               break;
+       }
   #endif
         default:
                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
@@@ -2698,9 -2849,6 +2850,6 @@@ static struct syscore_ops kvm_syscore_o
         .resume = kvm_resume,
   };
   
- struct page *bad_page;
- pfn_t bad_pfn;
- 
   static inline
   struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
   {
@@@ -2732,33 -2880,6 +2881,6 @@@ int kvm_init(void *opaque, unsigned vcp
         if (r)
                 goto out_fail;
   
-       bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- 
-       if (bad_page == NULL) {
-               r = -ENOMEM;
-               goto out;
-       }
- 
-       bad_pfn = page_to_pfn(bad_page);
- 
-       hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- 
-       if (hwpoison_page == NULL) {
-               r = -ENOMEM;
-               goto out_free_0;
-       }
- 
-       hwpoison_pfn = page_to_pfn(hwpoison_page);
- 
-       fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- 
-       if (fault_page == NULL) {
-               r = -ENOMEM;
-               goto out_free_0;
-       }
- 
-       fault_pfn = page_to_pfn(fault_page);
- 
         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                 r = -ENOMEM;
                 goto out_free_0;
@@@ -2833,12 -2954,6 +2955,6 @@@ out_free_1
   out_free_0a:
         free_cpumask_var(cpus_hardware_enabled);
   out_free_0:
-       if (fault_page)
-               __free_page(fault_page);
-       if (hwpoison_page)
-               __free_page(hwpoison_page);
-       __free_page(bad_page);
- out:
         kvm_arch_exit();
   out_fail:
         return r;
@@@ -2858,8 -2973,5 +2974,5 @@@ void kvm_exit(void
         kvm_arch_hardware_unsetup();
         kvm_arch_exit();
         free_cpumask_var(cpus_hardware_enabled);
-       __free_page(fault_page);
-       __free_page(hwpoison_page);
-       __free_page(bad_page);
   }
   EXPORT_SYMBOL_GPL(kvm_exit);
author	Linus Torvalds <[email protected]>
	Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
committer	Linus Torvalds <[email protected]>
	Thu, 4 Oct 2012 16:30:33 +0000 (09:30 -0700)
		1	2
arch/s390/include/asm/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kernel/dis.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kvm/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kvm/priv.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/kvm.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
virt/kvm/eventfd.c	patch \|	diff1 \|	diff2 \|	blob \| history
virt/kvm/kvm_main.c	patch \|	diff1 \|	diff2 \|	blob \| history