Merge remote-tracking branch 'torvalds/master' into perf/core

author Arnaldo Carvalho de Melo <[email protected]>
Tue, 26 Nov 2019 14:06:19 +0000 (11:06 -0300)
committer Arnaldo Carvalho de Melo <[email protected]>
Tue, 26 Nov 2019 14:06:19 +0000 (11:06 -0300)

diff --combined MAINTAINERS

index 81dd8f902bdcb1f52cc2405e403995f97464bd0b,8f075b866aaf6a4d9b4b88be1926a3c8d15f4736..2c7aa547d8b86275fd9f51c10619d8a0408cacdd
--- 1/MAINTAINERS
--- 2/MAINTAINERS
+++ b/MAINTAINERS
@@@ -643,7 -643,7 +643,7 @@@ F: drivers/net/ethernet/alacritech/
   
   FORCEDETH GIGABIT ETHERNET DRIVER
   M:    Rain River <[email protected]>
- M:    Zhu Yanjun <yanjun.zhu@oracle.com>
+ M:    Zhu Yanjun <zyjzyj2000@gmail.com>
   L:    [email protected]
   S:    Maintained
   F:    drivers/net/ethernet/nvidia/*
@@@ -682,11 -682,11 +682,11 @@@ S:      Maintaine
   F:    Documentation/devicetree/bindings/opp/sun50i-nvmem-cpufreq.txt
   F:    drivers/cpufreq/sun50i-cpufreq-nvmem.c
   
- ALLWINNER SECURITY SYSTEM
+ ALLWINNER CRYPTO DRIVERS
   M:    Corentin Labbe <[email protected]>
   L:    [email protected]
   S:    Maintained
- F:    drivers/crypto/sunxi-ss/
+ F:    drivers/crypto/allwinner/
   
   ALLWINNER VPU DRIVER
   M:    Maxime Ripard <[email protected]>
@@@ -1182,14 -1182,21 +1182,21 @@@ S:   Maintaine
   F:    drivers/media/i2c/aptina-pll.*
   
   AQUANTIA ETHERNET DRIVER (atlantic)
- M:    Igor Russkikh <igor.russkikh@aquantia.com>
+ M:    Igor Russkikh <irusskikh@marvell.com>
   L:    [email protected]
   S:    Supported
- W:    http://www.aquantia.com
+ W:    https://www.marvell.com/
   Q:    http://patchwork.ozlabs.org/project/netdev/list/
   F:    drivers/net/ethernet/aquantia/atlantic/
   F:    Documentation/networking/device_drivers/aquantia/atlantic.txt
   
+ AQUANTIA ETHERNET DRIVER PTP SUBSYSTEM
+ M:    Egor Pomozov <[email protected]>
+ L:    [email protected]
+ S:    Supported
+ W:    http://www.aquantia.com
+ F:    drivers/net/ethernet/aquantia/atlantic/aq_ptp*
+ 
   ARC FRAMEBUFFER DRIVER
   M:    Jaya Kumar <[email protected]>
   S:    Maintained
@@@ -1470,6 -1477,14 +1477,14 @@@ F:    drivers/soc/amlogic
   F:    drivers/rtc/rtc-meson*
   N:    meson
   
+ ARM/Amlogic Meson SoC Crypto Drivers
+ M:    Corentin Labbe <[email protected]>
+ L:    [email protected]
+ L:    [email protected]
+ S:    Maintained
+ F:    drivers/crypto/amlogic/
+ F:    Documentation/devicetree/bindings/crypto/amlogic*
+ 
   ARM/Amlogic Meson SoC Sound Drivers
   M:    Jerome Brunet <[email protected]>
   L:    [email protected] (moderated for non-subscribers)
@@@ -2611,6 -2626,7 +2626,7 @@@ S:      Maintaine
   F:    arch/arm64/
   X:    arch/arm64/boot/dts/
   F:    Documentation/arm64/
+ F:    tools/testing/selftests/arm64/
   
   AS3645A LED FLASH CONTROLLER DRIVER
   M:    Sakari Ailus <[email protected]>
@@@ -3595,6 -3611,13 +3611,13 @@@ S:    Maintaine
   F:    Documentation/devicetree/bindings/media/cdns,*.txt
   F:    drivers/media/platform/cadence/cdns-csi2*
   
+ CADENCE NAND DRIVER
+ M:    Piotr Sroka <[email protected]>
+ L:    [email protected]
+ S:    Maintained
+ F:    drivers/mtd/nand/raw/cadence-nand-controller.c
+ F:    Documentation/devicetree/bindings/mtd/cadence-nand-controller.txt
+ 
   CADET FM/AM RADIO RECEIVER DRIVER
   M:    Hans Verkuil <[email protected]>
   L:    [email protected]
@@@ -5046,10 -5069,14 +5069,14 @@@ M:   Ioana Radulescu <ruxandra.radulescu@
   L:    [email protected]
   S:    Maintained
   F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-eth*
+ F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-mac*
   F:    drivers/net/ethernet/freescale/dpaa2/dpni*
+ F:    drivers/net/ethernet/freescale/dpaa2/dpmac*
   F:    drivers/net/ethernet/freescale/dpaa2/dpkg.h
   F:    drivers/net/ethernet/freescale/dpaa2/Makefile
   F:    drivers/net/ethernet/freescale/dpaa2/Kconfig
+ F:    Documentation/networking/device_drivers/freescale/dpaa2/ethernet-driver.rst
+ F:    Documentation/networking/device_drivers/freescale/dpaa2/mac-phy-support.rst
   
   DPAA2 ETHERNET SWITCH DRIVER
   M:    Ioana Radulescu <[email protected]>
@@@ -6143,10 -6170,12 +6170,12 @@@ S:   Maintaine
   F:    Documentation/ABI/testing/sysfs-class-net-phydev
   F:    Documentation/devicetree/bindings/net/ethernet-phy.yaml
   F:    Documentation/devicetree/bindings/net/mdio*
+ F:    Documentation/devicetree/bindings/net/qca,ar803x.yaml
   F:    Documentation/networking/phy.rst
   F:    drivers/net/phy/
   F:    drivers/of/of_mdio.c
   F:    drivers/of/of_net.c
+ F:    include/dt-bindings/net/qca-ar803x.h
   F:    include/linux/*mdio*.h
   F:    include/linux/of_net.h
   F:    include/linux/phy.h
@@@ -7364,6 -7393,25 +7393,25 @@@ F:    include/uapi/linux/if_hippi.
   F:    net/802/hippi.c
   F:    drivers/net/hippi/
   
+ HISILICON SECURITY ENGINE V2 DRIVER (SEC2)
+ M:    Zaibo Xu <[email protected]>
+ L:    [email protected]
+ S:    Maintained
+ F:    drivers/crypto/hisilicon/sec2/sec_crypto.c
+ F:    drivers/crypto/hisilicon/sec2/sec_main.c
+ F:    drivers/crypto/hisilicon/sec2/sec_crypto.h
+ F:    drivers/crypto/hisilicon/sec2/sec.h
+ F:    Documentation/ABI/testing/debugfs-hisi-sec
+ 
+ HISILICON HIGH PERFORMANCE RSA ENGINE DRIVER (HPRE)
+ M:    Zaibo Xu <[email protected]>
+ L:    [email protected]
+ S:    Maintained
+ F:    drivers/crypto/hisilicon/hpre/hpre_crypto.c
+ F:    drivers/crypto/hisilicon/hpre/hpre_main.c
+ F:    drivers/crypto/hisilicon/hpre/hpre.h
+ F:    Documentation/ABI/testing/debugfs-hisi-hpre
+ 
   HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
   M:    Yisen Zhuang <[email protected]>
   M:    Salil Mehta <[email protected]>
@@@ -7372,6 -7420,11 +7420,11 @@@ W:    http://www.hisilicon.co
   S:    Maintained
   F:    drivers/net/ethernet/hisilicon/hns3/
   
+ HISILICON TRUE RANDOM NUMBER GENERATOR V2 SUPPORT
+ M:    Zaibo Xu <[email protected]>
+ S:    Maintained
+ F:    drivers/char/hw_random/hisi-trng-v2.c
+ 
   HISILICON LPC BUS DRIVER
   M:    [email protected]
   W:    http://www.hisilicon.com
@@@ -7417,7 -7470,6 +7470,6 @@@ S:      Maintaine
   F:    drivers/crypto/hisilicon/qm.c
   F:    drivers/crypto/hisilicon/qm.h
   F:    drivers/crypto/hisilicon/sgl.c
- F:    drivers/crypto/hisilicon/sgl.h
   F:    drivers/crypto/hisilicon/zip/
   F:    Documentation/ABI/testing/debugfs-hisi-zip
   
@@@ -7443,8 -7495,8 +7495,8 @@@ F:      drivers/platform/x86/tc1100-wmi.
   
   HP100:        Driver for HP 10/100 Mbit/s Voice Grade Network Adapter Series
   M:    Jaroslav Kysela <[email protected]>
- S:    Maintained
- F:    drivers/net/ethernet/hp/hp100.*
+ S:    Obsolete
+ F:    drivers/staging/hp/hp100.*
   
   HPET: High Precision Event Timers driver
   M:    Clemens Ladisch <[email protected]>
@@@ -7729,7 -7781,7 +7781,7 @@@ F:      drivers/i2c/i2c-stub.
   
   I3C SUBSYSTEM
   M:    Boris Brezillon <[email protected]>
- L:    [email protected]
+ L:    [email protected] (moderated for non-subscribers)
   C:    irc://chat.freenode.net/linux-i3c
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git
   S:    Maintained
@@@ -7745,6 -7797,12 +7797,12 @@@ S:    Maintaine
   F:    Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt
   F:    drivers/i3c/master/dw*
   
+ I3C DRIVER FOR CADENCE I3C MASTER IP
+ M:      Przemysław Gaj <[email protected]>
+ S:      Maintained
+ F:      Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
+ F:      drivers/i3c/master/i3c-master-cdns.c
+ 
   IA64 (Itanium) PLATFORM
   M:    Tony Luck <[email protected]>
   M:    Fenghua Yu <[email protected]>
@@@ -8564,12 -8622,13 +8622,13 @@@ F:   include/linux/iova.
   
   IO_URING
   M:    Jens Axboe <[email protected]>
- L:    [email protected]
- L:    [email protected]
+ L:    [email protected]
   T:    git git://git.kernel.dk/linux-block
   T:    git git://git.kernel.dk/liburing
   S:    Maintained
   F:    fs/io_uring.c
+ F:    fs/io-wq.c
+ F:    fs/io-wq.h
   F:    include/uapi/linux/io_uring.h
   
   IPMI SUBSYSTEM
@@@ -8920,6 -8979,17 +8979,17 @@@ S:    Maintaine
   F:    tools/testing/selftests/
   F:    Documentation/dev-tools/kselftest*
   
+ KERNEL UNIT TESTING FRAMEWORK (KUnit)
+ M:    Brendan Higgins <[email protected]>
+ L:    [email protected]
+ L:    [email protected]
+ W:    https://google.github.io/kunit-docs/third_party/kernel/docs/
+ S:    Maintained
+ F:    Documentation/dev-tools/kunit/
+ F:    include/kunit/
+ F:    lib/kunit/
+ F:    tools/testing/kunit/
+ 
   KERNEL USERMODE HELPER
   M:    Luis Chamberlain <[email protected]>
   L:    [email protected]
@@@ -9497,6 -9567,13 +9567,13 @@@ F:    Documentation/misc-devices/lis3lv02d
   F:    drivers/misc/lis3lv02d/
   F:    drivers/platform/x86/hp_accel.c
   
+ LIST KUNIT TEST
+ M:    David Gow <[email protected]>
+ L:    [email protected]
+ L:    [email protected]
+ S:    Maintained
+ F:    lib/list-test.c
+ 
   LIVE PATCHING
   M:    Josh Poimboeuf <[email protected]>
   M:    Jiri Kosina <[email protected]>
@@@ -9740,6 -9817,7 +9817,7 @@@ S:      Maintaine
   F:    drivers/net/dsa/mv88e6xxx/
   F:    include/linux/platform_data/mv88e6xxx.h
   F:    Documentation/devicetree/bindings/net/dsa/marvell.txt
+ F:    Documentation/networking/devlink-params-mv88e6xxx.txt
   
   MARVELL ARMADA DRM SUPPORT
   M:    Russell King <[email protected]>
@@@ -10536,15 -10614,13 +10614,13 @@@ F:        include/linux/vmalloc.
   F:    mm/
   
   MEMORY TECHNOLOGY DEVICES (MTD)
- M:    David Woodhouse <[email protected]>
- M:    Brian Norris <[email protected]>
- M:    Marek Vasut <[email protected]>
   M:    Miquel Raynal <[email protected]>
   M:    Richard Weinberger <[email protected]>
   M:    Vignesh Raghavendra <[email protected]>
   L:    [email protected]
   W:    http://www.linux-mtd.infradead.org/
   Q:    http://patchwork.ozlabs.org/project/linux-mtd/list/
+ C:    irc://irc.oftc.net/mtd
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd/fixes
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd/next
   S:    Maintained
@@@ -10821,6 -10897,7 +10897,7 @@@ M:   Microchip Linux Driver Support <UNGL
   L:    [email protected]
   S:    Supported
   F:    drivers/net/ethernet/mscc/
+ F:    include/soc/mscc/ocelot*
   
   MICROSOFT SURFACE PRO 3 BUTTON DRIVER
   M:    Chen Yu <[email protected]>
@@@ -10875,18 -10952,18 +10952,18 @@@ F:        arch/mips/include/asm/mach-loongson3
   F:    drivers/*/*loongson1*
   F:    drivers/*/*/*loongson1*
   
- MIPS/LOONGSON2 ARCHITECTURE
+ MIPS/LOONGSON2EF ARCHITECTURE
   M:    Jiaxun Yang <[email protected]>
   L:    [email protected]
   S:    Maintained
- F:    arch/mips/loongson64/fuloong-2e/
- F:    arch/mips/loongson64/lemote-2f/
- F:    arch/mips/include/asm/mach-loongson64/
+ F:    arch/mips/loongson2ef/
+ F:    arch/mips/include/asm/mach-loongson2ef/
   F:    drivers/*/*loongson2*
   F:    drivers/*/*/*loongson2*
   
- MIPS/LOONGSON3 ARCHITECTURE
+ MIPS/LOONGSON64 ARCHITECTURE
   M:    Huacai Chen <[email protected]>
+ M:    Jiaxun Yang <[email protected]>
   L:    [email protected]
   S:    Maintained
   F:    arch/mips/loongson64/
@@@ -11637,6 -11714,7 +11714,7 @@@ F:   drivers/nvme/target/fcloop.
   NVM EXPRESS TARGET DRIVER
   M:    Christoph Hellwig <[email protected]>
   M:    Sagi Grimberg <[email protected]>
+ M:    Chaitanya Kulkarni <[email protected]>
   L:    [email protected]
   T:    git://git.infradead.org/nvme.git
   W:    http://git.infradead.org/nvme.git
@@@ -12778,13 -12856,6 +12856,13 @@@ F: arch/*/events/
   F:    arch/*/events/*/*
   F:    tools/perf/
   
+ +PERFORMANCE EVENTS SUBSYSTEM ARM64 PMU EVENTS
+ +R:    John Garry <[email protected]>
+ +R:    Will Deacon <[email protected]>
+ +L:    [email protected] (moderated for non-subscribers)
+ +S:    Supported
+ +F:    tools/perf/pmu-events/arch/arm64/
+ +
   PERSONALITY HANDLING
   M:    Christoph Hellwig <[email protected]>
   L:    [email protected]
@@@ -12842,6 -12913,7 +12920,7 @@@ S:   Maintaine
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
   F:    samples/pidfd/
   F:    tools/testing/selftests/pidfd/
+ F:    tools/testing/selftests/clone3/
   K:    (?i)pidfd
   K:    (?i)clone3
   K:    \b(clone_args|kernel_clone_args)\b
@@@ -13139,12 -13211,14 +13218,14 @@@ F:        Documentation/filesystems/proc.tx
   PROC SYSCTL
   M:    Luis Chamberlain <[email protected]>
   M:    Kees Cook <[email protected]>
+ M:    Iurii Zaikin <[email protected]>
   L:    [email protected]
   L:    [email protected]
   S:    Maintained
   F:    fs/proc/proc_sysctl.c
   F:    include/linux/sysctl.h
   F:    kernel/sysctl.c
+ F:    kernel/sysctl-test.c
   F:    tools/testing/selftests/sysctl/
   
   PS3 NETWORK SUPPORT
@@@ -13828,7 -13902,7 +13909,7 @@@ R:   Sergei Shtylyov <sergei.shtylyov@cog
   L:    [email protected]
   L:    [email protected]
   F:    Documentation/devicetree/bindings/net/renesas,*.txt
- F:    Documentation/devicetree/bindings/net/sh_eth.txt
+ F:    Documentation/devicetree/bindings/net/renesas,*.yaml
   F:    drivers/net/ethernet/renesas/
   F:    include/linux/sh_eth.h
   
@@@ -15301,7 -15375,6 +15382,6 @@@ F:   arch/arm/boot/dts/spear
   F:    arch/arm/mach-spear/
   
   SPI NOR SUBSYSTEM
- M:    Marek Vasut <[email protected]>
   M:    Tudor Ambarus <[email protected]>
   L:    [email protected]
   W:    http://www.linux-mtd.infradead.org/
@@@ -16598,10 -16671,9 +16678,9 @@@ F:  drivers/media/pci/tw686x
   
   UBI FILE SYSTEM (UBIFS)
   M:    Richard Weinberger <[email protected]>
- M:    Artem Bityutskiy <[email protected]>
- M:    Adrian Hunter <[email protected]>
   L:    [email protected]
- T:    git git://git.infradead.org/ubifs-2.6.git
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git next
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git fixes
   W:    http://www.linux-mtd.infradead.org/doc/ubifs.html
   S:    Supported
   F:    Documentation/filesystems/ubifs.txt
@@@ -16716,11 -16788,11 +16795,11 @@@ S:        Maintaine
   F:    drivers/scsi/ufs/ufs-mediatek*
   
   UNSORTED BLOCK IMAGES (UBI)
- M:    Artem Bityutskiy <[email protected]>
   M:    Richard Weinberger <[email protected]>
   W:    http://www.linux-mtd.infradead.org/
   L:    [email protected]
- T:    git git://git.infradead.org/ubifs-2.6.git
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git next
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git fixes
   S:    Supported
   F:    drivers/mtd/ubi/
   F:    include/linux/mtd/ubi.h
@@@ -17222,6 -17294,7 +17301,7 @@@ F:   virt/lib
   
   VIRTIO AND VHOST VSOCK DRIVER
   M:    Stefan Hajnoczi <[email protected]>
+ M:    Stefano Garzarella <[email protected]>
   L:    [email protected]
   L:    [email protected]
   L:    [email protected]
@@@ -17353,6 -17426,14 +17433,14 @@@ S: Maintaine
   F:    drivers/input/serio/userio.c
   F:    include/uapi/linux/userio.h
   
+ VITESSE FELIX ETHERNET SWITCH DRIVER
+ M:    Vladimir Oltean <[email protected]>
+ M:    Claudiu Manoil <[email protected]>
+ L:    [email protected]
+ S:    Maintained
+ F:    drivers/net/dsa/ocelot/*
+ F:    net/dsa/tag_ocelot.c
+ 
   VIVID VIRTUAL VIDEO DRIVER
   M:    Hans Verkuil <[email protected]>
   L:    [email protected]
@@@ -17453,6 -17534,18 +17541,18 @@@ S: Maintaine
   F:    drivers/net/vrf.c
   F:    Documentation/networking/vrf.txt
   
+ VSPRINTF
+ M:    Petr Mladek <[email protected]>
+ M:    Steven Rostedt <[email protected]>
+ M:    Sergey Senozhatsky <[email protected]>
+ R:    Andy Shevchenko <[email protected]>
+ R:    Rasmus Villemoes <[email protected]>
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk.git
+ S:    Maintained
+ F:    lib/vsprintf.c
+ F:    lib/test_printf.c
+ F:    Documentation/core-api/printk-formats.rst
+ 
   VT1211 HARDWARE MONITOR DRIVER
   M:    Juerg Haefliger <[email protected]>
   L:    [email protected]
diff --combined arch/x86/events/intel/core.c

index dc64b16e6b719b0d8f89227b17ec965bb2dc4586,937363b803c19d3a1c90b84a22eeec2969d08be1..3be51aa06e67ec2e5b893af82096b43db5c3f23a
--- 1/arch/x86/events/intel/core.c
--- 2/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@@ -3315,17 -3315,27 +3315,28 @@@ static int intel_pmu_hw_config(struct p
         if (x86_pmu.version < 3)
                 return -EINVAL;
   
- -      if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- -              return -EACCES;
+ +      ret = perf_allow_cpu(&event->attr);
+ +      if (ret)
+ +              return ret;
   
         event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
   
         return 0;
   }
   
+ #ifdef CONFIG_RETPOLINE
+ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
+ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
+ #endif
+ 
   struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
   {
+ #ifdef CONFIG_RETPOLINE
+       if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
+               return intel_guest_get_msrs(nr);
+       else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
+               return core_guest_get_msrs(nr);
+ #endif
         if (x86_pmu.guest_get_msrs)
                 return x86_pmu.guest_get_msrs(nr);
         *nr = 0;
@@@ -3820,12 -3830,6 +3831,12 @@@ static void intel_pmu_sched_task(struc
         intel_pmu_lbr_sched_task(ctx, sched_in);
   }
   
+ +static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
+ +                                  struct perf_event_context *next)
+ +{
+ +      intel_pmu_lbr_swap_task_ctx(prev, next);
+ +}
+ +
   static int intel_pmu_check_period(struct perf_event *event, u64 value)
   {
         return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
@@@ -3961,7 -3965,6 +3972,7 @@@ static __initconst const struct x86_pm
   
         .guest_get_msrs         = intel_guest_get_msrs,
         .sched_task             = intel_pmu_sched_task,
+ +      .swap_task_ctx          = intel_pmu_swap_task_ctx,
   
         .check_period           = intel_pmu_check_period,
   
diff --combined arch/x86/kvm/x86.c

index 783aa8d141bfa4da0fcc045282da0e5769307dbe,3ed167e039e54f7089d88705ec1718d4d4840d8c..cf917139de6ba272f01aac132d9f0c8b3f2b2c2b
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -68,7 -68,6 +68,7 @@@
   #include <asm/mshyperv.h>
   #include <asm/hypervisor.h>
   #include <asm/intel_pt.h>
+ +#include <asm/emulate_prefix.h>
   #include <clocksource/hyperv_timer.h>
   
   #define CREATE_TRACE_POINTS
@@@ -177,6 -176,8 +177,8 @@@ struct kvm_shared_msrs 
   static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
   static struct kvm_shared_msrs __percpu *shared_msrs;
   
+ static u64 __read_mostly host_xss;
+ 
   struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "pf_fixed", VCPU_STAT(pf_fixed) },
         { "pf_guest", VCPU_STAT(pf_guest) },
@@@ -261,23 -262,6 +263,6 @@@ static void kvm_on_user_return(struct u
         }
   }
   
- static void shared_msr_update(unsigned slot, u32 msr)
- {
-       u64 value;
-       unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
- 
-       /* only read, and nobody should modify it at this time,
-        * so don't need lock */
-       if (slot >= shared_msrs_global.nr) {
-               printk(KERN_ERR "kvm: invalid MSR slot!");
-               return;
-       }
-       rdmsrl_safe(msr, &value);
-       smsr->values[slot].host = value;
-       smsr->values[slot].curr = value;
- }
- 
   void kvm_define_shared_msr(unsigned slot, u32 msr)
   {
         BUG_ON(slot >= KVM_NR_SHARED_MSRS);
@@@ -289,10 -273,16 +274,16 @@@ EXPORT_SYMBOL_GPL(kvm_define_shared_msr
   
   static void kvm_shared_msr_cpu_online(void)
   {
-       unsigned i;
+       unsigned int cpu = smp_processor_id();
+       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       u64 value;
+       int i;
   
-       for (i = 0; i < shared_msrs_global.nr; ++i)
-               shared_msr_update(i, shared_msrs_global.msrs[i]);
+       for (i = 0; i < shared_msrs_global.nr; ++i) {
+               rdmsrl_safe(shared_msrs_global.msrs[i], &value);
+               smsr->values[i].host = value;
+               smsr->values[i].curr = value;
+       }
   }
   
   int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
@@@ -301,13 -291,14 +292,14 @@@
         struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
         int err;
   
-       if (((value ^ smsr->values[slot].curr) & mask) == 0)
+       value = (value & mask) | (smsr->values[slot].host & ~mask);
+       if (value == smsr->values[slot].curr)
                 return 0;
-       smsr->values[slot].curr = value;
         err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
         if (err)
                 return 1;
   
+       smsr->values[slot].curr = value;
         if (!smsr->registered) {
                 smsr->urn.on_user_return = kvm_on_user_return;
                 user_return_notifier_register(&smsr->urn);
@@@ -710,10 -701,8 +702,8 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
         ret = 1;
   
         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
-       __set_bit(VCPU_EXREG_PDPTR,
-                 (unsigned long *)&vcpu->arch.regs_avail);
-       __set_bit(VCPU_EXREG_PDPTR,
-                 (unsigned long *)&vcpu->arch.regs_dirty);
+       kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ 
   out:
   
         return ret;
@@@ -723,7 -712,6 +713,6 @@@ EXPORT_SYMBOL_GPL(load_pdptrs)
   bool pdptrs_changed(struct kvm_vcpu *vcpu)
   {
         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
-       bool changed = true;
         int offset;
         gfn_t gfn;
         int r;
@@@ -731,8 -719,7 +720,7 @@@
         if (!is_pae_paging(vcpu))
                 return false;
   
-       if (!test_bit(VCPU_EXREG_PDPTR,
-                     (unsigned long *)&vcpu->arch.regs_avail))
+       if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
                 return true;
   
         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
@@@ -740,11 -727,9 +728,9 @@@
         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
         if (r < 0)
-               goto out;
-       changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
- out:
+               return true;
   
-       return changed;
+       return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
   }
   EXPORT_SYMBOL_GPL(pdptrs_changed);
   
@@@ -813,27 -798,34 +799,34 @@@ void kvm_lmsw(struct kvm_vcpu *vcpu, un
   }
   EXPORT_SYMBOL_GPL(kvm_lmsw);
   
- void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
   {
-       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
-                       !vcpu->guest_xcr0_loaded) {
-               /* kvm_set_xcr() also depends on this */
+       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ 
                 if (vcpu->arch.xcr0 != host_xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
-               vcpu->guest_xcr0_loaded = 1;
+ 
+               if (vcpu->arch.xsaves_enabled &&
+                   vcpu->arch.ia32_xss != host_xss)
+                       wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
         }
   }
- EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+ EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
   
- void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
   {
-       if (vcpu->guest_xcr0_loaded) {
+       if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ 
                 if (vcpu->arch.xcr0 != host_xcr0)
                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
-               vcpu->guest_xcr0_loaded = 0;
+ 
+               if (vcpu->arch.xsaves_enabled &&
+                   vcpu->arch.ia32_xss != host_xss)
+                       wrmsrl(MSR_IA32_XSS, host_xss);
         }
+ 
   }
- EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+ EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
   
   static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
   {
@@@ -985,7 -977,7 +978,7 @@@ int kvm_set_cr3(struct kvm_vcpu *vcpu, 
   
         kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
         vcpu->arch.cr3 = cr3;
-       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
   
         return 0;
   }
@@@ -1314,23 -1306,15 +1307,15 @@@ static u64 kvm_get_arch_capabilities(vo
                 data |= ARCH_CAP_MDS_NO;
   
         /*
-        * On TAA affected systems, export MDS_NO=0 when:
-        *      - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
-        *      - Updated microcode is present. This is detected by
-        *        the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
-        *        that VERW clears CPU buffers.
-        *
-        * When MDS_NO=0 is exported, guests deploy clear CPU buffer
-        * mitigation and don't complain:
-        *
-        *      "Vulnerable: Clear CPU buffers attempted, no microcode"
-        *
-        * If TSX is disabled on the system, guests are also mitigated against
-        * TAA and clear CPU buffer mitigation is not required for guests.
+        * On TAA affected systems:
+        *      - nothing to do if TSX is disabled on the host.
+        *      - we emulate TSX_CTRL if present on the host.
+        *        This lets the guest use VERW to clear CPU buffers.
          */
-       if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
-           (data & ARCH_CAP_TSX_CTRL_MSR))
-               data &= ~ARCH_CAP_MDS_NO;
+       if (!boot_cpu_has(X86_FEATURE_RTM))
+               data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
+       else if (!boot_cpu_has_bug(X86_BUG_TAA))
+               data |= ARCH_CAP_TAA_NO;
   
         return data;
   }
@@@ -1478,8 -1462,8 +1463,8 @@@ static int __kvm_set_msr(struct kvm_vcp
    * Returns 0 on success, non-0 otherwise.
    * Assumes vcpu_load() was already called.
    */
- static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
-                        bool host_initiated)
+ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+                 bool host_initiated)
   {
         struct msr_data msr;
         int ret;
@@@ -1554,20 -1538,25 +1539,25 @@@ static int do_set_msr(struct kvm_vcpu *
   }
   
   #ifdef CONFIG_X86_64
+ struct pvclock_clock {
+       int vclock_mode;
+       u64 cycle_last;
+       u64 mask;
+       u32 mult;
+       u32 shift;
+ };
+ 
   struct pvclock_gtod_data {
         seqcount_t      seq;
   
-       struct { /* extract of a clocksource struct */
-               int vclock_mode;
-               u64     cycle_last;
-               u64     mask;
-               u32     mult;
-               u32     shift;
-       } clock;
+       struct pvclock_clock clock; /* extract of a clocksource struct */
+       struct pvclock_clock raw_clock; /* extract of a clocksource struct */
   
+       u64             boot_ns_raw;
         u64             boot_ns;
         u64             nsec_base;
         u64             wall_time_sec;
+       u64             monotonic_raw_nsec;
   };
   
   static struct pvclock_gtod_data pvclock_gtod_data;
@@@ -1575,9 -1564,10 +1565,10 @@@
   static void update_pvclock_gtod(struct timekeeper *tk)
   {
         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
-       u64 boot_ns;
+       u64 boot_ns, boot_ns_raw;
   
         boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
+       boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot));
   
         write_seqcount_begin(&vdata->seq);
   
@@@ -1588,11 -1578,20 +1579,20 @@@
         vdata->clock.mult               = tk->tkr_mono.mult;
         vdata->clock.shift              = tk->tkr_mono.shift;
   
+       vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->archdata.vclock_mode;
+       vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
+       vdata->raw_clock.mask           = tk->tkr_raw.mask;
+       vdata->raw_clock.mult           = tk->tkr_raw.mult;
+       vdata->raw_clock.shift          = tk->tkr_raw.shift;
+ 
         vdata->boot_ns                  = boot_ns;
         vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
   
         vdata->wall_time_sec            = tk->xtime_sec;
   
+       vdata->boot_ns_raw              = boot_ns_raw;
+       vdata->monotonic_raw_nsec       = tk->tkr_raw.xtime_nsec;
+ 
         write_seqcount_end(&vdata->seq);
   }
   #endif
@@@ -2016,21 -2015,21 +2016,21 @@@ static u64 read_tsc(void
         return last;
   }
   
- static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+                         int *mode)
   {
         long v;
-       struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
         u64 tsc_pg_val;
   
-       switch (gtod->clock.vclock_mode) {
+       switch (clock->vclock_mode) {
         case VCLOCK_HVCLOCK:
                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
                                                   tsc_timestamp);
                 if (tsc_pg_val != U64_MAX) {
                         /* TSC page valid */
                         *mode = VCLOCK_HVCLOCK;
-                       v = (tsc_pg_val - gtod->clock.cycle_last) &
-                               gtod->clock.mask;
+                       v = (tsc_pg_val - clock->cycle_last) &
+                               clock->mask;
                 } else {
                         /* TSC page invalid */
                         *mode = VCLOCK_NONE;
@@@ -2039,8 -2038,8 +2039,8 @@@
         case VCLOCK_TSC:
                 *mode = VCLOCK_TSC;
                 *tsc_timestamp = read_tsc();
-               v = (*tsc_timestamp - gtod->clock.cycle_last) &
-                       gtod->clock.mask;
+               v = (*tsc_timestamp - clock->cycle_last) &
+                       clock->mask;
                 break;
         default:
                 *mode = VCLOCK_NONE;
@@@ -2049,10 -2048,10 +2049,10 @@@
         if (*mode == VCLOCK_NONE)
                 *tsc_timestamp = v = 0;
   
-       return v * gtod->clock.mult;
+       return v * clock->mult;
   }
   
- static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
   {
         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
         unsigned long seq;
@@@ -2061,10 -2060,10 +2061,10 @@@
   
         do {
                 seq = read_seqcount_begin(&gtod->seq);
-               ns = gtod->nsec_base;
-               ns += vgettsc(tsc_timestamp, &mode);
+               ns = gtod->monotonic_raw_nsec;
+               ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
                 ns >>= gtod->clock.shift;
-               ns += gtod->boot_ns;
+               ns += gtod->boot_ns_raw;
         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
         *t = ns;
   
@@@ -2082,7 -2081,7 +2082,7 @@@ static int do_realtime(struct timespec6
                 seq = read_seqcount_begin(&gtod->seq);
                 ts->tv_sec = gtod->wall_time_sec;
                 ns = gtod->nsec_base;
-               ns += vgettsc(tsc_timestamp, &mode);
+               ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
                 ns >>= gtod->clock.shift;
         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
   
@@@ -2099,7 -2098,7 +2099,7 @@@ static bool kvm_get_time_and_clockread(
         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
                 return false;
   
-       return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+       return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
                                                       tsc_timestamp));
   }
   
@@@ -2722,6 -2721,20 +2722,20 @@@ int kvm_set_msr_common(struct kvm_vcpu 
         case MSR_IA32_TSC:
                 kvm_write_tsc(vcpu, msr_info);
                 break;
+       case MSR_IA32_XSS:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+                       return 1;
+               /*
+                * We do support PT if kvm_x86_ops->pt_supported(), but we do
+                * not support IA32_XSS[bit 8]. Guests will have to use
+                * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
+                * MSRs.
+                */
+               if (data != 0)
+                       return 1;
+               vcpu->arch.ia32_xss = data;
+               break;
         case MSR_SMI_COUNT:
                 if (!msr_info->host_initiated)
                         return 1;
@@@ -3049,6 -3062,12 +3063,12 @@@ int kvm_get_msr_common(struct kvm_vcpu 
         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
                                    msr_info->host_initiated);
+       case MSR_IA32_XSS:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+                       return 1;
+               msr_info->data = vcpu->arch.ia32_xss;
+               break;
         case MSR_K7_CLK_CTL:
                 /*
                  * Provide expected ramp-up count for K7. All other
@@@ -3826,12 -3845,13 +3846,13 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
                                 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                         else
                                 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-                       if (lapic_in_kernel(vcpu)) {
-                               if (events->smi.latched_init)
-                                       set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
-                               else
-                                       clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
-                       }
+               }
+ 
+               if (lapic_in_kernel(vcpu)) {
+                       if (events->smi.latched_init)
+                               set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+                       else
+                               clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                 }
         }
   
@@@ -4422,6 -4442,7 +4443,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
         case KVM_SET_NESTED_STATE: {
                 struct kvm_nested_state __user *user_kvm_nested_state = argp;
                 struct kvm_nested_state kvm_state;
+               int idx;
   
                 r = -EINVAL;
                 if (!kvm_x86_ops->set_nested_state)
@@@ -4445,7 -4466,9 +4467,9 @@@
                     && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
                         break;
   
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
                 r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
                 break;
         }
         case KVM_GET_SUPPORTED_HV_CPUID: {
@@@ -4947,9 -4970,6 +4971,6 @@@ set_identity_unlock
                 if (!irqchip_kernel(kvm))
                         goto set_irqchip_out;
                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
-               if (r)
-                       goto set_irqchip_out;
-               r = 0;
         set_irqchip_out:
                 kfree(chip);
                 break;
@@@ -5472,7 -5492,6 +5493,7 @@@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_
   
   int handle_ud(struct kvm_vcpu *vcpu)
   {
+ +      static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
         int emul_type = EMULTYPE_TRAP_UD;
         char sig[5]; /* ud2; .ascii "kvm" */
         struct x86_exception e;
@@@ -5480,7 -5499,7 +5501,7 @@@
         if (force_emulation_prefix &&
             kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
                                 sig, sizeof(sig), &e) == 0 &&
- -          memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ +          memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
                 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
                 emul_type = EMULTYPE_TRAP_UD_FORCED;
         }
@@@ -6138,7 -6157,7 +6159,7 @@@ static void emulator_set_smbase(struct 
   static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
                               u32 pmc)
   {
-       return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+       return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
   }
   
   static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@@ -7868,6 -7887,19 +7889,19 @@@ static void process_smi(struct kvm_vcp
         kvm_make_request(KVM_REQ_EVENT, vcpu);
   }
   
+ void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+                                      unsigned long *vcpu_bitmap)
+ {
+       cpumask_var_t cpus;
+ 
+       zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+ 
+       kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
+                                   vcpu_bitmap, cpus);
+ 
+       free_cpumask_var(cpus);
+ }
+ 
   void kvm_make_scan_ioapic_request(struct kvm *kvm)
   {
         kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@@ -7945,7 -7977,6 +7979,6 @@@ void kvm_vcpu_reload_apic_access_page(s
          */
         put_page(page);
   }
- EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
   
   void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
   {
@@@ -8704,8 -8735,12 +8737,12 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str
             mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                 goto out;
   
-       /* INITs are latched while in SMM */
-       if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+       /*
+        * KVM_MP_STATE_INIT_RECEIVED means the processor is in
+        * INIT state; latched init should be reported using
+        * KVM_SET_VCPU_EVENTS, so reject it here.
+        */
+       if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
             (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
              mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
                 goto out;
@@@ -8797,7 -8832,7 +8834,7 @@@ static int __set_sregs(struct kvm_vcpu 
         vcpu->arch.cr2 = sregs->cr2;
         mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
         vcpu->arch.cr3 = sregs->cr3;
-       __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+       kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
   
         kvm_set_cr8(vcpu, sregs->cr8);
   
@@@ -9324,6 -9359,9 +9361,9 @@@ int kvm_arch_hardware_setup(void
                 kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
         }
   
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               rdmsrl(MSR_IA32_XSS, host_xss);
+ 
         kvm_init_msr_list();
         return 0;
   }
@@@ -9377,7 -9415,7 +9417,7 @@@ int kvm_arch_vcpu_init(struct kvm_vcpu 
                 goto fail_free_pio_data;
   
         if (irqchip_in_kernel(vcpu->kvm)) {
-               vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
+               vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm);
                 r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
                 if (r < 0)
                         goto fail_mmu_destroy;
@@@ -9446,7 -9484,13 +9486,13 @@@ void kvm_arch_vcpu_uninit(struct kvm_vc
   
   void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
   {
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ 
         vcpu->arch.l1tf_flush_l1d = true;
+       if (pmu->version && unlikely(pmu->event_count)) {
+               pmu->need_cleanup = true;
+               kvm_make_request(KVM_REQ_PMU, vcpu);
+       }
         kvm_x86_ops->sched_in(vcpu, cpu);
   }
   
diff --combined include/linux/perf_event.h

index 34c7c69100265c9bfaf59470c04f52ba5c634292,a07bfdb7d8ea139792bb37f0ce042393acaadeee..6d4c22aee38483a34362e3d56c3c22108645fe1f
--- 1/include/linux/perf_event.h
--- 2/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@@ -56,7 -56,6 +56,7 @@@ struct perf_guest_info_callbacks 
   #include <linux/perf_regs.h>
   #include <linux/cgroup.h>
   #include <linux/refcount.h>
+ +#include <linux/security.h>
   #include <asm/local.h>
   
   struct perf_callchain_entry {
@@@ -249,8 -248,6 +249,8 @@@ struct perf_event
   #define PERF_PMU_CAP_NO_EXCLUDE                       0x80
   #define PERF_PMU_CAP_AUX_OUTPUT                       0x100
   
+ +struct perf_output_handle;
+ +
   /**
    * struct pmu - generic performance monitoring unit
    */
@@@ -412,15 -409,6 +412,15 @@@ struct pmu 
          */
         size_t                          task_ctx_size;
   
+ +      /*
+ +       * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
+ +       * can be synchronized using this function. See Intel LBR callstack support
+ +       * implementation and Perf core context switch handling callbacks for usage
+ +       * examples.
+ +       */
+ +      void (*swap_task_ctx)           (struct perf_event_context *prev,
+ +                                       struct perf_event_context *next);
+ +                                      /* optional */
   
         /*
          * Set up pmu-private data structures for an AUX area
@@@ -434,19 -422,6 +434,19 @@@
          */
         void (*free_aux)                (void *aux); /* optional */
   
+ +      /*
+ +       * Take a snapshot of the AUX buffer without touching the event
+ +       * state, so that preempting ->start()/->stop() callbacks does
+ +       * not interfere with their logic. Called in PMI context.
+ +       *
+ +       * Returns the size of AUX data copied to the output handle.
+ +       *
+ +       * Optional.
+ +       */
+ +      long (*snapshot_aux)            (struct perf_event *event,
+ +                                       struct perf_output_handle *handle,
+ +                                       unsigned long size);
+ +
         /*
          * Validate address range filters: make sure the HW supports the
          * requested configuration and number of filters; return 0 if the
@@@ -746,9 -721,6 +746,9 @@@ struct perf_event 
         struct perf_cgroup              *cgrp; /* cgroup event is attach to */
   #endif
   
+ +#ifdef CONFIG_SECURITY
+ +      void *security;
+ +#endif
         struct list_head                sb_list;
   #endif /* CONFIG_PERF_EVENTS */
   };
@@@ -988,7 -960,6 +988,7 @@@ struct perf_sample_data 
                 u32     reserved;
         }                               cpu_entry;
         struct perf_callchain_entry     *callchain;
+ +      u64                             aux_size;
   
         /*
          * regs_user may point to task_pt_regs or to regs_user_copy, depending
@@@ -1270,41 -1241,19 +1270,41 @@@ extern int perf_cpu_time_max_percent_ha
   int perf_event_max_stack_handler(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp, loff_t *ppos);
   
- -static inline bool perf_paranoid_tracepoint_raw(void)
+ +/* Access to perf_event_open(2) syscall. */
+ +#define PERF_SECURITY_OPEN            0
+ +
+ +/* Finer grained perf_event_open(2) access control. */
+ +#define PERF_SECURITY_CPU             1
+ +#define PERF_SECURITY_KERNEL          2
+ +#define PERF_SECURITY_TRACEPOINT      3
+ +
+ +static inline int perf_is_paranoid(void)
   {
         return sysctl_perf_event_paranoid > -1;
   }
   
- -static inline bool perf_paranoid_cpu(void)
+ +static inline int perf_allow_kernel(struct perf_event_attr *attr)
   {
- -      return sysctl_perf_event_paranoid > 0;
+ +      if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+ +              return -EACCES;
+ +
+ +      return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
   }
   
- -static inline bool perf_paranoid_kernel(void)
+ +static inline int perf_allow_cpu(struct perf_event_attr *attr)
   {
- -      return sysctl_perf_event_paranoid > 1;
+ +      if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+ +              return -EACCES;
+ +
+ +      return security_perf_event_open(attr, PERF_SECURITY_CPU);
+ +}
+ +
+ +static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
+ +{
+ +      if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+ +              return -EPERM;
+ +
+ +      return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
   }
   
   extern void perf_event_init(void);
@@@ -1378,9 -1327,6 +1378,9 @@@ extern unsigned int perf_output_copy(st
                              const void *buf, unsigned int len);
   extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                      unsigned int len);
+ +extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ +                               struct perf_output_handle *handle,
+ +                               unsigned long from, unsigned long to);
   extern int perf_swevent_get_recursion_context(void);
   extern void perf_swevent_put_recursion_context(int rctx);
   extern u64 perf_swevent_set_period(struct perf_event *event);
@@@ -1390,6 -1336,8 +1390,8 @@@ extern void perf_event_disable_local(st
   extern void perf_event_disable_inatomic(struct perf_event *event);
   extern void perf_event_task_tick(void);
   extern int perf_event_account_interrupt(struct perf_event *event);
+ extern int perf_event_period(struct perf_event *event, u64 value);
+ extern u64 perf_event_pause(struct perf_event *event, bool reset);
   #else /* !CONFIG_PERF_EVENTS: */
   static inline void *
   perf_aux_output_begin(struct perf_output_handle *handle,
@@@ -1469,6 -1417,14 +1471,14 @@@ static inline void perf_event_disable(s
   static inline int __perf_event_disable(void *info)                    { return -1; }
   static inline void perf_event_task_tick(void)                         { }
   static inline int perf_event_release_kernel(struct perf_event *event) { return 0; }
+ static inline int perf_event_period(struct perf_event *event, u64 value)
+ {
+       return -EINVAL;
+ }
+ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
+ {
+       return 0;
+ }
   #endif
   
   #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --combined kernel/events/core.c

index 059ee711600843261ac8cae416eb8370bdb43a71,5de0b801bc7bc189645f8163f3ba06158418668a..4ff86d57f9e5309905e7e5cffe5a1eb875b45e30
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -1941,11 -1941,6 +1941,11 @@@ static void perf_put_aux_event(struct p
         }
   }
   
+ +static bool perf_need_aux_event(struct perf_event *event)
+ +{
+ +      return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+ +}
+ +
   static int perf_get_aux_event(struct perf_event *event,
                               struct perf_event *group_leader)
   {
@@@ -1958,17 -1953,7 +1958,17 @@@
         if (!group_leader)
                 return 0;
   
- -      if (!perf_aux_output_match(event, group_leader))
+ +      /*
+ +       * aux_output and aux_sample_size are mutually exclusive.
+ +       */
+ +      if (event->attr.aux_output && event->attr.aux_sample_size)
+ +              return 0;
+ +
+ +      if (event->attr.aux_output &&
+ +          !perf_aux_output_match(event, group_leader))
+ +              return 0;
+ +
+ +      if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
                 return 0;
   
         if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@@ -2681,25 -2666,6 +2681,25 @@@ perf_install_in_context(struct perf_eve
          */
         smp_store_release(&event->ctx, ctx);
   
+ +      /*
+ +       * perf_event_attr::disabled events will not run and can be initialized
+ +       * without IPI. Except when this is the first event for the context, in
+ +       * that case we need the magic of the IPI to set ctx->is_active.
+ +       *
+ +       * The IOC_ENABLE that is sure to follow the creation of a disabled
+ +       * event will issue the IPI and reprogram the hardware.
+ +       */
+ +      if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ +              raw_spin_lock_irq(&ctx->lock);
+ +              if (ctx->task == TASK_TOMBSTONE) {
+ +                      raw_spin_unlock_irq(&ctx->lock);
+ +                      return;
+ +              }
+ +              add_event_to_ctx(event, ctx);
+ +              raw_spin_unlock_irq(&ctx->lock);
+ +              return;
+ +      }
+ +
         if (!task) {
                 cpu_function_call(cpu, __perf_install_in_context, event);
                 return;
@@@ -3238,21 -3204,10 +3238,21 @@@ static void perf_event_context_sched_ou
                 raw_spin_lock(&ctx->lock);
                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                 if (context_equiv(ctx, next_ctx)) {
+ +                      struct pmu *pmu = ctx->pmu;
+ +
                         WRITE_ONCE(ctx->task, next);
                         WRITE_ONCE(next_ctx->task, task);
   
- -                      swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ +                      /*
+ +                       * PMU specific parts of task perf context can require
+ +                       * additional synchronization. As an example of such
+ +                       * synchronization see implementation details of Intel
+ +                       * LBR call stack data profiling;
+ +                       */
+ +                      if (pmu->swap_task_ctx)
+ +                              pmu->swap_task_ctx(ctx, next_ctx);
+ +                      else
+ +                              swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
   
                         /*
                          * RCU_INIT_POINTER here is safe because we've not
@@@ -4274,9 -4229,8 +4274,9 @@@ find_get_context(struct pmu *pmu, struc
   
         if (!task) {
                 /* Must be root to operate on a CPU event: */
- -              if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- -                      return ERR_PTR(-EACCES);
+ +              err = perf_allow_cpu(&event->attr);
+ +              if (err)
+ +                      return ERR_PTR(err);
   
                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                 ctx = &cpuctx->ctx;
@@@ -4585,8 -4539,6 +4585,8 @@@ static void _free_event(struct perf_eve
   
         unaccount_event(event);
   
+ +      security_perf_event_free(event);
+ +
         if (event->rb) {
                 /*
                  * Can happen when we close an event with re-directed output.
@@@ -5040,10 -4992,6 +5040,10 @@@ perf_read(struct file *file, char __use
         struct perf_event_context *ctx;
         int ret;
   
+ +      ret = security_perf_event_read(event);
+ +      if (ret)
+ +              return ret;
+ +
         ctx = perf_event_ctx_lock(event);
         ret = __perf_read(event, buf, count);
         perf_event_ctx_unlock(event, ctx);
@@@ -5081,6 -5029,24 +5081,24 @@@ static void _perf_event_reset(struct pe
         perf_event_update_userpage(event);
   }
   
+ /* Assume it's not an event with inherit set. */
+ u64 perf_event_pause(struct perf_event *event, bool reset)
+ {
+       struct perf_event_context *ctx;
+       u64 count;
+ 
+       ctx = perf_event_ctx_lock(event);
+       WARN_ON_ONCE(event->attr.inherit);
+       _perf_event_disable(event);
+       count = local64_read(&event->count);
+       if (reset)
+               local64_set(&event->count, 0);
+       perf_event_ctx_unlock(event, ctx);
+ 
+       return count;
+ }
+ EXPORT_SYMBOL_GPL(perf_event_pause);
+ 
   /*
    * Holding the top-level event's child_mutex means that any
    * descendant process that has inherited this event will block
@@@ -5158,16 -5124,11 +5176,11 @@@ static int perf_event_check_period(stru
         return event->pmu->check_period(event, value);
   }
   
- static int perf_event_period(struct perf_event *event, u64 __user *arg)
+ static int _perf_event_period(struct perf_event *event, u64 value)
   {
-       u64 value;
- 
         if (!is_sampling_event(event))
                 return -EINVAL;
   
-       if (copy_from_user(&value, arg, sizeof(value)))
-               return -EFAULT;
- 
         if (!value)
                 return -EINVAL;
   
@@@ -5185,6 -5146,19 +5198,19 @@@
         return 0;
   }
   
+ int perf_event_period(struct perf_event *event, u64 value)
+ {
+       struct perf_event_context *ctx;
+       int ret;
+ 
+       ctx = perf_event_ctx_lock(event);
+       ret = _perf_event_period(event, value);
+       perf_event_ctx_unlock(event, ctx);
+ 
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(perf_event_period);
+ 
   static const struct file_operations perf_fops;
   
   static inline int perf_fget_light(int fd, struct fd *p)
@@@ -5228,8 -5202,14 +5254,14 @@@ static long _perf_ioctl(struct perf_eve
                 return _perf_event_refresh(event, arg);
   
         case PERF_EVENT_IOC_PERIOD:
-               return perf_event_period(event, (u64 __user *)arg);
+       {
+               u64 value;
   
+               if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
+                       return -EFAULT;
+ 
+               return _perf_event_period(event, value);
+       }
         case PERF_EVENT_IOC_ID:
         {
                 u64 id = primary_event_id(event);
@@@ -5308,11 -5288,6 +5340,11 @@@ static long perf_ioctl(struct file *fil
         struct perf_event_context *ctx;
         long ret;
   
+ +      /* Treat ioctl like writes as it is likely a mutating operation. */
+ +      ret = security_perf_event_write(event);
+ +      if (ret)
+ +              return ret;
+ +
         ctx = perf_event_ctx_lock(event);
         ret = _perf_ioctl(event, cmd, arg);
         perf_event_ctx_unlock(event, ctx);
@@@ -5664,8 -5639,10 +5696,8 @@@ static void perf_mmap_close(struct vm_a
                 perf_pmu_output_stop(event);
   
                 /* now it's safe to free the pages */
- -              if (!rb->aux_mmap_locked)
- -                      atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- -              else
- -                      atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
+ +              atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
+ +              atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
   
                 /* this has to be the last one */
                 rb_free_aux(rb);
@@@ -5776,10 -5753,6 +5808,10 @@@ static int perf_mmap(struct file *file
         if (!(vma->vm_flags & VM_SHARED))
                 return -EINVAL;
   
+ +      ret = security_perf_event_read(event);
+ +      if (ret)
+ +              return ret;
+ +
         vma_size = vma->vm_end - vma->vm_start;
   
         if (vma->vm_pgoff == 0) {
@@@ -5886,7 -5859,13 +5918,7 @@@ accounting
   
         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
   
- -      if (user_locked <= user_lock_limit) {
- -              /* charge all to locked_vm */
- -      } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
- -              /* charge all to pinned_vm */
- -              extra = user_extra;
- -              user_extra = 0;
- -      } else {
+ +      if (user_locked > user_lock_limit) {
                 /*
                  * charge locked_vm until it hits user_lock_limit;
                  * charge the rest from pinned_vm
@@@ -5899,7 -5878,7 +5931,7 @@@
         lock_limit >>= PAGE_SHIFT;
         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
   
- -      if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ +      if ((locked > lock_limit) && perf_is_paranoid() &&
                 !capable(CAP_IPC_LOCK)) {
                 ret = -EPERM;
                 goto unlock;
@@@ -6229,122 -6208,6 +6261,122 @@@ perf_output_sample_ustack(struct perf_o
         }
   }
   
+ +static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ +                                        struct perf_sample_data *data,
+ +                                        size_t size)
+ +{
+ +      struct perf_event *sampler = event->aux_event;
+ +      struct ring_buffer *rb;
+ +
+ +      data->aux_size = 0;
+ +
+ +      if (!sampler)
+ +              goto out;
+ +
+ +      if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ +              goto out;
+ +
+ +      if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ +              goto out;
+ +
+ +      rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ +      if (!rb)
+ +              goto out;
+ +
+ +      /*
+ +       * If this is an NMI hit inside sampling code, don't take
+ +       * the sample. See also perf_aux_sample_output().
+ +       */
+ +      if (READ_ONCE(rb->aux_in_sampling)) {
+ +              data->aux_size = 0;
+ +      } else {
+ +              size = min_t(size_t, size, perf_aux_size(rb));
+ +              data->aux_size = ALIGN(size, sizeof(u64));
+ +      }
+ +      ring_buffer_put(rb);
+ +
+ +out:
+ +      return data->aux_size;
+ +}
+ +
+ +long perf_pmu_snapshot_aux(struct ring_buffer *rb,
+ +                         struct perf_event *event,
+ +                         struct perf_output_handle *handle,
+ +                         unsigned long size)
+ +{
+ +      unsigned long flags;
+ +      long ret;
+ +
+ +      /*
+ +       * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ +       * paths. If we start calling them in NMI context, they may race with
+ +       * the IRQ ones, that is, for example, re-starting an event that's just
+ +       * been stopped, which is why we're using a separate callback that
+ +       * doesn't change the event state.
+ +       *
+ +       * IRQs need to be disabled to prevent IPIs from racing with us.
+ +       */
+ +      local_irq_save(flags);
+ +      /*
+ +       * Guard against NMI hits inside the critical section;
+ +       * see also perf_prepare_sample_aux().
+ +       */
+ +      WRITE_ONCE(rb->aux_in_sampling, 1);
+ +      barrier();
+ +
+ +      ret = event->pmu->snapshot_aux(event, handle, size);
+ +
+ +      barrier();
+ +      WRITE_ONCE(rb->aux_in_sampling, 0);
+ +      local_irq_restore(flags);
+ +
+ +      return ret;
+ +}
+ +
+ +static void perf_aux_sample_output(struct perf_event *event,
+ +                                 struct perf_output_handle *handle,
+ +                                 struct perf_sample_data *data)
+ +{
+ +      struct perf_event *sampler = event->aux_event;
+ +      unsigned long pad;
+ +      struct ring_buffer *rb;
+ +      long size;
+ +
+ +      if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ +              return;
+ +
+ +      rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ +      if (!rb)
+ +              return;
+ +
+ +      size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+ +
+ +      /*
+ +       * An error here means that perf_output_copy() failed (returned a
+ +       * non-zero surplus that it didn't copy), which in its current
+ +       * enlightened implementation is not possible. If that changes, we'd
+ +       * like to know.
+ +       */
+ +      if (WARN_ON_ONCE(size < 0))
+ +              goto out_put;
+ +
+ +      /*
+ +       * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ +       * perf_prepare_sample_aux(), so should not be more than that.
+ +       */
+ +      pad = data->aux_size - size;
+ +      if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ +              pad = 8;
+ +
+ +      if (pad) {
+ +              u64 zero = 0;
+ +              perf_output_copy(handle, &zero, pad);
+ +      }
+ +
+ +out_put:
+ +      ring_buffer_put(rb);
+ +}
+ +
   static void __perf_event_header__init_id(struct perf_event_header *header,
                                          struct perf_sample_data *data,
                                          struct perf_event *event)
@@@ -6664,13 -6527,6 +6696,13 @@@ void perf_output_sample(struct perf_out
         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                 perf_output_put(handle, data->phys_addr);
   
+ +      if (sample_type & PERF_SAMPLE_AUX) {
+ +              perf_output_put(handle, data->aux_size);
+ +
+ +              if (data->aux_size)
+ +                      perf_aux_sample_output(event, handle, data);
+ +      }
+ +
         if (!event->attr.watermark) {
                 int wakeup_events = event->attr.wakeup_events;
   
@@@ -6859,35 -6715,6 +6891,35 @@@ void perf_prepare_sample(struct perf_ev
   
         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                 data->phys_addr = perf_virt_to_phys(data->addr);
+ +
+ +      if (sample_type & PERF_SAMPLE_AUX) {
+ +              u64 size;
+ +
+ +              header->size += sizeof(u64); /* size */
+ +
+ +              /*
+ +               * Given the 16bit nature of header::size, an AUX sample can
+ +               * easily overflow it, what with all the preceding sample bits.
+ +               * Make sure this doesn't happen by using up to U16_MAX bytes
+ +               * per sample in total (rounded down to 8 byte boundary).
+ +               */
+ +              size = min_t(size_t, U16_MAX - header->size,
+ +                           event->attr.aux_sample_size);
+ +              size = rounddown(size, 8);
+ +              size = perf_prepare_sample_aux(event, data, size);
+ +
+ +              WARN_ON_ONCE(size + header->size > U16_MAX);
+ +              header->size += size;
+ +      }
+ +      /*
+ +       * If you're adding more sample types here, you likely need to do
+ +       * something about the overflowing header::size, like repurpose the
+ +       * lowest 3 bits of size, which should be always zero at the moment.
+ +       * This raises a more important question, do we really need 512k sized
+ +       * samples and why, so good argumentation is in order for whatever you
+ +       * do here next.
+ +       */
+ +      WARN_ON_ONCE(header->size & 7);
   }
   
   static __always_inline int
@@@ -10239,7 -10066,7 +10271,7 @@@ static struct lock_class_key cpuctx_loc
   
   int perf_pmu_register(struct pmu *pmu, const char *name, int type)
   {
- -      int cpu, ret;
+ +      int cpu, ret, max = PERF_TYPE_MAX;
   
         mutex_lock(&pmus_lock);
         ret = -ENOMEM;
@@@ -10252,17 -10079,12 +10284,17 @@@
                 goto skip_type;
         pmu->name = name;
   
- -      if (type < 0) {
- -              type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- -              if (type < 0) {
- -                      ret = type;
+ +      if (type != PERF_TYPE_SOFTWARE) {
+ +              if (type >= 0)
+ +                      max = type;
+ +
+ +              ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ +              if (ret < 0)
                         goto free_pdc;
- -              }
+ +
+ +              WARN_ON(type >= 0 && ret != type);
+ +
+ +              type = ret;
         }
         pmu->type = type;
   
@@@ -10339,16 -10161,7 +10371,16 @@@ got_cpu_context
         if (!pmu->event_idx)
                 pmu->event_idx = perf_event_idx_default;
   
- -      list_add_rcu(&pmu->entry, &pmus);
+ +      /*
+ +       * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
+ +       * since these cannot be in the IDR. This way the linear search
+ +       * is fast, provided a valid software event is provided.
+ +       */
+ +      if (type == PERF_TYPE_SOFTWARE || !name)
+ +              list_add_rcu(&pmu->entry, &pmus);
+ +      else
+ +              list_add_tail_rcu(&pmu->entry, &pmus);
+ +
         atomic_set(&pmu->exclusive_cnt, 0);
         ret = 0;
   unlock:
@@@ -10361,7 -10174,7 +10393,7 @@@ free_dev
         put_device(pmu->dev);
   
   free_idr:
- -      if (pmu->type >= PERF_TYPE_MAX)
+ +      if (pmu->type != PERF_TYPE_SOFTWARE)
                 idr_remove(&pmu_idr, pmu->type);
   
   free_pdc:
@@@ -10383,7 -10196,7 +10415,7 @@@ void perf_pmu_unregister(struct pmu *pm
         synchronize_rcu();
   
         free_percpu(pmu->pmu_disable_count);
- -      if (pmu->type >= PERF_TYPE_MAX)
+ +      if (pmu->type != PERF_TYPE_SOFTWARE)
                 idr_remove(&pmu_idr, pmu->type);
         if (pmu_bus_running) {
                 if (pmu->nr_addr_filters)
@@@ -10453,8 -10266,9 +10485,8 @@@ static int perf_try_init_event(struct p
   
   static struct pmu *perf_init_event(struct perf_event *event)
   {
+ +      int idx, type, ret;
         struct pmu *pmu;
- -      int idx;
- -      int ret;
   
         idx = srcu_read_lock(&pmus_srcu);
   
@@@ -10466,28 -10280,13 +10498,28 @@@
                         goto unlock;
         }
   
+ +      /*
+ +       * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ +       * are often aliases for PERF_TYPE_RAW.
+ +       */
+ +      type = event->attr.type;
+ +      if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
+ +              type = PERF_TYPE_RAW;
+ +
+ +again:
         rcu_read_lock();
- -      pmu = idr_find(&pmu_idr, event->attr.type);
+ +      pmu = idr_find(&pmu_idr, type);
         rcu_read_unlock();
         if (pmu) {
                 ret = perf_try_init_event(pmu, event);
+ +              if (ret == -ENOENT && event->attr.type != type) {
+ +                      type = event->attr.type;
+ +                      goto again;
+ +              }
+ +
                 if (ret)
                         pmu = ERR_PTR(ret);
+ +
                 goto unlock;
         }
   
@@@ -10710,12 -10509,9 +10742,9 @@@ perf_event_alloc(struct perf_event_att
                 context = parent_event->overflow_handler_context;
   #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
                 if (overflow_handler == bpf_overflow_handler) {
-                       struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+                       struct bpf_prog *prog = parent_event->prog;
   
-                       if (IS_ERR(prog)) {
-                               err = PTR_ERR(prog);
-                               goto err_ns;
-                       }
+                       bpf_prog_inc(prog);
                         event->prog = prog;
                         event->orig_overflow_handler =
                                 parent_event->orig_overflow_handler;
@@@ -10822,20 -10618,11 +10851,20 @@@
                 }
         }
   
+ +      err = security_perf_event_alloc(event);
+ +      if (err)
+ +              goto err_callchain_buffer;
+ +
         /* symmetric to unaccount_event() in _free_event() */
         account_event(event);
   
         return event;
   
+ +err_callchain_buffer:
+ +      if (!event->parent) {
+ +              if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ +                      put_callchain_buffers();
+ +      }
   err_addr_filters:
         kfree(event->addr_filter_ranges);
   
@@@ -10886,7 -10673,7 +10915,7 @@@ static int perf_copy_attr(struct perf_e
   
         attr->size = size;
   
- -      if (attr->__reserved_1 || attr->__reserved_2)
+ +      if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
                 return -EINVAL;
   
         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@@ -10924,11 -10711,9 +10953,11 @@@
                         attr->branch_sample_type = mask;
                 }
                 /* privileged levels capture (kernel, hv): check permissions */
- -              if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- -                  && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- -                      return -EACCES;
+ +              if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ +                      ret = perf_allow_kernel(attr);
+ +                      if (ret)
+ +                              return ret;
+ +              }
         }
   
         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@@ -11141,19 -10926,13 +11170,19 @@@ SYSCALL_DEFINE5(perf_event_open
         if (flags & ~PERF_FLAG_ALL)
                 return -EINVAL;
   
+ +      /* Do we allow access to perf_event_open(2) ? */
+ +      err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ +      if (err)
+ +              return err;
+ +
         err = perf_copy_attr(attr_uptr, &attr);
         if (err)
                 return err;
   
         if (!attr.exclude_kernel) {
- -              if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- -                      return -EACCES;
+ +              err = perf_allow_kernel(&attr);
+ +              if (err)
+ +                      return err;
         }
   
         if (attr.namespaces) {
@@@ -11170,11 -10949,9 +11199,11 @@@
         }
   
         /* Only privileged users can get physical addresses */
- -      if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- -          perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- -              return -EACCES;
+ +      if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ +              err = perf_allow_kernel(&attr);
+ +              if (err)
+ +                      return err;
+ +      }
   
         err = security_locked_down(LOCKDOWN_PERF);
         if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
@@@ -11436,7 -11213,7 +11465,7 @@@
                 }
         }
   
- -      if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+ +      if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
                 goto err_locked;
   
         /*
author	Arnaldo Carvalho de Melo <[email protected]>
	Tue, 26 Nov 2019 14:06:19 +0000 (11:06 -0300)
committer	Arnaldo Carvalho de Melo <[email protected]>
	Tue, 26 Nov 2019 14:06:19 +0000 (11:06 -0300)
		1	2
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/events/intel/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/perf_event.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history