Merge tag 'perf-core-2022-12-12' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <[email protected]>
Mon, 12 Dec 2022 23:19:38 +0000 (15:19 -0800)
committer Linus Torvalds <[email protected]>
Mon, 12 Dec 2022 23:19:38 +0000 (15:19 -0800)
Pull perf events updates from Ingo Molnar:

 - Thoroughly rewrite the data structures that implement perf task
   context handling, with the goal of fixing various quirks and
   unfeatures both in already-merged and in upcoming proposed code.

   The old data structure is the per task and per cpu
   perf_event_contexts:

         task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
              ^                                 |    ^     |           ^
              `---------------------------------'    |     `--> pmu ---'
                                                     v           ^
                                                perf_event ------'

   In this new design this is replaced with a single task context and a
   single CPU context, plus intermediate data-structures:

         task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
              ^                           |   ^ ^
              `---------------------------'   | |
                                              | |    perf_cpu_pmu_context <--.
                                              | `----.    ^                  |
                                              |      |    |                  |
                                              |      v    v                  |
                                              | ,--> perf_event_pmu_context  |
                                              | |                            |
                                              | |                            |
                                              v v                            |
                                         perf_event ---> pmu ----------------'

    [ See commit bd2756811766 for more details; a rough C outline of the new layout follows this list. ]

   This rewrite was developed by Peter Zijlstra and Ravi Bangoria.

 - Optimize perf_tp_event()

 - Update the Intel uncore PMU driver, extending it with UPI topology
   discovery on various hardware models.

 - Misc fixes & cleanups
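
 A rough C outline of the new context layout described in the first item
 above (illustrative only: the member names shown are the ones visible in
 the diff further down, everything else is elided; see commit bd2756811766
 and include/linux/perf_event.h for the authoritative definitions):

       /* One instance per {perf_event_context, pmu} pair. */
       struct perf_event_pmu_context {
               struct pmu                      *pmu;
               struct perf_event_context       *ctx;
               struct list_head                pmu_ctx_entry;  /* on ctx->pmu_ctx_list */
               struct list_head                pinned_active;
               struct list_head                flexible_active;
               unsigned int                    nr_events;
               int                             rotate_necessary;
               /* ... */
       };

       /* One instance per {cpu, pmu} pair; embeds the CPU context's epc. */
       struct perf_cpu_pmu_context {
               struct perf_event_pmu_context   epc;
               struct perf_event_pmu_context   *task_epc;      /* current task's epc for this pmu */
               struct hrtimer                  hrtimer;        /* event multiplexing / rotation timer */
               /* ... */
       };

       /* One per task (task_struct::perf_event_ctxp) and one per CPU. */
       struct perf_event_context {
               struct list_head                pmu_ctx_list;   /* list of perf_event_pmu_context */
               /* ... */
       };

       /* The single per-CPU context (per-CPU variable perf_cpu_context). */
       struct perf_cpu_context {
               struct perf_event_context       ctx;
               struct perf_event_context       *task_ctx;      /* currently scheduled task context */
               /* ... */
       };

       struct perf_event {
               struct perf_event_context       *ctx;
               struct perf_event_pmu_context   *pmu_ctx;       /* the event's {ctx, pmu} intermediate */
               struct pmu                      *pmu;
               /* ... */
       };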

* tag 'perf-core-2022-12-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (25 commits)
  perf/x86/intel/uncore: Fix reference count leak in __uncore_imc_init_box()
  perf/x86/intel/uncore: Fix reference count leak in snr_uncore_mmio_map()
  perf/x86/intel/uncore: Fix reference count leak in hswep_has_limit_sbox()
  perf/x86/intel/uncore: Fix reference count leak in sad_cfg_iio_topology()
  perf/x86/intel/uncore: Make set_mapping() procedure void
  perf/x86/intel/uncore: Update sysfs-devices-mapping file
  perf/x86/intel/uncore: Enable UPI topology discovery for Sapphire Rapids
  perf/x86/intel/uncore: Enable UPI topology discovery for Icelake Server
  perf/x86/intel/uncore: Get UPI NodeID and GroupID
  perf/x86/intel/uncore: Enable UPI topology discovery for Skylake Server
  perf/x86/intel/uncore: Generalize get_topology() for SKX PMUs
  perf/x86/intel/uncore: Disable I/O stacks to PMU mapping on ICX-D
  perf/x86/intel/uncore: Clear attr_update properly
  perf/x86/intel/uncore: Introduce UPI topology type
  perf/x86/intel/uncore: Generalize IIO topology support
  perf/core: Don't allow grouping events from different hw pmus
  perf/amd/ibs: Make IBS a core pmu
  perf: Fix function pointer case
  perf/x86/amd: Remove the repeated declaration
  perf: Fix possible memleak in pmu_dev_alloc()
  ...

arch/arm64/kernel/perf_event.c
arch/s390/kernel/perf_pai_crypto.c
arch/s390/kernel/perf_pai_ext.c
arch/x86/events/amd/ibs.c
arch/x86/events/intel/core.c
arch/x86/events/intel/ds.c
drivers/perf/arm_pmu.c
include/linux/perf/arm_pmu.h
kernel/events/core.c

diff --combined arch/arm64/kernel/perf_event.c
index a15b3c1d15d91e8b310a9b71355505e97d16927b,54186542a969e49d234c85168bb3cf35da587394..a5193f2146a689b2621c5ea548039de353c656c3
@@@ -806,10 -806,14 +806,14 @@@ static void armv8pmu_disable_event(stru
  
  static void armv8pmu_start(struct arm_pmu *cpu_pmu)
  {
-       struct perf_event_context *task_ctx =
-               this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
+       struct perf_event_context *ctx;
+       int nr_user = 0;
  
-       if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
+       ctx = perf_cpu_task_ctx();
+       if (ctx)
+               nr_user = ctx->nr_user;
+       if (sysctl_perf_user_access && nr_user)
                armv8pmu_enable_user_access(cpu_pmu);
        else
                armv8pmu_disable_user_access();
@@@ -1019,10 -1023,10 +1023,10 @@@ static int armv8pmu_set_event_filter(st
        return 0;
  }
  
- static int armv8pmu_filter_match(struct perf_event *event)
+ static bool armv8pmu_filter(struct pmu *pmu, int cpu)
  {
-       unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
-       return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
+       struct arm_pmu *armpmu = to_arm_pmu(pmu);
+       return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
  }
  
  static void armv8pmu_reset(void *info)
@@@ -1146,8 -1150,7 +1150,8 @@@ static void __armv8pmu_probe_pmu(void *
        dfr0 = read_sysreg(id_aa64dfr0_el1);
        pmuver = cpuid_feature_extract_unsigned_field(dfr0,
                        ID_AA64DFR0_EL1_PMUVer_SHIFT);
 -      if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF || pmuver == 0)
 +      if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF ||
 +          pmuver == ID_AA64DFR0_EL1_PMUVer_NI)
                return;
  
        cpu_pmu->pmuver = pmuver;
@@@ -1254,7 -1257,7 +1258,7 @@@ static int armv8_pmu_init(struct arm_pm
        cpu_pmu->stop                   = armv8pmu_stop;
        cpu_pmu->reset                  = armv8pmu_reset;
        cpu_pmu->set_event_filter       = armv8pmu_set_event_filter;
-       cpu_pmu->filter_match           = armv8pmu_filter_match;
+       cpu_pmu->filter                 = armv8pmu_filter;
  
        cpu_pmu->pmu.event_idx          = armv8pmu_user_event_idx;
  
diff --combined arch/s390/kernel/perf_pai_crypto.c
index 529a2fee4ea511c719ad43e565aa40b854cc0049,f747137f39ae84632eab41792eb3d6e9a55dfa82..985e243a2ed83b80f3f3a24ad6910ce46c83695a
@@@ -35,9 -35,9 +35,9 @@@ struct pai_userdata 
  struct paicrypt_map {
        unsigned long *page;            /* Page for CPU to store counters */
        struct pai_userdata *save;      /* Page to store no-zero counters */
 -      unsigned int users;             /* # of PAI crypto users */
 -      unsigned int sampler;           /* # of PAI crypto samplers */
 -      unsigned int counter;           /* # of PAI crypto counters */
 +      unsigned int active_events;     /* # of PAI crypto users */
 +      unsigned int refcnt;            /* Reference count mapped buffers */
 +      enum paievt_mode mode;          /* Type of event */
        struct perf_event *event;       /* Perf event for sampling */
  };
  
@@@ -56,11 -56,15 +56,11 @@@ static void paicrypt_event_destroy(stru
        cpump->event = NULL;
        static_branch_dec(&pai_key);
        mutex_lock(&pai_reserve_mutex);
 -      if (event->attr.sample_period)
 -              cpump->sampler -= 1;
 -      else
 -              cpump->counter -= 1;
 -      debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d"
 -                          " sampler %d counter %d\n", __func__,
 -                          event->attr.config, event->cpu, cpump->sampler,
 -                          cpump->counter);
 -      if (!cpump->counter && !cpump->sampler) {
 +      debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d"
 +                          " mode %d refcnt %d\n", __func__,
 +                          event->attr.config, event->cpu,
 +                          cpump->active_events, cpump->mode, cpump->refcnt);
 +      if (!--cpump->refcnt) {
                debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
                                    __func__, (unsigned long)cpump->page,
                                    cpump->save);
@@@ -68,7 -72,6 +68,7 @@@
                cpump->page = NULL;
                kvfree(cpump->save);
                cpump->save = NULL;
 +              cpump->mode = PAI_MODE_NONE;
        }
        mutex_unlock(&pai_reserve_mutex);
  }
@@@ -133,14 -136,17 +133,14 @@@ static u64 paicrypt_getall(struct perf_
   */
  static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump)
  {
 -      unsigned int *use_ptr;
        int rc = 0;
  
        mutex_lock(&pai_reserve_mutex);
        if (a->sample_period) {         /* Sampling requested */
 -              use_ptr = &cpump->sampler;
 -              if (cpump->counter || cpump->sampler)
 +              if (cpump->mode != PAI_MODE_NONE)
                        rc = -EBUSY;    /* ... sampling/counting active */
        } else {                        /* Counting requested */
 -              use_ptr = &cpump->counter;
 -              if (cpump->sampler)
 +              if (cpump->mode == PAI_MODE_SAMPLING)
                        rc = -EBUSY;    /* ... and sampling active */
        }
        if (rc)
        rc = 0;
  
  unlock:
 -      /* If rc is non-zero, do not increment counter/sampler. */
 -      if (!rc)
 -              *use_ptr += 1;
 -      debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx sampler %d"
 -                          " counter %d page %#lx save %p rc %d\n", __func__,
 -                          a->sample_period, cpump->sampler, cpump->counter,
 +      /* If rc is non-zero, do not set mode and reference count */
 +      if (!rc) {
 +              cpump->refcnt++;
 +              cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
 +                                             : PAI_MODE_COUNTING;
 +      }
 +      debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d"
 +                          " mode %d refcnt %d page %#lx save %p rc %d\n",
 +                          __func__, a->sample_period, cpump->active_events,
 +                          cpump->mode, cpump->refcnt,
                            (unsigned long)cpump->page, cpump->save, rc);
        mutex_unlock(&pai_reserve_mutex);
        return rc;
@@@ -260,7 -262,7 +260,7 @@@ static int paicrypt_add(struct perf_eve
        struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map);
        unsigned long ccd;
  
 -      if (cpump->users++ == 0) {
 +      if (++cpump->active_events == 1) {
                ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET;
                WRITE_ONCE(S390_lowcore.ccd, ccd);
                __ctl_set_bit(0, 50);
@@@ -291,7 -293,7 +291,7 @@@ static void paicrypt_del(struct perf_ev
        if (!event->attr.sample_period)
                /* Only counting needs to read counter */
                paicrypt_stop(event, PERF_EF_UPDATE);
 -      if (cpump->users-- == 1) {
 +      if (--cpump->active_events == 0) {
                __ctl_clear_bit(0, 50);
                WRITE_ONCE(S390_lowcore.ccd, 0);
        }
@@@ -377,7 -379,7 +377,7 @@@ static int paicrypt_push_sample(void
  /* Called on schedule-in and schedule-out. No access to event structure,
   * but for sampling only event CRYPTO_ALL is allowed.
   */
- static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
+ static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        /* We started with a clean page on event installation. So read out
         * results on schedule_out and if page was dirty, clear values.
diff --combined arch/s390/kernel/perf_pai_ext.c
index a46cd7406b20ca611d977d6073c040b70a9cbe44,9547798594f9fa7744f68926c9698c9ae33fa987..1138f57baae3f0b7e3b14577a9229b1ef6ca3e8d
  static debug_info_t *paiext_dbg;
  static unsigned int paiext_cnt;       /* Extracted with QPACI instruction */
  
 -enum paiext_mode {
 -      PAI_MODE_NONE,
 -      PAI_MODE_SAMPLING,
 -      PAI_MODE_COUNTER,
 -};
 -
  struct pai_userdata {
        u16 num;
        u64 value;
@@@ -48,7 -54,7 +48,7 @@@ struct paiext_cb {            /* PAI extension 1 
  struct paiext_map {
        unsigned long *area;            /* Area for CPU to store counters */
        struct pai_userdata *save;      /* Area to store non-zero counters */
 -      enum paiext_mode mode;          /* Type of event */
 +      enum paievt_mode mode;          /* Type of event */
        unsigned int active_events;     /* # of PAI Extension users */
        unsigned int refcnt;
        struct perf_event *event;       /* Perf event for sampling */
@@@ -186,14 -192,14 +186,14 @@@ static int paiext_alloc(struct perf_eve
                        goto unlock;
                }
                cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
 -                                             : PAI_MODE_COUNTER;
 +                                             : PAI_MODE_COUNTING;
        } else {
                /* Multiple invocation, check whats active.
                 * Supported are multiple counter events or only one sampling
                 * event concurrently at any one time.
                 */
                if (cpump->mode == PAI_MODE_SAMPLING ||
 -                  (cpump->mode == PAI_MODE_COUNTER && a->sample_period)) {
 +                  (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
                        rc = -EBUSY;
                        goto unlock;
                }
@@@ -453,7 -459,6 +453,7 @@@ static int paiext_push_sample(void
                raw.frag.data = cpump->save;
                raw.size = raw.frag.size;
                data.raw = &raw;
 +              data.sample_flags |= PERF_SAMPLE_RAW;
        }
  
        overflow = perf_event_overflow(event, &data, &regs);
  /* Called on schedule-in and schedule-out. No access to event structure,
   * but for sampling only event NNPA_ALL is allowed.
   */
- static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
+ static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        /* We started with a clean page on event installation. So read out
         * results on schedule_out and if page was dirty, clear values.
diff --combined arch/x86/events/amd/ibs.c
index 4cb710efbdd9ad7e43ed1448cb471b8cd9fde975,fbc2ce86f4b81525c2e8ed71683c2705e8ac80e7..da3f5ebac4e1ca9d1dd24a4bc77b9a37113cbeef
@@@ -631,7 -631,7 +631,7 @@@ static const struct attribute_group *op
  
  static struct perf_ibs perf_ibs_fetch = {
        .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
+               .task_ctx_nr    = perf_hw_context,
  
                .event_init     = perf_ibs_init,
                .add            = perf_ibs_add,
  
  static struct perf_ibs perf_ibs_op = {
        .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
+               .task_ctx_nr    = perf_hw_context,
  
                .event_init     = perf_ibs_init,
                .add            = perf_ibs_add,
@@@ -801,7 -801,7 +801,7 @@@ static void perf_ibs_get_mem_lvl(union 
        /* Extension Memory */
        if (ibs_caps & IBS_CAPS_ZEN4 &&
            ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
 -              data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM;
 +              data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
                if (op_data2->rmt_node) {
                        data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
                        /* IBS doesn't provide Remote socket detail */
diff --combined arch/x86/events/intel/core.c
index 1b92bf05fd652a7b92c158ed1b9641809bec234d,d8af75466ee90c16f57103fb467b4e90a2856521..dfd2c124cdf80a819570340caa2b886887d57890
@@@ -4536,8 -4536,6 +4536,6 @@@ end
        cpumask_set_cpu(cpu, &pmu->supported_cpus);
        cpuc->pmu = &pmu->pmu;
  
-       x86_pmu_update_cpu_context(&pmu->pmu, cpu);
        return true;
  }
  
@@@ -4671,17 -4669,17 +4669,17 @@@ static void intel_pmu_cpu_dead(int cpu
                cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
  }
  
- static void intel_pmu_sched_task(struct perf_event_context *ctx,
+ static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
                                 bool sched_in)
  {
-       intel_pmu_pebs_sched_task(ctx, sched_in);
-       intel_pmu_lbr_sched_task(ctx, sched_in);
+       intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
+       intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
  }
  
- static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
-                                   struct perf_event_context *next)
+ static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+                                   struct perf_event_pmu_context *next_epc)
  {
-       intel_pmu_lbr_swap_task_ctx(prev, next);
+       intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
  }
  
  static int intel_pmu_check_period(struct perf_event *event, u64 value)
@@@ -4705,12 -4703,11 +4703,11 @@@ static int intel_pmu_aux_output_match(s
        return is_intel_pt_event(event);
  }
  
- static int intel_pmu_filter_match(struct perf_event *event)
+ static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
  {
-       struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
-       unsigned int cpu = smp_processor_id();
+       struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
  
-       return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+       *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
  }
  
  PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@@ -4911,7 -4908,6 +4908,7 @@@ static const struct x86_cpu_desc isolat
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,             5, 0x00000000),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,             6, 0x00000000),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,             7, 0x00000000),
 +      INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,            11, 0x00000000),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L,             3, 0x0000007c),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE,               3, 0x0000007c),
        INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE,              9, 0x0000004e),
@@@ -6413,7 -6409,7 +6410,7 @@@ __init int intel_pmu_init(void
                static_call_update(intel_pmu_set_topdown_event_period,
                                   &adl_set_topdown_event_period);
  
-               x86_pmu.filter_match = intel_pmu_filter_match;
+               x86_pmu.filter = intel_pmu_filter;
                x86_pmu.get_event_constraints = adl_get_event_constraints;
                x86_pmu.hw_config = adl_hw_config;
                x86_pmu.limit_period = spr_limit_period;
diff --combined arch/x86/events/intel/ds.c
index 446d2833efa768bcc7dd2b63cca574ba29e8cb44,f141cc7b88479c16d8628f351d836d801a4269a2..88e58b6ee73c016cd55a7eb887751778a6c5d086
@@@ -982,13 -982,8 +982,13 @@@ struct event_constraint intel_icl_pebs_
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),  /* SLOTS */
  
        INTEL_PLD_CONSTRAINT(0x1cd, 0xff),                      /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),    /* MEM_INST_RETIRED.LOAD */
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),    /* MEM_INST_RETIRED.STORE */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),   /* MEM_INST_RETIRED.LOCK_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),   /* MEM_INST_RETIRED.ALL_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),   /* MEM_INST_RETIRED.ALL_STORES */
  
        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
  
@@@ -1009,13 -1004,8 +1009,13 @@@ struct event_constraint intel_spr_pebs_
        INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
        INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
        INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),   /* MEM_INST_RETIRED.LOCK_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),   /* MEM_INST_RETIRED.ALL_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),   /* MEM_INST_RETIRED.ALL_STORES */
  
        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
  
@@@ -1069,7 -1059,7 +1069,7 @@@ static inline bool pebs_needs_sched_cb(
        return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
  }
  
- void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  
@@@ -1177,7 -1167,7 +1177,7 @@@ static voi
  pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
                  struct perf_event *event, bool add)
  {
-       struct pmu *pmu = event->ctx->pmu;
+       struct pmu *pmu = event->pmu;
        /*
         * Make sure we get updated with the first PEBS
         * event. It will trigger also during removal, but
diff --combined drivers/perf/arm_pmu.c
index bb56676f50ef98acfc15381d2c463925b70862b9,5ece3f132d80b593821bc12f6355f358f7b1b8c0..9b593f985805eb782ee6e9ec612ff10b2003a8bf
@@@ -514,6 -514,9 +514,6 @@@ static int armpmu_event_init(struct per
        if (has_branch_stack(event))
                return -EOPNOTSUPP;
  
 -      if (armpmu->map_event(event) == -ENOENT)
 -              return -ENOENT;
 -
        return __hw_perf_event_init(event);
  }
  
@@@ -547,15 -550,14 +547,14 @@@ static void armpmu_disable(struct pmu *
   * microarchitecture, and aren't suitable for another. Thus, only match CPUs of
   * the same microarchitecture.
   */
- static int armpmu_filter_match(struct perf_event *event)
+ static bool armpmu_filter(struct pmu *pmu, int cpu)
  {
-       struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-       unsigned int cpu = smp_processor_id();
-       int ret;
+       struct arm_pmu *armpmu = to_arm_pmu(pmu);
+       bool ret;
  
        ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus);
-       if (ret && armpmu->filter_match)
-               return armpmu->filter_match(event);
+       if (ret && armpmu->filter)
+               return armpmu->filter(pmu, cpu);
  
        return ret;
  }
@@@ -858,16 -860,16 +857,16 @@@ static void cpu_pmu_destroy(struct arm_
                                            &cpu_pmu->node);
  }
  
 -static struct arm_pmu *__armpmu_alloc(gfp_t flags)
 +struct arm_pmu *armpmu_alloc(void)
  {
        struct arm_pmu *pmu;
        int cpu;
  
 -      pmu = kzalloc(sizeof(*pmu), flags);
 +      pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
        if (!pmu)
                goto out;
  
 -      pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, flags);
 +      pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL);
        if (!pmu->hw_events) {
                pr_info("failed to allocate per-cpu PMU data.\n");
                goto out_free_pmu;
                .start          = armpmu_start,
                .stop           = armpmu_stop,
                .read           = armpmu_read,
-               .filter_match   = armpmu_filter_match,
+               .filter         = armpmu_filter,
                .attr_groups    = pmu->attr_groups,
                /*
                 * This is a CPU PMU potentially in a heterogeneous
                 * configuration (e.g. big.LITTLE). This is not an uncore PMU,
                 * and we have taken ctx sharing into account (e.g. with our
-                * pmu::filter_match callback and pmu::event_init group
-                * validation).
+                * pmu::filter callback and pmu::event_init group validation).
                 */
                .capabilities   = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
        };
@@@ -913,6 -914,17 +911,6 @@@ out
        return NULL;
  }
  
 -struct arm_pmu *armpmu_alloc(void)
 -{
 -      return __armpmu_alloc(GFP_KERNEL);
 -}
 -
 -struct arm_pmu *armpmu_alloc_atomic(void)
 -{
 -      return __armpmu_alloc(GFP_ATOMIC);
 -}
 -
 -
  void armpmu_free(struct arm_pmu *pmu)
  {
        free_percpu(pmu->hw_events);
diff --combined include/linux/perf/arm_pmu.h
index 0c15c5b7f801ec2684a23b292a078b1295a52f5f,725968095ea9dd0c1b9d9d91c460fd4e808e2a63..ef914a600087eaf2c5bb755f9bb354d58349469c
@@@ -100,7 -100,7 +100,7 @@@ struct arm_pmu 
        void            (*stop)(struct arm_pmu *);
        void            (*reset)(void *);
        int             (*map_event)(struct perf_event *event);
-       int             (*filter_match)(struct perf_event *event);
+       bool            (*filter)(struct pmu *pmu, int cpu);
        int             num_events;
        bool            secure_access; /* 32-bit ARM only */
  #define ARMV8_PMUV3_MAX_COMMON_EVENTS         0x40
@@@ -174,6 -174,7 +174,6 @@@ void kvm_host_pmu_init(struct arm_pmu *
  
  /* Internal functions only for core arm_pmu code */
  struct arm_pmu *armpmu_alloc(void);
 -struct arm_pmu *armpmu_alloc_atomic(void);
  void armpmu_free(struct arm_pmu *pmu);
  int armpmu_register(struct arm_pmu *pmu);
  int armpmu_request_irq(int irq, int cpu);
diff --combined kernel/events/core.c
index 7f04f995c9754891042abf3d0f695d0ba0cc890f,65e20c5c3c44e38ac18645e29fe32e1b13275633..e47914ac8732325572830cf3dee5886bca21022f
@@@ -155,12 -155,6 +155,6 @@@ static int cpu_function_call(int cpu, r
        return data.ret;
  }
  
- static inline struct perf_cpu_context *
- __get_cpu_context(struct perf_event_context *ctx)
- {
-       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
- }
  static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
  {
@@@ -184,6 -178,14 +178,14 @@@ static bool is_kernel_event(struct perf
        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
  }
  
+ static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+ struct perf_event_context *perf_cpu_task_ctx(void)
+ {
+       lockdep_assert_irqs_disabled();
+       return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+ }
  /*
   * On task ctx scheduling...
   *
@@@ -217,7 -219,7 +219,7 @@@ static int event_function(void *info
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;
  
@@@ -314,7 -316,7 +316,7 @@@ again
  static void event_function_local(struct perf_event *event, event_f func, void *data)
  {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;
  
@@@ -388,7 -390,6 +390,6 @@@ static DEFINE_MUTEX(perf_sched_mutex)
  static atomic_t perf_sched_count;
  
  static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
- static DEFINE_PER_CPU(int, perf_sched_cb_usages);
  static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
  
  static atomic_t nr_mmap_events __read_mostly;
@@@ -448,7 -449,7 +449,7 @@@ static void update_perf_cpu_limits(void
        WRITE_ONCE(perf_sample_allowed_ns, tmp);
  }
  
- static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
  
  int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
@@@ -571,12 -572,6 +572,6 @@@ void perf_sample_event_took(u64 sample_
  
  static atomic64_t perf_event_id;
  
- static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type);
- static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-                            enum event_type_t event_type);
  static void update_context_time(struct perf_event_context *ctx);
  static u64 perf_event_time(struct perf_event *event);
  
@@@ -691,13 -686,31 +686,31 @@@ do {                                                                    
        ___p;                                                           \
  })
  
+ static void perf_ctx_disable(struct perf_event_context *ctx)
+ {
+       struct perf_event_pmu_context *pmu_ctx;
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               perf_pmu_disable(pmu_ctx->pmu);
+ }
+ static void perf_ctx_enable(struct perf_event_context *ctx)
+ {
+       struct perf_event_pmu_context *pmu_ctx;
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               perf_pmu_enable(pmu_ctx->pmu);
+ }
+ static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+ static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
  #ifdef CONFIG_CGROUP_PERF
  
  static inline bool
  perf_cgroup_match(struct perf_event *event)
  {
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
  
        /* @event doesn't care about cgroup */
        if (!event->cgrp)
@@@ -823,54 -836,39 +836,39 @@@ perf_cgroup_set_timestamp(struct perf_c
        }
  }
  
- static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
  /*
   * reschedule events based on the cgroup constraint of task.
   */
  static void perf_cgroup_switch(struct task_struct *task)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cgroup *cgrp;
-       struct perf_cpu_context *cpuctx, *tmp;
-       struct list_head *list;
-       unsigned long flags;
-       /*
-        * Disable interrupts and preemption to avoid this CPU's
-        * cgrp_cpuctx_entry to change under us.
-        */
-       local_irq_save(flags);
  
        cgrp = perf_cgroup_from_task(task, NULL);
  
-       list = this_cpu_ptr(&cgrp_cpuctx_list);
-       list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
-               WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-               if (READ_ONCE(cpuctx->cgrp) == cgrp)
-                       continue;
-               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-               perf_pmu_disable(cpuctx->ctx.pmu);
+       WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+       if (READ_ONCE(cpuctx->cgrp) == cgrp)
+               return;
  
-               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-               /*
-                * must not be done before ctxswout due
-                * to update_cgrp_time_from_cpuctx() in
-                * ctx_sched_out()
-                */
-               cpuctx->cgrp = cgrp;
-               /*
-                * set cgrp before ctxsw in to allow
-                * perf_cgroup_set_timestamp() in ctx_sched_in()
-                * to not have to pass task around
-                */
-               cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+       perf_ctx_disable(&cpuctx->ctx);
  
-               perf_pmu_enable(cpuctx->ctx.pmu);
-               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-       }
+       ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+       /*
+        * must not be done before ctxswout due
+        * to update_cgrp_time_from_cpuctx() in
+        * ctx_sched_out()
+        */
+       cpuctx->cgrp = cgrp;
+       /*
+        * set cgrp before ctxsw in to allow
+        * perf_cgroup_set_timestamp() in ctx_sched_in()
+        * to not have to pass task around
+        */
+       ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
  
-       local_irq_restore(flags);
+       perf_ctx_enable(&cpuctx->ctx);
+       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  }
  
  static int perf_cgroup_ensure_storage(struct perf_event *event,
                heap_size++;
  
        for_each_possible_cpu(cpu) {
-               cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+               cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                if (heap_size <= cpuctx->heap_size)
                        continue;
  
@@@ -972,8 -970,6 +970,6 @@@ perf_cgroup_event_enable(struct perf_ev
                return;
  
        cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
-       list_add(&cpuctx->cgrp_cpuctx_entry,
-                       per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
  }
  
  static inline void
@@@ -994,7 -990,6 +990,6 @@@ perf_cgroup_event_disable(struct perf_e
                return;
  
        cpuctx->cgrp = NULL;
-       list_del(&cpuctx->cgrp_cpuctx_entry);
  }
  
  #else /* !CONFIG_CGROUP_PERF */
@@@ -1069,34 -1064,30 +1064,30 @@@ static void perf_cgroup_switch(struct t
   */
  static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_pmu_context *cpc;
        bool rotations;
  
        lockdep_assert_irqs_disabled();
  
-       cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-       rotations = perf_rotate_context(cpuctx);
+       cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+       rotations = perf_rotate_context(cpc);
  
-       raw_spin_lock(&cpuctx->hrtimer_lock);
+       raw_spin_lock(&cpc->hrtimer_lock);
        if (rotations)
-               hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+               hrtimer_forward_now(hr, cpc->hrtimer_interval);
        else
-               cpuctx->hrtimer_active = 0;
-       raw_spin_unlock(&cpuctx->hrtimer_lock);
+               cpc->hrtimer_active = 0;
+       raw_spin_unlock(&cpc->hrtimer_lock);
  
        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
  }
  
- static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
  {
-       struct hrtimer *timer = &cpuctx->hrtimer;
-       struct pmu *pmu = cpuctx->ctx.pmu;
+       struct hrtimer *timer = &cpc->hrtimer;
+       struct pmu *pmu = cpc->epc.pmu;
        u64 interval;
  
-       /* no multiplexing needed for SW PMU */
-       if (pmu->task_ctx_nr == perf_sw_context)
-               return;
        /*
         * check default is sane, if not set then force to
         * default interval (1/tick)
        if (interval < 1)
                interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
  
-       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+       cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
-       raw_spin_lock_init(&cpuctx->hrtimer_lock);
+       raw_spin_lock_init(&cpc->hrtimer_lock);
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
  }
  
- static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+ static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
  {
-       struct hrtimer *timer = &cpuctx->hrtimer;
-       struct pmu *pmu = cpuctx->ctx.pmu;
+       struct hrtimer *timer = &cpc->hrtimer;
        unsigned long flags;
  
-       /* not for SW PMU */
-       if (pmu->task_ctx_nr == perf_sw_context)
-               return 0;
-       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
-       if (!cpuctx->hrtimer_active) {
-               cpuctx->hrtimer_active = 1;
-               hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+       raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+       if (!cpc->hrtimer_active) {
+               cpc->hrtimer_active = 1;
+               hrtimer_forward_now(timer, cpc->hrtimer_interval);
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
        }
-       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+       raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
  
        return 0;
  }
  
+ static int perf_mux_hrtimer_restart_ipi(void *arg)
+ {
+       return perf_mux_hrtimer_restart(arg);
+ }
  void perf_pmu_disable(struct pmu *pmu)
  {
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@@ -1147,32 -1138,9 +1138,9 @@@ void perf_pmu_enable(struct pmu *pmu
                pmu->pmu_enable(pmu);
  }
  
- static DEFINE_PER_CPU(struct list_head, active_ctx_list);
- /*
-  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
-  * perf_event_task_tick() are fully serialized because they're strictly cpu
-  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
-  * disabled, while perf_event_task_tick is called from IRQ context.
-  */
- static void perf_event_ctx_activate(struct perf_event_context *ctx)
- {
-       struct list_head *head = this_cpu_ptr(&active_ctx_list);
-       lockdep_assert_irqs_disabled();
-       WARN_ON(!list_empty(&ctx->active_ctx_list));
-       list_add(&ctx->active_ctx_list, head);
- }
- static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+ static void perf_assert_pmu_disabled(struct pmu *pmu)
  {
-       lockdep_assert_irqs_disabled();
-       WARN_ON(list_empty(&ctx->active_ctx_list));
-       list_del_init(&ctx->active_ctx_list);
+       WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
  }
  
  static void get_ctx(struct perf_event_context *ctx)
@@@ -1199,7 -1167,6 +1167,6 @@@ static void free_ctx(struct rcu_head *h
        struct perf_event_context *ctx;
  
        ctx = container_of(head, struct perf_event_context, rcu_head);
-       free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
        kfree(ctx);
  }
  
@@@ -1384,7 -1351,7 +1351,7 @@@ static u64 primary_event_id(struct perf
   * the context could get moved to another task.
   */
  static struct perf_event_context *
- perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
  {
        struct perf_event_context *ctx;
  
@@@ -1400,7 -1367,7 +1367,7 @@@ retry
         */
        local_irq_save(*flags);
        rcu_read_lock();
-       ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+       ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
-               if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+               if (ctx != rcu_dereference(task->perf_event_ctxp)) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
   * reference count so that the context can't get freed.
   */
  static struct perf_event_context *
- perf_pin_task_context(struct task_struct *task, int ctxn)
+ perf_pin_task_context(struct task_struct *task)
  {
        struct perf_event_context *ctx;
        unsigned long flags;
  
-       ctx = perf_lock_task_context(task, ctxn, &flags);
+       ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@@ -1593,14 -1560,22 +1560,22 @@@ static inline struct cgroup *event_cgro
   * which provides ordering when rotating groups for the same CPU.
   */
  static __always_inline int
- perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
-                     const u64 left_group_index, const struct perf_event *right)
+ perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+                     const struct cgroup *left_cgroup, const u64 left_group_index,
+                     const struct perf_event *right)
  {
        if (left_cpu < right->cpu)
                return -1;
        if (left_cpu > right->cpu)
                return 1;
  
+       if (left_pmu) {
+               if (left_pmu < right->pmu_ctx->pmu)
+                       return -1;
+               if (left_pmu > right->pmu_ctx->pmu)
+                       return 1;
+       }
  #ifdef CONFIG_CGROUP_PERF
        {
                const struct cgroup *right_cgroup = event_cgroup(right);
  static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
  {
        struct perf_event *e = __node_2_pe(a);
-       return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
-                                    __node_2_pe(b)) < 0;
+       return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+                                    e->group_index, __node_2_pe(b)) < 0;
  }
  
  struct __group_key {
        int cpu;
+       struct pmu *pmu;
        struct cgroup *cgroup;
  };
  
@@@ -1657,14 -1633,25 +1633,25 @@@ static inline int __group_cmp(const voi
        const struct __group_key *a = key;
        const struct perf_event *b = __node_2_pe(node);
  
-       /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
-       return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+       /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+       return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+ }
+ static inline int
+ __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+ {
+       const struct __group_key *a = key;
+       const struct perf_event *b = __node_2_pe(node);
+       /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+       return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+                                    b->group_index, b);
  }
  
  /*
-  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
-  * key (see perf_event_groups_less). This places it last inside the CPU
-  * subtree.
+  * Insert @event into @groups' tree; using
+  *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+  * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
   */
  static void
  perf_event_groups_insert(struct perf_event_groups *groups,
@@@ -1714,14 -1701,15 +1701,15 @@@ del_event_from_groups(struct perf_even
  }
  
  /*
-  * Get the leftmost event in the cpu/cgroup subtree.
+  * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
   */
  static struct perf_event *
  perf_event_groups_first(struct perf_event_groups *groups, int cpu,
-                       struct cgroup *cgrp)
+                       struct pmu *pmu, struct cgroup *cgrp)
  {
        struct __group_key key = {
                .cpu = cpu,
+               .pmu = pmu,
                .cgroup = cgrp,
        };
        struct rb_node *node;
        return NULL;
  }
  
- /*
-  * Like rb_entry_next_safe() for the @cpu subtree.
-  */
  static struct perf_event *
- perf_event_groups_next(struct perf_event *event)
+ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
  {
        struct __group_key key = {
                .cpu = event->cpu,
+               .pmu = pmu,
                .cgroup = event_cgroup(event),
        };
        struct rb_node *next;
        return NULL;
  }
  
+ #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)                \
+       for (event = perf_event_groups_first(groups, cpu, pmu, NULL);   \
+            event; event = perf_event_groups_next(event, pmu))
  /*
   * Iterate through the whole groups tree.
   */
@@@ -1796,6 -1786,7 +1786,7 @@@ list_add_event(struct perf_event *event
                perf_cgroup_event_enable(event, ctx);
  
        ctx->generation++;
+       event->pmu_ctx->nr_events++;
  }
  
  /*
@@@ -1941,7 -1932,8 +1932,8 @@@ static void perf_group_attach(struct pe
        lockdep_assert_held(&event->ctx->lock);
  
        /*
-        * We can have double attach due to group movement in perf_event_open.
+        * We can have double attach due to group movement (move_group) in
+        * perf_event_open().
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;
@@@ -2006,6 -1998,7 +1998,7 @@@ list_del_event(struct perf_event *event
        }
  
        ctx->generation++;
+       event->pmu_ctx->nr_events--;
  }
  
  static int
@@@ -2022,13 -2015,11 +2015,11 @@@ perf_aux_output_match(struct perf_even
  
  static void put_event(struct perf_event *event);
  static void event_sched_out(struct perf_event *event,
-                           struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx);
  
  static void perf_put_aux_event(struct perf_event *event)
  {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *iter;
  
        /*
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 */
-               event_sched_out(iter, cpuctx, ctx);
+               event_sched_out(iter, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
        }
  }
@@@ -2108,8 -2099,8 +2099,8 @@@ static int perf_get_aux_event(struct pe
  
  static inline struct list_head *get_event_list(struct perf_event *event)
  {
-       struct perf_event_context *ctx = event->ctx;
-       return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+       return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+                                   &event->pmu_ctx->flexible_active;
  }
  
  /*
   */
  static inline void perf_remove_sibling_event(struct perf_event *event)
  {
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-       event_sched_out(event, cpuctx, ctx);
+       event_sched_out(event, event->ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
  }
  
@@@ -2212,53 -2200,22 +2200,22 @@@ static bool is_orphaned_event(struct pe
        return event->state == PERF_EVENT_STATE_DEAD;
  }
  
- static inline int __pmu_filter_match(struct perf_event *event)
- {
-       struct pmu *pmu = event->pmu;
-       return pmu->filter_match ? pmu->filter_match(event) : 1;
- }
- /*
-  * Check whether we should attempt to schedule an event group based on
-  * PMU-specific filtering. An event group can consist of HW and SW events,
-  * potentially with a SW leader, so we must check all the filters, to
-  * determine whether a group is schedulable:
-  */
- static inline int pmu_filter_match(struct perf_event *event)
- {
-       struct perf_event *sibling;
-       unsigned long flags;
-       int ret = 1;
-       if (!__pmu_filter_match(event))
-               return 0;
-       local_irq_save(flags);
-       for_each_sibling_event(sibling, event) {
-               if (!__pmu_filter_match(sibling)) {
-                       ret = 0;
-                       break;
-               }
-       }
-       local_irq_restore(flags);
-       return ret;
- }
  static inline int
  event_filter_match(struct perf_event *event)
  {
        return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
-              perf_cgroup_match(event) && pmu_filter_match(event);
+              perf_cgroup_match(event);
  }
  
  static void
- event_sched_out(struct perf_event *event,
-                 struct perf_cpu_context *cpuctx,
-                 struct perf_event_context *ctx)
+ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
  {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
  
+       // XXX cpc serialization, probably per-cpu IRQ disabled
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
  
                    !event->pending_work) {
                        event->pending_work = 1;
                        dec = false;
 +                      WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
                        task_work_add(current, &event->pending_task, TWA_RESUME);
                }
                if (dec)
        perf_event_set_state(event, state);
  
        if (!is_software_event(event))
-               cpuctx->active_oncpu--;
-       if (!--ctx->nr_active)
-               perf_event_ctx_deactivate(ctx);
+               cpc->active_oncpu--;
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
-       if (event->attr.exclusive || !cpuctx->active_oncpu)
-               cpuctx->exclusive = 0;
+       if (event->attr.exclusive || !cpc->active_oncpu)
+               cpc->exclusive = 0;
  
        perf_pmu_enable(event->pmu);
  }
  
  static void
- group_sched_out(struct perf_event *group_event,
-               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx)
+ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
  {
        struct perf_event *event;
  
        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
-       perf_pmu_disable(ctx->pmu);
+       perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
  
-       event_sched_out(group_event, cpuctx, ctx);
+       event_sched_out(group_event, ctx);
  
        /*
         * Schedule out siblings (if any):
         */
        for_each_sibling_event(event, group_event)
-               event_sched_out(event, cpuctx, ctx);
-       perf_pmu_enable(ctx->pmu);
+               event_sched_out(event, ctx);
  }
  
  #define DETACH_GROUP  0x01UL
  #define DETACH_CHILD  0x02UL
 +#define DETACH_DEAD   0x04UL
  
  /*
   * Cross CPU call to remove a performance event
@@@ -2351,6 -2300,7 +2302,7 @@@ __perf_remove_from_context(struct perf_
                           struct perf_event_context *ctx,
                           void *info)
  {
+       struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
        unsigned long flags = (unsigned long)info;
  
        if (ctx->is_active & EVENT_TIME) {
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }
  
-       event_sched_out(event, cpuctx, ctx);
 +      /*
 +       * Ensure event_sched_out() switches to OFF, at the very least
 +       * this avoids raising perf_pending_task() at this time.
 +       */
 +      if (flags & DETACH_DEAD)
 +              event->pending_disable = 1;
+       event_sched_out(event, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
 +      if (flags & DETACH_DEAD)
 +              event->state = PERF_EVENT_STATE_DEAD;
  
+       if (!pmu_ctx->nr_events) {
+               pmu_ctx->rotate_necessary = 0;
+               if (ctx->task && ctx->is_active) {
+                       struct perf_cpu_pmu_context *cpc;
+                       cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+                       WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+                       cpc->task_epc = NULL;
+               }
+       }
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);
  
                ctx->is_active = 0;
-               ctx->rotate_necessary = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
@@@ -2408,12 -2361,8 +2371,8 @@@ static void perf_remove_from_context(st
         * event_function_call() user.
         */
        raw_spin_lock_irq(&ctx->lock);
-       /*
-        * Cgroup events are per-cpu events, and must IPI because of
-        * cgrp_cpuctx_list.
-        */
-       if (!ctx->is_active && !is_cgroup_event(event)) {
-               __perf_remove_from_context(event, __get_cpu_context(ctx),
+       if (!ctx->is_active) {
+               __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
                                           ctx, (void *)flags);
                raw_spin_unlock_irq(&ctx->lock);
                return;
@@@ -2439,13 -2388,17 +2398,17 @@@ static void __perf_event_disable(struc
                update_cgrp_time_from_event(event);
        }
  
+       perf_pmu_disable(event->pmu_ctx->pmu);
        if (event == event->group_leader)
-               group_sched_out(event, cpuctx, ctx);
+               group_sched_out(event, ctx);
        else
-               event_sched_out(event, cpuctx, ctx);
+               event_sched_out(event, ctx);
  
        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);
+       perf_pmu_enable(event->pmu_ctx->pmu);
  }
  
  /*
@@@ -2507,10 -2460,10 +2470,10 @@@ static void perf_log_throttle(struct pe
  static void perf_log_itrace_start(struct perf_event *event);
  
  static int
- event_sched_in(struct perf_event *event,
-                struct perf_cpu_context *cpuctx,
-                struct perf_event_context *ctx)
+ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
  {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        int ret = 0;
  
        WARN_ON_ONCE(event->ctx != ctx);
        }
  
        if (!is_software_event(event))
-               cpuctx->active_oncpu++;
-       if (!ctx->nr_active++)
-               perf_event_ctx_activate(ctx);
+               cpc->active_oncpu++;
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;
  
        if (event->attr.exclusive)
-               cpuctx->exclusive = 1;
+               cpc->exclusive = 1;
  
  out:
        perf_pmu_enable(event->pmu);
  }
  
  static int
- group_sched_in(struct perf_event *group_event,
-              struct perf_cpu_context *cpuctx,
-              struct perf_event_context *ctx)
+ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
  {
        struct perf_event *event, *partial_group = NULL;
-       struct pmu *pmu = ctx->pmu;
+       struct pmu *pmu = group_event->pmu_ctx->pmu;
  
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
  
        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
-       if (event_sched_in(group_event, cpuctx, ctx))
+       if (event_sched_in(group_event, ctx))
                goto error;
  
        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
-               if (event_sched_in(event, cpuctx, ctx)) {
+               if (event_sched_in(event, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
@@@ -2605,9 -2554,9 +2564,9 @@@ group_error
                if (event == partial_group)
                        break;
  
-               event_sched_out(event, cpuctx, ctx);
+               event_sched_out(event, ctx);
        }
-       event_sched_out(group_event, cpuctx, ctx);
+       event_sched_out(group_event, ctx);
  
  error:
        pmu->cancel_txn(pmu);
  /*
   * Work out whether we can put this event group on the CPU now.
   */
- static int group_can_go_on(struct perf_event *event,
-                          struct perf_cpu_context *cpuctx,
-                          int can_add_hw)
+ static int group_can_go_on(struct perf_event *event, int can_add_hw)
  {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        /*
         * Groups consisting entirely of software events can always go on.
         */
         * If an exclusive group is already on, no other hardware
         * events can go on.
         */
-       if (cpuctx->exclusive)
+       if (cpc->exclusive)
                return 0;
        /*
         * If this group is exclusive and there are already
@@@ -2652,36 -2602,29 +2612,29 @@@ static void add_event_to_ctx(struct per
        perf_group_attach(event);
  }
  
- static void ctx_sched_out(struct perf_event_context *ctx,
-                         struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type);
- static void
- ctx_sched_in(struct perf_event_context *ctx,
-            struct perf_cpu_context *cpuctx,
-            enum event_type_t event_type);
- static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                              struct perf_event_context *ctx,
-                              enum event_type_t event_type)
+ static void task_ctx_sched_out(struct perf_event_context *ctx,
+                               enum event_type_t event_type)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        if (!cpuctx->task_ctx)
                return;
  
        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;
  
-       ctx_sched_out(ctx, cpuctx, event_type);
+       ctx_sched_out(ctx, event_type);
  }
  
  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
  {
-       cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+       ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, EVENT_PINNED);
+       ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, EVENT_FLEXIBLE);
  }
  
  /*
   * event_type is a bit mask of the types of events involved. For CPU events,
   * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
   */
+ /*
+  * XXX: ctx_resched() reschedules the entire perf_event_context while adding
+  * a new event to the context or enabling an existing event in it. We can
+  * probably optimize this by rescheduling only the affected pmu_ctx.
+  */
  static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
  {
-       enum event_type_t ctx_event_type;
        bool cpu_event = !!(event_type & EVENT_CPU);
  
        /*
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;
  
-       ctx_event_type = event_type & EVENT_ALL;
+       event_type &= EVENT_ALL;
  
-       perf_pmu_disable(cpuctx->ctx.pmu);
-       if (task_ctx)
-               task_ctx_sched_out(cpuctx, task_ctx, event_type);
+       perf_ctx_disable(&cpuctx->ctx);
+       if (task_ctx) {
+               perf_ctx_disable(task_ctx);
+               task_ctx_sched_out(task_ctx, event_type);
+       }
  
        /*
         * Decide which cpu ctx groups to schedule out based on the types
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, ctx_event_type);
-       else if (ctx_event_type & EVENT_PINNED)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+               ctx_sched_out(&cpuctx->ctx, event_type);
+       else if (event_type & EVENT_PINNED)
+               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
  
        perf_event_sched_in(cpuctx, task_ctx);
-       perf_pmu_enable(cpuctx->ctx.pmu);
+       perf_ctx_enable(&cpuctx->ctx);
+       if (task_ctx)
+               perf_ctx_enable(task_ctx);
  }
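
A small standalone model of the event_type handling in ctx_resched() above; the flag values are invented for illustration (they are not the kernel's definitions). The point is only that touching pinned state forces the flexible groups to be rescheduled as well, while EVENT_CPU merely records that the CPU context is affected.

    #include <stdio.h>

    enum event_type_t {
            EVENT_FLEXIBLE  = 0x1,
            EVENT_PINNED    = 0x2,
            EVENT_CPU       = 0x4,          /* placeholder bit */
            EVENT_ALL       = EVENT_FLEXIBLE | EVENT_PINNED,
    };

    static enum event_type_t normalize(enum event_type_t event_type)
    {
            /* pinned groups preempt flexible ones, so both must be redone */
            if (event_type & EVENT_PINNED)
                    event_type |= EVENT_FLEXIBLE;
            return event_type & EVENT_ALL;
    }

    int main(void)
    {
            printf("FLEXIBLE     -> %#x\n", normalize(EVENT_FLEXIBLE));
            printf("PINNED       -> %#x\n", normalize(EVENT_PINNED));
            printf("PINNED|CPU   -> %#x\n", normalize(EVENT_PINNED | EVENT_CPU));
            return 0;
    }
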
  
  void perf_pmu_resched(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
  
        perf_ctx_lock(cpuctx, task_ctx);
@@@ -2755,7 -2707,7 +2717,7 @@@ static int  __perf_install_in_context(v
  {
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;
  #endif
  
        if (reprogram) {
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
@@@ -2830,7 -2782,7 +2792,7 @@@ perf_install_in_context(struct perf_eve
        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
  
        if (event->cpu != -1)
-               event->cpu = cpu;
+               WARN_ON_ONCE(event->cpu != cpu);
  
        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
-        * Similarly, cgroup events for the context also needs the IPI to
-        * manipulate the cgrp_cpuctx_list.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
@@@ -2945,7 -2895,7 +2905,7 @@@ static void __perf_event_enable(struct 
                return;
  
        if (ctx->is_active)
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, EVENT_TIME);
  
        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);
                return;
  
        if (!event_filter_match(event)) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
                return;
        }
  
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
                return;
        }
  
        return err;
  }
  
- static void ctx_sched_out(struct perf_event_context *ctx,
-                         struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type)
+ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+                               enum event_type_t event_type)
  {
+       struct perf_event_context *ctx = pmu_ctx->ctx;
        struct perf_event *event, *tmp;
+       struct pmu *pmu = pmu_ctx->pmu;
+       if (ctx->task && !ctx->is_active) {
+               struct perf_cpu_pmu_context *cpc;
+               cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+               WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+               cpc->task_epc = NULL;
+       }
+       if (!event_type)
+               return;
+       perf_pmu_disable(pmu);
+       if (event_type & EVENT_PINNED) {
+               list_for_each_entry_safe(event, tmp,
+                                        &pmu_ctx->pinned_active,
+                                        active_list)
+                       group_sched_out(event, ctx);
+       }
+       if (event_type & EVENT_FLEXIBLE) {
+               list_for_each_entry_safe(event, tmp,
+                                        &pmu_ctx->flexible_active,
+                                        active_list)
+                       group_sched_out(event, ctx);
+               /*
+                * Since we cleared EVENT_FLEXIBLE, also clear
+                * rotate_necessary; it will be reset by
+                * ctx_flexible_sched_in() when needed.
+                */
+               pmu_ctx->rotate_necessary = 0;
+       }
+       perf_pmu_enable(pmu);
+ }
+ static void
+ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_pmu_context *pmu_ctx;
        int is_active = ctx->is_active;
  
        lockdep_assert_held(&ctx->lock);
  
        is_active ^= ctx->is_active; /* changed bits */
  
-       if (!ctx->nr_active || !(is_active & EVENT_ALL))
-               return;
-       perf_pmu_disable(ctx->pmu);
-       if (is_active & EVENT_PINNED) {
-               list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
-                       group_sched_out(event, cpuctx, ctx);
-       }
-       if (is_active & EVENT_FLEXIBLE) {
-               list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
-                       group_sched_out(event, cpuctx, ctx);
-               /*
-                * Since we cleared EVENT_FLEXIBLE, also clear
-                * rotate_necessary, is will be reset by
-                * ctx_flexible_sched_in() when needed.
-                */
-               ctx->rotate_necessary = 0;
-       }
-       perf_pmu_enable(ctx->pmu);
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               __pmu_ctx_sched_out(pmu_ctx, is_active);
  }
  
  /*
@@@ -3409,26 -3381,68 +3391,68 @@@ static void perf_event_sync_stat(struc
        }
  }
  
- static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
-                                        struct task_struct *next)
+ #define double_list_for_each_entry(pos1, pos2, head1, head2, member)  \
+       for (pos1 = list_first_entry(head1, typeof(*pos1), member),     \
+            pos2 = list_first_entry(head2, typeof(*pos2), member);     \
+            !list_entry_is_head(pos1, head1, member) &&                \
+            !list_entry_is_head(pos2, head2, member);                  \
+            pos1 = list_next_entry(pos1, member),                      \
+            pos2 = list_next_entry(pos2, member))
+ static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+                                         struct perf_event_context *next_ctx)
+ {
+       struct perf_event_pmu_context *prev_epc, *next_epc;
+       if (!prev_ctx->nr_task_data)
+               return;
+       double_list_for_each_entry(prev_epc, next_epc,
+                                  &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+                                  pmu_ctx_entry) {
+               if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+                       continue;
+               /*
+                * PMU specific parts of task perf context can require
+                * additional synchronization. As an example of such
+                * synchronization see implementation details of Intel
+                * LBR call stack data profiling;
+                */
+               if (prev_epc->pmu->swap_task_ctx)
+                       prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+               else
+                       swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+       }
+ }
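
The double_list_for_each_entry() macro above walks two lists in lockstep and stops as soon as either cursor wraps back to its head. Here is a self-contained userspace sketch of the same idea; the node type and hand-rolled circular list are simplified stand-ins for struct list_head and the container_of() machinery.

    #include <stdio.h>

    struct node {
            struct node *next;
            int pmu_id;                     /* stands in for ...->pmu */
    };

    #define double_for_each(p1, p2, h1, h2)                         \
            for ((p1) = (h1)->next, (p2) = (h2)->next;              \
                 (p1) != (h1) && (p2) != (h2);                      \
                 (p1) = (p1)->next, (p2) = (p2)->next)

    int main(void)
    {
            struct node head1, head2;
            struct node a = { .pmu_id = 1 }, b = { .pmu_id = 2 };
            struct node c = { .pmu_id = 1 }, d = { .pmu_id = 2 };
            struct node *p1, *p2;

            /* head1 -> a -> b -> head1 and head2 -> c -> d -> head2 */
            head1.next = &a; a.next = &b; b.next = &head1;
            head2.next = &c; c.next = &d; d.next = &head2;

            double_for_each(p1, p2, &head1, &head2) {
                    /* mirrors the WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu) */
                    printf("pair %d <-> %d%s\n", p1->pmu_id, p2->pmu_id,
                           p1->pmu_id != p2->pmu_id ? " (mismatch!)" : "");
            }
            return 0;
    }
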
+ static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+ {
+       struct perf_event_pmu_context *pmu_ctx;
+       struct perf_cpu_pmu_context *cpc;
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+               if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+                       pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+       }
+ }
+ static void
+ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
  {
-       struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+       struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
-       struct perf_cpu_context *cpuctx;
        int do_switch = 1;
-       struct pmu *pmu;
  
        if (likely(!ctx))
                return;
  
-       pmu = ctx->pmu;
-       cpuctx = __get_cpu_context(ctx);
-       if (!cpuctx->task_ctx)
-               return;
        rcu_read_lock();
-       next_ctx = next->perf_event_ctxp[ctxn];
+       next_ctx = rcu_dereference(next->perf_event_ctxp);
        if (!next_ctx)
                goto unlock;
  
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {
  
-                       perf_pmu_disable(pmu);
+                       perf_ctx_disable(ctx);
  
                        /* PMIs are disabled; ctx->nr_pending is stable. */
                        if (local_read(&ctx->nr_pending) ||
                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);
  
-                       if (cpuctx->sched_cb_usage && pmu->sched_task)
-                               pmu->sched_task(ctx, false);
-                       /*
-                        * PMU specific parts of task perf context can require
-                        * additional synchronization. As an example of such
-                        * synchronization see implementation details of Intel
-                        * LBR call stack data profiling;
-                        */
-                       if (pmu->swap_task_ctx)
-                               pmu->swap_task_ctx(ctx, next_ctx);
-                       else
-                               swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+                       perf_ctx_sched_task_cb(ctx, false);
+                       perf_event_swap_task_ctx_data(ctx, next_ctx);
  
-                       perf_pmu_enable(pmu);
+                       perf_ctx_enable(ctx);
  
                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
-                       RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
-                       RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+                       RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+                       RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
  
                        do_switch = 0;
  
@@@ -3508,38 -3511,40 +3521,40 @@@ unlock
  
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
-               perf_pmu_disable(pmu);
+               perf_ctx_disable(ctx);
  
  inside_switch:
-               if (cpuctx->sched_cb_usage && pmu->sched_task)
-                       pmu->sched_task(ctx, false);
-               task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+               perf_ctx_sched_task_cb(ctx, false);
+               task_ctx_sched_out(ctx, EVENT_ALL);
  
-               perf_pmu_enable(pmu);
+               perf_ctx_enable(ctx);
                raw_spin_unlock(&ctx->lock);
        }
  }
  
  static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+ static DEFINE_PER_CPU(int, perf_sched_cb_usages);
  
  void perf_sched_cb_dec(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
  
        this_cpu_dec(perf_sched_cb_usages);
+       barrier();
  
-       if (!--cpuctx->sched_cb_usage)
-               list_del(&cpuctx->sched_cb_entry);
+       if (!--cpc->sched_cb_usage)
+               list_del(&cpc->sched_cb_entry);
  }
  
  
  void perf_sched_cb_inc(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
  
-       if (!cpuctx->sched_cb_usage++)
-               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+       if (!cpc->sched_cb_usage++)
+               list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
  
+       barrier();
        this_cpu_inc(perf_sched_cb_usages);
  }
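
The per-CPU perf_sched_cb_usages counter exists so the context-switch path can skip the callback list walk entirely when no PMU has registered a sched_task() callback. Below is a toy single-CPU model of that fast path; per-CPU storage and the barrier() around the update are kernel details deliberately left out of the sketch.

    #include <stdio.h>

    static int sched_cb_usages;             /* perf_sched_cb_usages analogue */

    static void sched_cb_inc(void) { sched_cb_usages++; }
    static void sched_cb_dec(void) { sched_cb_usages--; }

    static void context_switch_hook(void)
    {
            if (!sched_cb_usages)           /* common case: nothing registered */
                    return;
            printf("walk sched_cb_list and call pmu->sched_task()\n");
    }

    int main(void)
    {
            context_switch_hook();          /* skipped */
            sched_cb_inc();
            context_switch_hook();          /* walks the list */
            sched_cb_dec();
            context_switch_hook();          /* skipped again */
            return 0;
    }
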
  
   * PEBS requires this to provide PID/TID information. This requires we flush
   * all queued PEBS records before we context switch to a new task.
   */
- static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu;
  
-       pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+       pmu = cpc->epc.pmu;
  
+       /* software PMUs will not have sched_task */
        if (WARN_ON_ONCE(!pmu->sched_task))
                return;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);
  
-       pmu->sched_task(cpuctx->task_ctx, sched_in);
+       pmu->sched_task(cpc->task_epc, sched_in);
  
        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@@ -3573,26 -3580,20 +3590,20 @@@ static void perf_pmu_sched_task(struct 
                                struct task_struct *next,
                                bool sched_in)
  {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_cpu_pmu_context *cpc;
  
-       if (prev == next)
+       /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+       if (prev == next || cpuctx->task_ctx)
                return;
  
-       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-               /* will be handled in perf_event_context_sched_in/out */
-               if (cpuctx->task_ctx)
-                       continue;
-               __perf_pmu_sched_task(cpuctx, sched_in);
-       }
+       list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+               __perf_pmu_sched_task(cpc, sched_in);
  }
  
  static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);
  
- #define for_each_task_context_nr(ctxn)                                        \
-       for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
  /*
   * Called from scheduler to remove the events of the current task,
   * with interrupts disabled.
  void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
  {
-       int ctxn;
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);
  
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);
  
-       for_each_task_context_nr(ctxn)
-               perf_event_context_sched_out(task, ctxn, next);
+       perf_event_context_sched_out(task, next);
  
        /*
         * if cgroup events exist on this CPU, then we need
                perf_cgroup_switch(next);
  }
  
- /*
-  * Called with IRQs disabled
-  */
- static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type)
- {
-       ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
- }
  static bool perf_less_group_idx(const void *l, const void *r)
  {
        const struct perf_event *le = *(const struct perf_event **)l;
@@@ -3667,21 -3656,39 +3666,39 @@@ static void __heap_add(struct min_heap 
        }
  }
  
- static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+ static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+ {
+       struct perf_cpu_pmu_context *cpc;
+       if (!pmu_ctx->ctx->task)
+               return;
+       cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+       WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+       cpc->task_epc = pmu_ctx;
+ }
+ static noinline int visit_groups_merge(struct perf_event_context *ctx,
                                struct perf_event_groups *groups, int cpu,
+                               struct pmu *pmu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
  {
  #ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
  #endif
+       struct perf_cpu_context *cpuctx = NULL;
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;
  
-       if (cpuctx) {
+       if (pmu->filter && pmu->filter(pmu, cpu))
+               return 0;
+       if (!ctx->task) {
+               cpuctx = this_cpu_ptr(&perf_cpu_context);
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
-               __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+               __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
        }
        evt = event_heap.data;
  
-       __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+       __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
  
  #ifdef CONFIG_CGROUP_PERF
        for (; css; css = css->parent)
-               __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+               __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
  #endif
  
+       if (event_heap.nr) {
+               __link_epc((*evt)->pmu_ctx);
+               perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+       }
        min_heapify_all(&event_heap, &perf_min_heap);
  
        while (event_heap.nr) {
                if (ret)
                        return ret;
  
-               *evt = perf_event_groups_next(*evt);
+               *evt = perf_event_groups_next(*evt, pmu);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
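
The loop above is a k-way merge: the any-CPU iterator, the this-CPU iterator and one iterator per nested cgroup are each already sorted, and the globally smallest event is consumed on every step. An illustrative userspace version follows; the kernel keeps the iterator heads in a min-heap (min_heapify_all()/min_heapify()), while this sketch picks the smallest head by linear scan, which behaves the same for the handful of iterators involved.

    #include <stdio.h>

    struct iter {
            const int *ev;                  /* sorted "event ids", -1 terminated */
    };

    static int visit_merge(struct iter *it, int n,
                           int (*func)(int, void *), void *data)
    {
            for (;;) {
                    int best = -1;

                    for (int i = 0; i < n; i++) {
                            if (*it[i].ev < 0)
                                    continue;       /* iterator exhausted */
                            if (best < 0 || *it[i].ev < *it[best].ev)
                                    best = i;
                    }
                    if (best < 0)
                            return 0;               /* everything visited */

                    int ret = func(*it[best].ev, data);
                    if (ret)
                            return ret;             /* merge callback may stop early */
                    it[best].ev++;                  /* advance only the consumed iterator */
            }
    }

    static int print_event(int ev, void *data)
    {
            (void)data;
            printf("sched in event %d\n", ev);
            return 0;
    }

    int main(void)
    {
            const int any_cpu[]  = { 1, 4, 9, -1 };         /* cpu == -1 groups */
            const int this_cpu[] = { 2, 3, 7, -1 };         /* this CPU's groups */
            struct iter it[] = { { any_cpu }, { this_cpu } };

            return visit_merge(it, 2, print_event, NULL);
    }
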
@@@ -3761,7 -3773,6 +3783,6 @@@ static inline void group_update_userpag
  static int merge_sched_in(struct perf_event *event, void *data)
  {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int *can_add_hw = data;
  
        if (event->state <= PERF_EVENT_STATE_OFF)
        if (!event_filter_match(event))
                return 0;
  
-       if (group_can_go_on(event, cpuctx, *can_add_hw)) {
-               if (!group_sched_in(event, cpuctx, ctx))
+       if (group_can_go_on(event, *can_add_hw)) {
+               if (!group_sched_in(event, ctx))
                        list_add_tail(&event->active_list, get_event_list(event));
        }
  
                        perf_cgroup_event_disable(event, ctx);
                        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
                } else {
-                       ctx->rotate_necessary = 1;
-                       perf_mux_hrtimer_restart(cpuctx);
+                       struct perf_cpu_pmu_context *cpc;
+                       event->pmu_ctx->rotate_necessary = 1;
+                       cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+                       perf_mux_hrtimer_restart(cpc);
                        group_update_userpage(event);
                }
        }
        return 0;
  }
  
- static void
- ctx_pinned_sched_in(struct perf_event_context *ctx,
-                   struct perf_cpu_context *cpuctx)
+ static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
  {
+       struct perf_event_pmu_context *pmu_ctx;
        int can_add_hw = 1;
  
-       if (ctx != &cpuctx->ctx)
-               cpuctx = NULL;
-       visit_groups_merge(cpuctx, &ctx->pinned_groups,
-                          smp_processor_id(),
-                          merge_sched_in, &can_add_hw);
+       if (pmu) {
+               visit_groups_merge(ctx, &ctx->pinned_groups,
+                                  smp_processor_id(), pmu,
+                                  merge_sched_in, &can_add_hw);
+       } else {
+               list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+                       can_add_hw = 1;
+                       visit_groups_merge(ctx, &ctx->pinned_groups,
+                                          smp_processor_id(), pmu_ctx->pmu,
+                                          merge_sched_in, &can_add_hw);
+               }
+       }
  }
  
- static void
- ctx_flexible_sched_in(struct perf_event_context *ctx,
-                     struct perf_cpu_context *cpuctx)
+ static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
  {
+       struct perf_event_pmu_context *pmu_ctx;
        int can_add_hw = 1;
  
-       if (ctx != &cpuctx->ctx)
-               cpuctx = NULL;
+       if (pmu) {
+               visit_groups_merge(ctx, &ctx->flexible_groups,
+                                  smp_processor_id(), pmu,
+                                  merge_sched_in, &can_add_hw);
+       } else {
+               list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+                       can_add_hw = 1;
+                       visit_groups_merge(ctx, &ctx->flexible_groups,
+                                          smp_processor_id(), pmu_ctx->pmu,
+                                          merge_sched_in, &can_add_hw);
+               }
+       }
+ }
  
-       visit_groups_merge(cpuctx, &ctx->flexible_groups,
-                          smp_processor_id(),
-                          merge_sched_in, &can_add_hw);
+ static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+ {
+       ctx_flexible_sched_in(ctx, pmu);
  }
  
  static void
- ctx_sched_in(struct perf_event_context *ctx,
-            struct perf_cpu_context *cpuctx,
-            enum event_type_t event_type)
+ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        int is_active = ctx->is_active;
  
        lockdep_assert_held(&ctx->lock);
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
-               ctx_pinned_sched_in(ctx, cpuctx);
+               ctx_pinned_sched_in(ctx, NULL);
  
        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
-               ctx_flexible_sched_in(ctx, cpuctx);
+               ctx_flexible_sched_in(ctx, NULL);
  }
  
- static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-                            enum event_type_t event_type)
+ static void perf_event_context_sched_in(struct task_struct *task)
  {
-       struct perf_event_context *ctx = &cpuctx->ctx;
-       ctx_sched_in(ctx, cpuctx, event_type);
- }
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_context *ctx;
  
- static void perf_event_context_sched_in(struct perf_event_context *ctx,
-                                       struct task_struct *task)
- {
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
+       rcu_read_lock();
+       ctx = rcu_dereference(task->perf_event_ctxp);
+       if (!ctx)
+               goto rcu_unlock;
  
-       cpuctx = __get_cpu_context(ctx);
+       if (cpuctx->task_ctx == ctx) {
+               perf_ctx_lock(cpuctx, ctx);
+               perf_ctx_disable(ctx);
  
-       /*
-        * HACK: for HETEROGENEOUS the task context might have switched to a
-        * different PMU, force (re)set the context,
-        */
-       pmu = ctx->pmu = cpuctx->ctx.pmu;
+               perf_ctx_sched_task_cb(ctx, true);
  
-       if (cpuctx->task_ctx == ctx) {
-               if (cpuctx->sched_cb_usage)
-                       __perf_pmu_sched_task(cpuctx, true);
-               return;
+               perf_ctx_enable(ctx);
+               perf_ctx_unlock(cpuctx, ctx);
+               goto rcu_unlock;
        }
  
        perf_ctx_lock(cpuctx, ctx);
        if (!ctx->nr_events)
                goto unlock;
  
-       perf_pmu_disable(pmu);
+       perf_ctx_disable(ctx);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
-       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+               perf_ctx_disable(&cpuctx->ctx);
+               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+       }
        perf_event_sched_in(cpuctx, ctx);
  
-       if (cpuctx->sched_cb_usage && pmu->sched_task)
-               pmu->sched_task(cpuctx->task_ctx, true);
+       perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
  
-       perf_pmu_enable(pmu);
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+               perf_ctx_enable(&cpuctx->ctx);
+       perf_ctx_enable(ctx);
  
  unlock:
        perf_ctx_unlock(cpuctx, ctx);
+ rcu_unlock:
+       rcu_read_unlock();
  }
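
perf_event_context_sched_in() above preserves the priority order CPU pinned, task pinned, CPU flexible, task flexible, which is why the CPU flexible groups are flipped out first whenever the incoming task context carries pinned groups. A toy model of that ordering with an invented counter budget:

    #include <stdio.h>

    int main(void)
    {
            const char *class[] = {
                    "cpu pinned", "task pinned", "cpu flexible", "task flexible",
            };
            const int want[] = { 1, 2, 2, 2 };      /* groups wanting counters */
            int budget = 4;                         /* pretend PMU counters */

            for (int i = 0; i < 4; i++) {
                    int got = want[i] < budget ? want[i] : budget;

                    budget -= got;
                    printf("%-13s gets %d of %d group(s)\n", class[i], got, want[i]);
            }
            return 0;
    }
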
  
  /*
  void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
  {
-       struct perf_event_context *ctx;
-       int ctxn;
-       for_each_task_context_nr(ctxn) {
-               ctx = task->perf_event_ctxp[ctxn];
-               if (likely(!ctx))
-                       continue;
-               perf_event_context_sched_in(ctx, task);
-       }
+       perf_event_context_sched_in(task);
  
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);
@@@ -4063,8 -4082,8 +4092,8 @@@ static void perf_adjust_period(struct p
   * events. At the same time, make sure, having freq events does not change
   * the rate of unthrottling as that would introduce bias.
   */
 - static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 -                                            int needs_unthr)
 + static void
 + perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
  {
        struct perf_event *event;
        struct hw_perf_event *hwc;
         * - context have events in frequency mode (needs freq adjust)
         * - there are events to unthrottle on this cpu
         */
-       if (!(ctx->nr_freq || needs_unthr))
+       if (!(ctx->nr_freq || unthrottle))
                return;
  
        raw_spin_lock(&ctx->lock);
-       perf_pmu_disable(ctx->pmu);
  
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
  
+               // XXX use visit thingy to avoid the -1,cpu match
                if (!event_filter_match(event))
                        continue;
  
                perf_pmu_enable(event->pmu);
        }
  
-       perf_pmu_enable(ctx->pmu);
        raw_spin_unlock(&ctx->lock);
  }
  
@@@ -4148,72 -4166,109 +4176,109 @@@ static void rotate_ctx(struct perf_even
  
  /* pick an event from the flexible_groups to rotate */
  static inline struct perf_event *
- ctx_event_to_rotate(struct perf_event_context *ctx)
+ ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
  {
        struct perf_event *event;
+       struct rb_node *node;
+       struct rb_root *tree;
+       struct __group_key key = {
+               .pmu = pmu_ctx->pmu,
+       };
  
        /* pick the first active flexible event */
-       event = list_first_entry_or_null(&ctx->flexible_active,
+       event = list_first_entry_or_null(&pmu_ctx->flexible_active,
                                         struct perf_event, active_list);
+       if (event)
+               goto out;
  
        /* if no active flexible event, pick the first event */
-       if (!event) {
-               event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
-                                     typeof(*event), group_node);
-       }
+       tree = &pmu_ctx->ctx->flexible_groups.tree;
  
+       if (!pmu_ctx->ctx->task) {
+               key.cpu = smp_processor_id();
+               node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+               if (node)
+                       event = __node_2_pe(node);
+               goto out;
+       }
+       key.cpu = -1;
+       node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+       if (node) {
+               event = __node_2_pe(node);
+               goto out;
+       }
+       key.cpu = smp_processor_id();
+       node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+       if (node)
+               event = __node_2_pe(node);
+ out:
        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
-       ctx->rotate_necessary = 0;
+       pmu_ctx->rotate_necessary = 0;
  
        return event;
  }
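
Once an event is picked, rotate_ctx() requeues its group at the end of the ordering so groups that lost out on counters this time are tried first on the next rotation. The kernel rotates an rb-tree keyed by {pmu, cpu, group_index}; the plain array below is only meant to show the round-robin effect.

    #include <stdio.h>

    static void rotate(int *groups, int n)
    {
            int first = groups[0];

            for (int i = 1; i < n; i++)     /* shift the rest up ... */
                    groups[i - 1] = groups[i];
            groups[n - 1] = first;          /* ... and requeue the old head last */
    }

    int main(void)
    {
            int flexible[] = { 10, 20, 30, 40 };
            const int n = sizeof(flexible) / sizeof(flexible[0]);

            for (int round = 0; round < 3; round++) {
                    printf("round %d schedules group %d first\n",
                           round, flexible[0]);
                    rotate(flexible, n);
            }
            return 0;
    }
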
  
- static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
        struct perf_event *cpu_event = NULL, *task_event = NULL;
-       struct perf_event_context *task_ctx = NULL;
        int cpu_rotate, task_rotate;
+       struct pmu *pmu;
  
        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */
  
-       cpu_rotate = cpuctx->ctx.rotate_necessary;
-       task_ctx = cpuctx->task_ctx;
-       task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+       cpu_epc = &cpc->epc;
+       pmu = cpu_epc->pmu;
+       task_epc = cpc->task_epc;
+       cpu_rotate = cpu_epc->rotate_necessary;
+       task_rotate = task_epc ? task_epc->rotate_necessary : 0;
  
        if (!(cpu_rotate || task_rotate))
                return false;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-       perf_pmu_disable(cpuctx->ctx.pmu);
+       perf_pmu_disable(pmu);
  
        if (task_rotate)
-               task_event = ctx_event_to_rotate(task_ctx);
+               task_event = ctx_event_to_rotate(task_epc);
        if (cpu_rotate)
-               cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+               cpu_event = ctx_event_to_rotate(cpu_epc);
  
        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
-       if (task_event || (task_ctx && cpu_event))
-               ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
-       if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (task_event || (task_epc && cpu_event)) {
+               update_context_time(task_epc->ctx);
+               __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+       }
  
-       if (task_event)
-               rotate_ctx(task_ctx, task_event);
-       if (cpu_event)
+       if (cpu_event) {
+               update_context_time(&cpuctx->ctx);
+               __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                rotate_ctx(&cpuctx->ctx, cpu_event);
+               __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+       }
  
-       perf_event_sched_in(cpuctx, task_ctx);
+       if (task_event)
+               rotate_ctx(task_epc->ctx, task_event);
  
-       perf_pmu_enable(cpuctx->ctx.pmu);
+       if (task_event || (task_epc && cpu_event))
+               __pmu_ctx_sched_in(task_epc->ctx, pmu);
+       perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  
        return true;
  
  void perf_event_task_tick(void)
  {
-       struct list_head *head = this_cpu_ptr(&active_ctx_list);
-       struct perf_event_context *ctx, *tmp;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_context *ctx;
        int throttled;
  
        lockdep_assert_irqs_disabled();
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
  
-       list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
-               perf_adjust_freq_unthr_context(ctx, throttled);
+       perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+       rcu_read_lock();
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_adjust_freq_unthr_context(ctx, !!throttled);
+       rcu_read_unlock();
  }
  
  static int event_enable_on_exec(struct perf_event *event,
   * Enable all of a task's events that have been marked enable-on-exec.
   * This expects task == current.
   */
- static void perf_event_enable_on_exec(int ctxn)
+ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
  {
-       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event_context *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        int enabled = 0;
  
        local_irq_save(flags);
-       ctx = current->perf_event_ctxp[ctxn];
-       if (!ctx || !ctx->nr_events)
+       if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
+               goto out;
+       if (!ctx->nr_events)
                goto out;
  
-       cpuctx = __get_cpu_context(ctx);
+       cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
        }
        perf_ctx_unlock(cpuctx, ctx);
  
@@@ -4302,17 -4365,13 +4375,13 @@@ static void perf_event_exit_event(struc
   * Removes all events from the current task that have been marked
   * remove-on-exec, and feeds their values back to parent events.
   */
- static void perf_event_remove_on_exec(int ctxn)
+ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
  {
-       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event_context *clone_ctx = NULL;
        struct perf_event *event, *next;
        unsigned long flags;
        bool modified = false;
  
-       ctx = perf_pin_task_context(current, ctxn);
-       if (!ctx)
-               return;
        mutex_lock(&ctx->mutex);
  
        if (WARN_ON_ONCE(ctx->task != current))
        raw_spin_lock_irqsave(&ctx->lock, flags);
        if (modified)
                clone_ctx = unclone_ctx(ctx);
-       --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
  
  unlock:
        mutex_unlock(&ctx->mutex);
  
-       put_ctx(ctx);
        if (clone_ctx)
                put_ctx(clone_ctx);
  }
@@@ -4375,7 -4432,7 +4442,7 @@@ static void __perf_event_read(void *inf
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = event->pmu;
  
        /*
@@@ -4601,17 -4658,25 +4668,25 @@@ static void __perf_event_init_context(s
  {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
-       INIT_LIST_HEAD(&ctx->active_ctx_list);
+       INIT_LIST_HEAD(&ctx->pmu_ctx_list);
        perf_event_groups_init(&ctx->pinned_groups);
        perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
-       INIT_LIST_HEAD(&ctx->pinned_active);
-       INIT_LIST_HEAD(&ctx->flexible_active);
        refcount_set(&ctx->refcount, 1);
  }
  
+ static void
+ __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+ {
+       epc->pmu = pmu;
+       INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+       INIT_LIST_HEAD(&epc->pinned_active);
+       INIT_LIST_HEAD(&epc->flexible_active);
+       atomic_set(&epc->refcount, 1);
+ }
  static struct perf_event_context *
- alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+ alloc_perf_context(struct task_struct *task)
  {
        struct perf_event_context *ctx;
  
        __perf_event_init_context(ctx);
        if (task)
                ctx->task = get_task_struct(task);
-       ctx->pmu = pmu;
  
        return ctx;
  }
@@@ -4651,15 -4715,12 +4725,12 @@@ find_lively_task_by_vpid(pid_t vpid
   * Returns a matching context with refcount and pincount.
   */
  static struct perf_event_context *
- find_get_context(struct pmu *pmu, struct task_struct *task,
-               struct perf_event *event)
+ find_get_context(struct task_struct *task, struct perf_event *event)
  {
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
-       void *task_ctx_data = NULL;
        unsigned long flags;
-       int ctxn, err;
-       int cpu = event->cpu;
+       int err;
  
        if (!task) {
                /* Must be root to operate on a CPU event: */
                if (err)
                        return ERR_PTR(err);
  
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                raw_spin_lock_irqsave(&ctx->lock, flags);
        }
  
        err = -EINVAL;
-       ctxn = pmu->task_ctx_nr;
-       if (ctxn < 0)
-               goto errout;
-       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-               task_ctx_data = alloc_task_ctx_data(pmu);
-               if (!task_ctx_data) {
-                       err = -ENOMEM;
-                       goto errout;
-               }
-       }
  retry:
-       ctx = perf_lock_task_context(task, ctxn, &flags);
+       ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;
  
-               if (task_ctx_data && !ctx->task_ctx_data) {
-                       ctx->task_ctx_data = task_ctx_data;
-                       task_ctx_data = NULL;
-               }
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
  
                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
-               ctx = alloc_perf_context(pmu, task);
+               ctx = alloc_perf_context(task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
  
-               if (task_ctx_data) {
-                       ctx->task_ctx_data = task_ctx_data;
-                       task_ctx_data = NULL;
-               }
                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
-               else if (task->perf_event_ctxp[ctxn])
+               else if (task->perf_event_ctxp)
                        err = -EAGAIN;
                else {
                        get_ctx(ctx);
                        ++ctx->pin_count;
-                       rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+                       rcu_assign_pointer(task->perf_event_ctxp, ctx);
                }
                mutex_unlock(&task->perf_event_mutex);
  
                }
        }
  
-       free_task_ctx_data(pmu, task_ctx_data);
        return ctx;
  
  errout:
-       free_task_ctx_data(pmu, task_ctx_data);
        return ERR_PTR(err);
  }
  
+ static struct perf_event_pmu_context *
+ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+                    struct perf_event *event)
+ {
+       struct perf_event_pmu_context *new = NULL, *epc;
+       void *task_ctx_data = NULL;
+       if (!ctx->task) {
+               struct perf_cpu_pmu_context *cpc;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+               epc = &cpc->epc;
+               if (!epc->ctx) {
+                       atomic_set(&epc->refcount, 1);
+                       epc->embedded = 1;
+                       raw_spin_lock_irq(&ctx->lock);
+                       list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+                       epc->ctx = ctx;
+                       raw_spin_unlock_irq(&ctx->lock);
+               } else {
+                       WARN_ON_ONCE(epc->ctx != ctx);
+                       atomic_inc(&epc->refcount);
+               }
+               return epc;
+       }
+       new = kzalloc(sizeof(*epc), GFP_KERNEL);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+               task_ctx_data = alloc_task_ctx_data(pmu);
+               if (!task_ctx_data) {
+                       kfree(new);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+       __perf_init_event_pmu_context(new, pmu);
+       /*
+        * XXX
+        *
+        * lockdep_assert_held(&ctx->mutex);
+        *
+        * can't because perf_event_init_task() doesn't actually hold the
+        * child_ctx->mutex.
+        */
+       raw_spin_lock_irq(&ctx->lock);
+       list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               if (epc->pmu == pmu) {
+                       WARN_ON_ONCE(epc->ctx != ctx);
+                       atomic_inc(&epc->refcount);
+                       goto found_epc;
+               }
+       }
+       epc = new;
+       new = NULL;
+       list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+       epc->ctx = ctx;
+ found_epc:
+       if (task_ctx_data && !epc->task_ctx_data) {
+               epc->task_ctx_data = task_ctx_data;
+               task_ctx_data = NULL;
+               ctx->nr_task_data++;
+       }
+       raw_spin_unlock_irq(&ctx->lock);
+       free_task_ctx_data(pmu, task_ctx_data);
+       kfree(new);
+       return epc;
+ }
+ static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+ {
+       WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+ }
+ static void free_epc_rcu(struct rcu_head *head)
+ {
+       struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+       kfree(epc->task_ctx_data);
+       kfree(epc);
+ }
+ static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+ {
+       unsigned long flags;
+       if (!atomic_dec_and_test(&epc->refcount))
+               return;
+       if (epc->ctx) {
+               struct perf_event_context *ctx = epc->ctx;
+               /*
+                * XXX
+                *
+                * lockdep_assert_held(&ctx->mutex);
+                *
+                * can't because of the call-site in _free_event()/put_event()
+                * which isn't always called under ctx->mutex.
+                */
+               WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+               raw_spin_lock_irqsave(&ctx->lock, flags);
+               list_del_init(&epc->pmu_ctx_entry);
+               epc->ctx = NULL;
+               raw_spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+       WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+       WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+       if (epc->embedded)
+               return;
+       call_rcu(&epc->rcu_head, free_epc_rcu);
+ }
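
A userspace analogue of the get_pmu_ctx()/put_pmu_ctx() lifetime rules above (a sketch, not kernel code): the last put detaches the object from its context, and only dynamically allocated instances are freed, since "embedded" ones live inside the per-CPU perf_cpu_pmu_context and must outlive the refcount. The kernel additionally defers the free through call_rcu(), which a plain free() cannot model.

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct epc {
            atomic_int refcount;
            int embedded;                   /* 1: storage owned by someone else */
    };

    static struct epc *epc_alloc(void)
    {
            struct epc *epc = calloc(1, sizeof(*epc));

            atomic_store(&epc->refcount, 1);
            return epc;
    }

    static void epc_get(struct epc *epc)
    {
            atomic_fetch_add(&epc->refcount, 1);
    }

    static void epc_put(struct epc *epc)
    {
            if (atomic_fetch_sub(&epc->refcount, 1) != 1)
                    return;                 /* not the last reference */
            printf("last put: unlink from ctx\n");
            if (epc->embedded)
                    return;                 /* never freed, only detached */
            free(epc);                      /* kernel: call_rcu(..., free_epc_rcu) */
    }

    int main(void)
    {
            struct epc *epc = epc_alloc();

            epc_get(epc);                   /* a second event shares this epc */
            epc_put(epc);                   /* still referenced */
            epc_put(epc);                   /* last put: unlink and free */
            return 0;
    }
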
  static void perf_event_free_filter(struct perf_event *event);
  
  static void free_event_rcu(struct rcu_head *head)
  {
-       struct perf_event *event;
+       struct perf_event *event = container_of(head, typeof(*event), rcu_head);
  
-       event = container_of(head, struct perf_event, rcu_head);
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
@@@ -4893,7 -5058,7 +5068,7 @@@ static void perf_sched_delayed(struct w
   *
   *  1) cpu-wide events in the presence of per-task events,
   *  2) per-task events in the presence of cpu-wide events,
-  *  3) two matching events on the same context.
+  *  3) two matching events on the same perf_event_context.
   *
   * The former two cases are handled in the allocation path (perf_event_alloc(),
   * _free_event()), the latter -- before the first perf_install_in_context().
@@@ -5017,6 -5182,9 +5192,9 @@@ static void _free_event(struct perf_eve
        if (event->hw.target)
                put_task_struct(event->hw.target);
  
+       if (event->pmu_ctx)
+               put_pmu_ctx(event->pmu_ctx);
        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
@@@ -5117,8 -5285,8 +5295,8 @@@ int perf_event_release_kernel(struct pe
        LIST_HEAD(free_list);
  
        /*
-        * If we got here through err_file: fput(event_file); we will not have
-        * attached to a context yet.
+        * If we got here through err_alloc: free_event(event); we will not
+        * have attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
  
        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);
 -      perf_remove_from_context(event, DETACH_GROUP);
  
 -      raw_spin_lock_irq(&ctx->lock);
        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
 -      event->state = PERF_EVENT_STATE_DEAD;
 -      raw_spin_unlock_irq(&ctx->lock);
 +      perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
  
        perf_event_ctx_unlock(event, ctx);
  
@@@ -5550,7 -5721,7 +5728,7 @@@ static void __perf_event_period(struct 
  
        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
-               perf_pmu_disable(ctx->pmu);
+               perf_pmu_disable(event->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
  
        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
-               perf_pmu_enable(ctx->pmu);
+               perf_pmu_enable(event->pmu);
        }
  }
  
@@@ -6584,8 -6755,6 +6762,8 @@@ static void perf_pending_task(struct ca
        if (rctx >= 0)
                perf_swevent_put_recursion_context(rctx);
        preempt_enable_notrace();
 +
 +      put_event(event);
  }
  
  #ifdef CONFIG_GUEST_PERF_EVENTS
@@@ -7729,7 -7898,6 +7907,6 @@@ perf_iterate_sb(perf_iterate_f output, 
               struct perf_event_context *task_ctx)
  {
        struct perf_event_context *ctx;
-       int ctxn;
  
        rcu_read_lock();
        preempt_disable();
  
        perf_iterate_sb_cpu(output, data);
  
-       for_each_task_context_nr(ctxn) {
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx)
-                       perf_iterate_ctx(ctx, output, data, false);
-       }
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_iterate_ctx(ctx, output, data, false);
  done:
        preempt_enable();
        rcu_read_unlock();
@@@ -7792,20 -7958,17 +7967,17 @@@ static void perf_event_addr_filters_exe
  void perf_event_exec(void)
  {
        struct perf_event_context *ctx;
-       int ctxn;
  
-       for_each_task_context_nr(ctxn) {
-               perf_event_enable_on_exec(ctxn);
-               perf_event_remove_on_exec(ctxn);
+       ctx = perf_pin_task_context(current);
+       if (!ctx)
+               return;
  
-               rcu_read_lock();
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx) {
-                       perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
-                                        NULL, true);
-               }
-               rcu_read_unlock();
-       }
+       perf_event_enable_on_exec(ctx);
+       perf_event_remove_on_exec(ctx);
+       perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
  }
  
  struct remote_output {
@@@ -7845,8 -8008,7 +8017,7 @@@ static void __perf_event_output_stop(st
  static int __perf_pmu_output_stop(void *info)
  {
        struct perf_event *event = info;
-       struct pmu *pmu = event->ctx->pmu;
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct remote_output ro = {
                .rb     = event->rb,
        };
@@@ -8635,7 -8797,6 +8806,6 @@@ static void __perf_addr_filters_adjust(
  static void perf_addr_filters_adjust(struct vm_area_struct *vma)
  {
        struct perf_event_context *ctx;
-       int ctxn;
  
        /*
         * Data tracing isn't supported yet and as such there is no need
                return;
  
        rcu_read_lock();
-       for_each_task_context_nr(ctxn) {
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (!ctx)
-                       continue;
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
-       }
        rcu_read_unlock();
  }
  
@@@ -9039,7 -9196,7 +9205,7 @@@ static void perf_event_bpf_emit_ksymbol
                                PERF_RECORD_KSYMBOL_TYPE_BPF,
                                (u64)(unsigned long)subprog->bpf_func,
                                subprog->jited_len, unregister,
 -                              prog->aux->ksym.name);
 +                              subprog->aux->ksym.name);
                }
        }
  }
@@@ -9282,19 -9439,6 +9448,19 @@@ int perf_event_account_interrupt(struc
        return __perf_event_account_interrupt(event, 1);
  }
  
 +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
 +{
 +      /*
 +       * Due to interrupt latency (AKA "skid"), we may enter the
 +       * kernel before taking an overflow, even if the PMU is only
 +       * counting user events.
 +       */
 +      if (event->attr.exclude_kernel && !user_mode(regs))
 +              return false;
 +
 +      return true;
 +}
 +
  /*
   * Generic event overflow handling, sampling.
   */
@@@ -9329,38 -9473,15 +9495,38 @@@ static int __perf_event_overflow(struc
  
        if (event->attr.sigtrap) {
                /*
 -               * Should not be able to return to user space without processing
 -               * pending_sigtrap (kernel events can overflow multiple times).
 +               * The desired behaviour of sigtrap vs invalid samples is a bit
 +               * tricky; on the one hand, one should not lose the SIGTRAP if
 +               * it is the first event, on the other hand, we should also not
 +               * trigger the WARN or override the data address.
                 */
 -              WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
 +              bool valid_sample = sample_is_allowed(event, regs);
 +              unsigned int pending_id = 1;
 +
 +              if (regs)
 +                      pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
                if (!event->pending_sigtrap) {
 -                      event->pending_sigtrap = 1;
 +                      event->pending_sigtrap = pending_id;
                        local_inc(&event->ctx->nr_pending);
 +              } else if (event->attr.exclude_kernel && valid_sample) {
 +                      /*
 +                       * Should not be able to return to user space without
 +                       * consuming pending_sigtrap; with exceptions:
 +                       *
 +                       *  1. Where !exclude_kernel, events can overflow again
 +                       *     in the kernel without returning to user space.
 +                       *
 +                       *  2. Events that can overflow again before the IRQ-
 +                       *     work without user space progress (e.g. hrtimer).
 +                       *     To approximate progress (with false negatives),
 +                       *     check 32-bit hash of the current IP.
 +                       */
 +                      WARN_ON_ONCE(event->pending_sigtrap != pending_id);
                }
 -              event->pending_addr = data->addr;
 +
 +              event->pending_addr = 0;
 +              if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
 +                      event->pending_addr = data->addr;
                irq_work_queue(&event->pending_irq);
        }
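
The sigtrap handling above is what guarantees a task neither loses nor double-receives its SIGTRAP when the overflow skids into the kernel: pending_sigtrap now stores a hash of the interrupted IP as a cheap forward-progress check, and the data address is only reported for valid samples. Userspace sees none of this machinery; it just gets a SIGTRAP with si_code TRAP_PERF per overflow. A hedged sketch of such a consumer — the event choice, period and spin loop are arbitrary, and the attr.sigtrap/remove_on_exec bits need uapi headers from v5.13 or later:

/* gcc -o sigtrap_demo sigtrap_demo.c */
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef TRAP_PERF
#define TRAP_PERF 6     /* fallback for older libc headers */
#endif

static volatile sig_atomic_t hits;

static void on_sigtrap(int sig, siginfo_t *info, void *uc)
{
        (void)sig; (void)uc;
        if (info->si_code == TRAP_PERF)
                hits++;
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = on_sigtrap, .sa_flags = SA_SIGINFO };
        struct perf_event_attr attr;
        volatile unsigned long spin = 0;
        int fd;

        sigaction(SIGTRAP, &sa, NULL);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.sample_period = 10 * 1000 * 1000;  /* arbitrary */
        attr.exclude_kernel = 1;                /* the "skid" case handled above */
        attr.sigtrap = 1;
        attr.remove_on_exec = 1;                /* required together with sigtrap */

        fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        while (spin < 200 * 1000 * 1000)
                spin++;

        printf("received %d SIGTRAPs\n", (int)hits);
        close(fd);
        return 0;
}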
  
@@@ -9826,6 -9947,44 +9992,44 @@@ static struct pmu perf_swevent = 
  
  #ifdef CONFIG_EVENT_TRACING
  
+ static void tp_perf_event_destroy(struct perf_event *event)
+ {
+       perf_trace_destroy(event);
+ }
+
+ static int perf_tp_event_init(struct perf_event *event)
+ {
+       int err;
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -ENOENT;
+       /*
+        * no branch sampling for tracepoint events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+       err = perf_trace_init(event);
+       if (err)
+               return err;
+       event->destroy = tp_perf_event_destroy;
+       return 0;
+ }
+
+ static struct pmu perf_tracepoint = {
+       .task_ctx_nr    = perf_sw_context,
+       .event_init     = perf_tp_event_init,
+       .add            = perf_trace_add,
+       .del            = perf_trace_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+ };
+
  static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
  {
@@@ -9875,6 -10034,44 +10079,44 @@@ void perf_trace_run_bpf_submit(void *ra
  }
  EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
  
+ static void __perf_tp_event_target_task(u64 count, void *record,
+                                       struct pt_regs *regs,
+                                       struct perf_sample_data *data,
+                                       struct perf_event *event)
+ {
+       struct trace_entry *entry = record;
+       if (event->attr.config != entry->type)
+               return;
+       /* Cannot deliver synchronous signal to other task. */
+       if (event->attr.sigtrap)
+               return;
+       if (perf_tp_event_match(event, data, regs))
+               perf_swevent_event(event, count, data, regs);
+ }
+
+ static void perf_tp_event_target_task(u64 count, void *record,
+                                     struct pt_regs *regs,
+                                     struct perf_sample_data *data,
+                                     struct perf_event_context *ctx)
+ {
+       unsigned int cpu = smp_processor_id();
+       struct pmu *pmu = &perf_tracepoint;
+       struct perf_event *event, *sibling;
+       perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+               __perf_tp_event_target_task(count, record, regs, data, event);
+               for_each_sibling_event(sibling, event)
+                       __perf_tp_event_target_task(count, record, regs, data, sibling);
+       }
+       perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+               __perf_tp_event_target_task(count, record, regs, data, event);
+               for_each_sibling_event(sibling, event)
+                       __perf_tp_event_target_task(count, record, regs, data, sibling);
+       }
+ }
+
  void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
  
        perf_sample_data_init(&data, 0, 0);
        data.raw = &raw;
 +      data.sample_flags |= PERF_SAMPLE_RAW;
  
        perf_trace_buf_update(record, event_type);
  
         */
        if (task && task != current) {
                struct perf_event_context *ctx;
-               struct trace_entry *entry = record;
  
                rcu_read_lock();
-               ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+               ctx = rcu_dereference(task->perf_event_ctxp);
                if (!ctx)
                        goto unlock;
  
-               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-                       if (event->cpu != smp_processor_id())
-                               continue;
-                       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-                               continue;
-                       if (event->attr.config != entry->type)
-                               continue;
-                       /* Cannot deliver synchronous signal to other task. */
-                       if (event->attr.sigtrap)
-                               continue;
-                       if (perf_tp_event_match(event, &data, regs))
-                               perf_swevent_event(event, count, &data, regs);
-               }
+               raw_spin_lock(&ctx->lock);
+               perf_tp_event_target_task(count, record, regs, &data, ctx);
+               raw_spin_unlock(&ctx->lock);
  unlock:
                rcu_read_unlock();
        }
  }
  EXPORT_SYMBOL_GPL(perf_tp_event);
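
Besides moving the perf_tracepoint pmu definition up, the task != current path of perf_tp_event() above now takes ctx->lock and walks only the tracepoint events for this CPU via the group trees, instead of filtering the whole event_list. The userspace side of that path is simply a PERF_TYPE_TRACEPOINT event opened on another pid; a sketch, assuming the usual tracefs mount point and using sched:sched_switch purely as an example tracepoint:

/* gcc -o tp_count tp_count.c ; run as root: ./tp_count <pid> */
#include <linux/perf_event.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static long long tracepoint_id(const char *path)
{
        long long id = -1;
        FILE *f = fopen(path, "r");

        if (f) {
                fscanf(f, "%lld", &id);
                fclose(f);
        }
        return id;
}

int main(int argc, char **argv)
{
        /* tracefs may also be mounted under /sys/kernel/debug/tracing */
        const char *id_file = "/sys/kernel/tracing/events/sched/sched_switch/id";
        struct perf_event_attr attr;
        long long count, id = tracepoint_id(id_file);
        pid_t pid = argc > 1 ? atoi(argv[1]) : getpid();
        int fd;

        if (id < 0) {
                fprintf(stderr, "cannot read %s\n", id_file);
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_TRACEPOINT;
        attr.config = id;               /* tracepoint id from tracefs */

        fd = syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        sleep(1);
        read(fd, &count, sizeof(count));
        printf("pid %d: %lld sched_switch events in ~1s\n", pid, count);
        close(fd);
        return 0;
}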
  
- static void tp_perf_event_destroy(struct perf_event *event)
- {
-       perf_trace_destroy(event);
- }
- static int perf_tp_event_init(struct perf_event *event)
- {
-       int err;
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-               return -ENOENT;
-       /*
-        * no branch sampling for tracepoint events
-        */
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-       err = perf_trace_init(event);
-       if (err)
-               return err;
-       event->destroy = tp_perf_event_destroy;
-       return 0;
- }
- static struct pmu perf_tracepoint = {
-       .task_ctx_nr    = perf_sw_context,
-       .event_init     = perf_tp_event_init,
-       .add            = perf_trace_add,
-       .del            = perf_trace_del,
-       .start          = perf_swevent_start,
-       .stop           = perf_swevent_stop,
-       .read           = perf_swevent_read,
- };
  #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
  /*
   * Flags in config, used by dynamic PMU kprobe and uprobe
@@@ -11058,46 -11205,19 +11251,19 @@@ static int perf_event_idx_default(struc
        return 0;
  }
  
+ static void free_pmu_context(struct pmu *pmu)
+ {
+       free_percpu(pmu->cpu_pmu_context);
+ }
+
  /*
-  * Ensures all contexts with the same task_ctx_nr have the same
-  * pmu_cpu_context too.
+  * Let userspace know that this PMU supports address range filtering:
   */
- static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
+ static ssize_t nr_addr_filters_show(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *page)
  {
-       struct pmu *pmu;
-       if (ctxn < 0)
-               return NULL;
-       list_for_each_entry(pmu, &pmus, entry) {
-               if (pmu->task_ctx_nr == ctxn)
-                       return pmu->pmu_cpu_context;
-       }
-       return NULL;
- }
- static void free_pmu_context(struct pmu *pmu)
- {
-       /*
-        * Static contexts such as perf_sw_context have a global lifetime
-        * and may be shared between different PMUs. Avoid freeing them
-        * when a single PMU is going away.
-        */
-       if (pmu->task_ctx_nr > perf_invalid_context)
-               return;
-       free_percpu(pmu->pmu_cpu_context);
- }
- /*
-  * Let userspace know that this PMU supports address range filtering:
-  */
- static ssize_t nr_addr_filters_show(struct device *dev,
-                                   struct device_attribute *attr,
-                                   char *page)
- {
-       struct pmu *pmu = dev_get_drvdata(dev);
+       struct pmu *pmu = dev_get_drvdata(dev);
  
        return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
  }
@@@ -11151,12 -11271,11 +11317,11 @@@ perf_event_mux_interval_ms_store(struc
        /* update all cpuctx for this PMU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+               struct perf_cpu_pmu_context *cpc;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+               cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  
-               cpu_function_call(cpu,
-                       (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+               cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);
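
perf_event_mux_interval_ms_store() above now updates the per-CPU perf_cpu_pmu_context and re-arms each hrtimer through the typed perf_mux_hrtimer_restart_ipi() helper instead of casting the function pointer for cpu_function_call(). The sysfs knob itself is unchanged; a sketch that doubles the multiplexing interval of the core PMU — the PMU name "cpu" is the common x86 one and may be cpu_core/cpu_atom or an arch-specific name elsewhere:

/* gcc -o mux_interval mux_interval.c ; writing needs root */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        const char *path =
                "/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
        char buf[32];
        ssize_t n;
        int ms, fd = open(path, O_RDWR);

        if (fd < 0) {
                perror(path);
                return 1;
        }

        n = read(fd, buf, sizeof(buf) - 1);
        if (n <= 0) {
                perror("read");
                return 1;
        }
        buf[n] = '\0';
        ms = atoi(buf);
        printf("current mux interval: %d ms\n", ms);

        /* Double it; the store path re-arms the hrtimer on every online CPU. */
        n = snprintf(buf, sizeof(buf), "%d", ms * 2);
        lseek(fd, 0, SEEK_SET);
        if (write(fd, buf, n) != n)
                perror("write");

        close(fd);
        return 0;
}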
@@@ -11193,13 -11312,15 +11358,15 @@@ static int pmu_dev_alloc(struct pmu *pm
  
        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);
-       ret = dev_set_name(pmu->dev, "%s", pmu->name);
-       if (ret)
-               goto free_dev;
  
        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->release = pmu_dev_release;
+       ret = dev_set_name(pmu->dev, "%s", pmu->name);
+       if (ret)
+               goto free_dev;
        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;
@@@ -11267,47 -11388,19 +11434,19 @@@ int perf_pmu_register(struct pmu *pmu, 
        }
  
  skip_type:
-       if (pmu->task_ctx_nr == perf_hw_context) {
-               static int hw_context_taken = 0;
-               /*
-                * Other than systems with heterogeneous CPUs, it never makes
-                * sense for two PMUs to share perf_hw_context. PMUs which are
-                * uncore must use perf_invalid_context.
-                */
-               if (WARN_ON_ONCE(hw_context_taken &&
-                   !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
-                       pmu->task_ctx_nr = perf_invalid_context;
-               hw_context_taken = 1;
-       }
-       pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
-       if (pmu->pmu_cpu_context)
-               goto got_cpu_context;
        ret = -ENOMEM;
-       pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
-       if (!pmu->pmu_cpu_context)
+       pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+       if (!pmu->cpu_pmu_context)
                goto free_dev;
  
        for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               __perf_event_init_context(&cpuctx->ctx);
-               lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
-               lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
-               cpuctx->ctx.pmu = pmu;
-               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+               struct perf_cpu_pmu_context *cpc;
  
-               __perf_mux_hrtimer_init(cpuctx, cpu);
-               cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
-               cpuctx->heap = cpuctx->heap_default;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+               __perf_init_event_pmu_context(&cpc->epc, pmu);
+               __perf_mux_hrtimer_init(cpc, cpu);
        }
  
- got_cpu_context:
        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
@@@ -11786,10 -11879,11 +11925,11 @@@ perf_event_alloc(struct perf_event_att
        }
  
        /*
-        * Disallow uncore-cgroup events, they don't make sense as the cgroup will
-        * be different on other CPUs in the uncore mask.
+        * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+        * events (they don't make sense as the cgroup will be different
+        * on other CPUs in the uncore mask).
         */
-       if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+       if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
                err = -EINVAL;
                goto err_pmu;
        }
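
The added task check means an uncore-style PMU (task_ctx_nr == perf_invalid_context) now rejects per-task events up front with -EINVAL, alongside the existing cgroup restriction. A sketch of the visible effect; the "uncore" name prefix is an Intel convention, and config 0 is only a placeholder that a given driver may itself refuse, so treat the exact errno of the first open as informational:

/* gcc -o uncore_task uncore_task.c ; needs root and an uncore PMU */
#include <dirent.h>
#include <errno.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int find_uncore_type(void)
{
        const char *base = "/sys/bus/event_source/devices";
        char path[512];
        struct dirent *de;
        DIR *dir = opendir(base);
        int type = -1;

        while (dir && (de = readdir(dir)) != NULL) {
                FILE *f;

                if (strncmp(de->d_name, "uncore", 6))
                        continue;
                snprintf(path, sizeof(path), "%s/%s/type", base, de->d_name);
                f = fopen(path, "r");
                if (f && fscanf(f, "%d", &type) == 1) {
                        printf("using PMU %s (type %d)\n", de->d_name, type);
                        fclose(f);
                        break;
                }
                if (f)
                        fclose(f);
        }
        if (dir)
                closedir(dir);
        return type;
}

int main(void)
{
        struct perf_event_attr attr;
        int type = find_uncore_type();
        int fd;

        if (type < 0) {
                fprintf(stderr, "no uncore PMU found\n");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0;        /* placeholder event code, see the PMU's events/ dir */

        /* CPU-wide open: the supported way to use an uncore PMU. */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        printf("cpu-wide open:   %s\n", fd >= 0 ? "ok" : strerror(errno));
        if (fd >= 0)
                close(fd);

        /* Task-bound open: now rejected in perf_event_alloc(). */
        fd = syscall(__NR_perf_event_open, &attr, getpid(), 0, -1, 0);
        printf("task-bound open: %s\n", fd >= 0 ? "unexpectedly ok" : strerror(errno));
        if (fd >= 0)
                close(fd);
        return 0;
}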
@@@ -12136,37 -12230,6 +12276,6 @@@ static int perf_event_set_clock(struct 
        return 0;
  }
  
- /*
-  * Variation on perf_event_ctx_lock_nested(), except we take two context
-  * mutexes.
-  */
- static struct perf_event_context *
- __perf_event_ctx_lock_double(struct perf_event *group_leader,
-                            struct perf_event_context *ctx)
- {
-       struct perf_event_context *gctx;
- again:
-       rcu_read_lock();
-       gctx = READ_ONCE(group_leader->ctx);
-       if (!refcount_inc_not_zero(&gctx->refcount)) {
-               rcu_read_unlock();
-               goto again;
-       }
-       rcu_read_unlock();
-       mutex_lock_double(&gctx->mutex, &ctx->mutex);
-       if (group_leader->ctx != gctx) {
-               mutex_unlock(&ctx->mutex);
-               mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
-               goto again;
-       }
-       return gctx;
- }
  static bool
  perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
  {
@@@ -12212,9 -12275,10 +12321,10 @@@ SYSCALL_DEFINE5(perf_event_open
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
  {
        struct perf_event *group_leader = NULL, *output_event = NULL;
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
-       struct perf_event_context *ctx, *gctx;
+       struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;
  
-       if (group_leader) {
-               if (is_software_event(event) &&
-                   !in_software_context(group_leader)) {
-                       /*
-                        * If the event is a sw event, but the group_leader
-                        * is on hw context.
-                        *
-                        * Allow the addition of software events to hw
-                        * groups, this is safe because software events
-                        * never fail to schedule.
-                        */
-                       pmu = group_leader->ctx->pmu;
-               } else if (!is_software_event(event) &&
-                          is_software_event(group_leader) &&
-                          (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
-                       /*
-                        * In case the group is a pure software group, and we
-                        * try to add a hardware event, move the whole group to
-                        * the hardware context.
-                        */
-                       move_group = 1;
-               }
+       if (task) {
+               err = down_read_interruptible(&task->signal->exec_update_lock);
+               if (err)
+                       goto err_alloc;
+               /*
+                * We must hold exec_update_lock across this and any potential
+                * perf_install_in_context() call for this new event to
+                * serialize against exec() altering our credentials (and the
+                * perf_event_exit_task() that could imply).
+                */
+               err = -EACCES;
+               if (!perf_check_permission(&attr, task))
+                       goto err_cred;
        }
  
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(pmu, task, event);
+       ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_alloc;
+               goto err_cred;
+       }
+       mutex_lock(&ctx->mutex);
+       if (ctx->task == TASK_TOMBSTONE) {
+               err = -ESRCH;
+               goto err_locked;
+       }
+       if (!task) {
+               /*
+                * Check if the @cpu we're creating an event for is online.
+                *
+                * We use the perf_cpu_context::ctx::mutex to serialize against
+                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+                */
+               struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+               if (!cpuctx->online) {
+                       err = -ENODEV;
+                       goto err_locked;
+               }
        }
  
-       /*
-        * Look up the group leader (we will attach this event to it):
-        */
        if (group_leader) {
                err = -EINVAL;
  
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
-                       goto err_context;
+                       goto err_locked;
  
                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
-                       goto err_context;
+                       goto err_locked;
  
                /*
                 * Make sure we're both events for the same CPU;
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
-                       goto err_context;
-               /*
-                * Make sure we're both on the same task, or both
-                * per-CPU events.
-                */
-               if (group_leader->ctx->task != ctx->task)
-                       goto err_context;
+                       goto err_locked;
  
                /*
-                * Do not allow to attach to a group in a different task
-                * or CPU context. If we're moving SW events, we'll fix
-                * this up later, so allow that.
-                *
-                * Racy, not holding group_leader->ctx->mutex, see comment with
-                * perf_event_ctx_lock().
+                * Make sure we're both on the same context; either task or cpu.
                 */
-               if (!move_group && group_leader->ctx != ctx)
-                       goto err_context;
+               if (group_leader->ctx != ctx)
+                       goto err_locked;
  
                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
-                       goto err_context;
-       }
-       if (output_event) {
-               err = perf_event_set_output(event, output_event);
-               if (err)
-                       goto err_context;
-       }
-       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
-                                       f_flags);
-       if (IS_ERR(event_file)) {
-               err = PTR_ERR(event_file);
-               event_file = NULL;
-               goto err_context;
-       }
-       if (task) {
-               err = down_read_interruptible(&task->signal->exec_update_lock);
-               if (err)
-                       goto err_file;
-               /*
-                * We must hold exec_update_lock across this and any potential
-                * perf_install_in_context() call for this new event to
-                * serialize against exec() altering our credentials (and the
-                * perf_event_exit_task() that could imply).
-                */
-               err = -EACCES;
-               if (!perf_check_permission(&attr, task))
-                       goto err_cred;
-       }
-       if (move_group) {
-               gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-               if (gctx->task == TASK_TOMBSTONE) {
-                       err = -ESRCH;
                        goto err_locked;
-               }
  
-               /*
-                * Check if we raced against another sys_perf_event_open() call
-                * moving the software group underneath us.
-                */
-               if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+               if (is_software_event(event) &&
+                   !in_software_context(group_leader)) {
                        /*
-                        * If someone moved the group out from under us, check
-                        * if this new event wound up on the same ctx, if so
-                        * its the regular !move_group case, otherwise fail.
+                        * If the event is a sw event, but the group_leader
+                        * is on hw context.
+                        *
+                        * Allow the addition of software events to hw
+                        * groups, this is safe because software events
+                        * never fail to schedule.
+                        *
+                        * Note the comment that goes with struct
+                        * perf_event_pmu_context.
                         */
-                       if (gctx != ctx) {
-                               err = -EINVAL;
-                               goto err_locked;
-                       } else {
-                               perf_event_ctx_unlock(group_leader, gctx);
-                               move_group = 0;
-                               goto not_move_group;
+                       pmu = group_leader->pmu_ctx->pmu;
+               } else if (!is_software_event(event)) {
+                       if (is_software_event(group_leader) &&
+                           (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+                               /*
+                                * In case the group is a pure software group, and we
+                                * try to add a hardware event, move the whole group to
+                                * the hardware context.
+                                */
+                               move_group = 1;
                        }
-               }
-               /*
-                * Failure to create exclusive events returns -EBUSY.
-                */
-               err = -EBUSY;
-               if (!exclusive_event_installable(group_leader, ctx))
-                       goto err_locked;
  
-               for_each_sibling_event(sibling, group_leader) {
-                       if (!exclusive_event_installable(sibling, ctx))
+                       /* Don't allow a group of multiple hw events from different pmus */
+                       if (!in_software_context(group_leader) &&
+                           group_leader->pmu_ctx->pmu != pmu)
                                goto err_locked;
                }
-       } else {
-               mutex_lock(&ctx->mutex);
-               /*
-                * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
-                * see the group_leader && !move_group test earlier.
-                */
-               if (group_leader && group_leader->ctx != ctx) {
-                       err = -EINVAL;
-                       goto err_locked;
-               }
        }
- not_move_group:
  
-       if (ctx->task == TASK_TOMBSTONE) {
-               err = -ESRCH;
+       /*
+        * Now that we're certain of the pmu; find the pmu_ctx.
+        */
+       pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+       if (IS_ERR(pmu_ctx)) {
+               err = PTR_ERR(pmu_ctx);
                goto err_locked;
        }
+       event->pmu_ctx = pmu_ctx;
  
-       if (!perf_event_validate_size(event)) {
-               err = -E2BIG;
-               goto err_locked;
+       if (output_event) {
+               err = perf_event_set_output(event, output_event);
+               if (err)
+                       goto err_context;
        }
  
-       if (!task) {
-               /*
-                * Check if the @cpu we're creating an event for is online.
-                *
-                * We use the perf_cpu_context::ctx::mutex to serialize against
-                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
-                */
-               struct perf_cpu_context *cpuctx =
-                       container_of(ctx, struct perf_cpu_context, ctx);
-               if (!cpuctx->online) {
-                       err = -ENODEV;
-                       goto err_locked;
-               }
+       if (!perf_event_validate_size(event)) {
+               err = -E2BIG;
+               goto err_context;
        }
  
        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
-               goto err_locked;
+               goto err_context;
        }
  
        /*
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
-               goto err_locked;
+               goto err_context;
        }
  
        WARN_ON_ONCE(ctx->parent_ctx);
  
+       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+       if (IS_ERR(event_file)) {
+               err = PTR_ERR(event_file);
+               event_file = NULL;
+               goto err_context;
+       }
        /*
         * This is the point on no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */
  
        if (move_group) {
-               /*
-                * See perf_event_ctx_lock() for comments on the details
-                * of swizzling perf_event::ctx.
-                */
                perf_remove_from_context(group_leader, 0);
-               put_ctx(gctx);
+               put_pmu_ctx(group_leader->pmu_ctx);
  
                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
-                       put_ctx(gctx);
+                       put_pmu_ctx(sibling->pmu_ctx);
                }
  
-               /*
-                * Wait for everybody to stop referencing the events through
-                * the old lists, before installing it on new lists.
-                */
-               synchronize_rcu();
                /*
                 * Install the group siblings before the group leader.
                 *
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
+                       sibling->pmu_ctx = pmu_ctx;
+                       get_pmu_ctx(pmu_ctx);
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
-                       get_ctx(ctx);
                }
  
                /*
                 * event. What we want here is event in the initial
                 * startup state, ready to be add into new context.
                 */
+               group_leader->pmu_ctx = pmu_ctx;
+               get_pmu_ctx(pmu_ctx);
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
-               get_ctx(ctx);
        }
  
        /*
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
  
-       if (move_group)
-               perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
  
        if (task) {
        fd_install(event_fd, event_file);
        return event_fd;
  
+ err_context:
+       /* event->pmu_ctx freed by free_event() */
  err_locked:
-       if (move_group)
-               perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
  err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
- err_file:
-       fput(event_file);
- err_context:
-       perf_unpin_context(ctx);
-       put_ctx(ctx);
  err_alloc:
-       /*
-        * If event_file is set, the fput() above will have called ->release()
-        * and that will take care of freeing the event.
-        */
-       if (!event_file)
-               free_event(event);
+       free_event(event);
  err_task:
        if (task)
                put_task_struct(task);
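
With the single-context design the group checks above all run under one ctx->mutex, the __perf_event_ctx_lock_double() dance is gone, and hardware events from different PMUs can no longer be grouped, while software events may still join a hardware group (they never fail to schedule). A sketch of the still-supported combination — one hardware leader plus one software sibling, read atomically with PERF_FORMAT_GROUP; the event choices and spin loop are arbitrary:

/* gcc -o hw_sw_group hw_sw_group.c */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_event(struct perf_event_attr *attr, int group_fd)
{
        return syscall(__NR_perf_event_open, attr, 0 /* self */, -1, group_fd, 0);
}

int main(void)
{
        struct perf_event_attr hw, sw;
        struct { uint64_t nr, vals[2]; } out;   /* PERF_FORMAT_GROUP layout */
        volatile unsigned long spin;
        int leader, member;

        memset(&hw, 0, sizeof(hw));
        hw.size = sizeof(hw);
        hw.type = PERF_TYPE_HARDWARE;
        hw.config = PERF_COUNT_HW_CPU_CYCLES;
        hw.disabled = 1;
        hw.read_format = PERF_FORMAT_GROUP;

        memset(&sw, 0, sizeof(sw));
        sw.size = sizeof(sw);
        sw.type = PERF_TYPE_SOFTWARE;           /* sw event in a hw group: allowed */
        sw.config = PERF_COUNT_SW_PAGE_FAULTS;
        sw.disabled = 1;

        leader = open_event(&hw, -1);
        member = open_event(&sw, leader);       /* a second, different hw PMU here would get -EINVAL */
        if (leader < 0 || member < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
        for (spin = 0; spin < 50 * 1000 * 1000; spin++)
                ;
        ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

        read(leader, &out, sizeof(out));
        printf("group of %llu: cycles=%llu page-faults=%llu\n",
               (unsigned long long)out.nr,
               (unsigned long long)out.vals[0], (unsigned long long)out.vals[1]);

        close(member);
        close(leader);
        return 0;
}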
@@@ -12683,8 -12678,10 +12724,10 @@@ perf_event_create_kernel_counter(struc
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
  {
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event_context *ctx;
        struct perf_event *event;
+       struct pmu *pmu;
        int err;
  
        /*
  
        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
+       pmu = event->pmu;
+       if (pmu->task_ctx_nr == perf_sw_context)
+               event->event_caps |= PERF_EV_CAP_SOFTWARE;
  
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(event->pmu, task, event);
+       ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_free;
+               goto err_alloc;
        }
  
        WARN_ON_ONCE(ctx->parent_ctx);
                goto err_unlock;
        }
  
+       pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+       if (IS_ERR(pmu_ctx)) {
+               err = PTR_ERR(pmu_ctx);
+               goto err_unlock;
+       }
+       event->pmu_ctx = pmu_ctx;
        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
-                       goto err_unlock;
+                       goto err_pmu_ctx;
                }
        }
  
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
-               goto err_unlock;
+               goto err_pmu_ctx;
        }
  
        perf_install_in_context(ctx, event, event->cpu);
  
        return event;
  
+ err_pmu_ctx:
+       put_pmu_ctx(pmu_ctx);
  err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
- err_free:
+ err_alloc:
        free_event(event);
  err:
        return ERR_PTR(err);
  }
  EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
  
- void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+ static void __perf_pmu_remove(struct perf_event_context *ctx,
+                             int cpu, struct pmu *pmu,
+                             struct perf_event_groups *groups,
+                             struct list_head *events)
  {
-       struct perf_event_context *src_ctx;
-       struct perf_event_context *dst_ctx;
-       struct perf_event *event, *tmp;
-       LIST_HEAD(events);
-       src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
-       dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+       struct perf_event *event, *sibling;
  
-       /*
-        * See perf_event_ctx_lock() for comments on the details
-        * of swizzling perf_event::ctx.
-        */
-       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
-       list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
-                                event_entry) {
+       perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
                perf_remove_from_context(event, 0);
-               unaccount_event_cpu(event, src_cpu);
-               put_ctx(src_ctx);
-               list_add(&event->migrate_entry, &events);
+               unaccount_event_cpu(event, cpu);
+               put_pmu_ctx(event->pmu_ctx);
+               list_add(&event->migrate_entry, events);
+               for_each_sibling_event(sibling, event) {
+                       perf_remove_from_context(sibling, 0);
+                       unaccount_event_cpu(sibling, cpu);
+                       put_pmu_ctx(sibling->pmu_ctx);
+                       list_add(&sibling->migrate_entry, events);
+               }
        }
+ }
  
-       /*
-        * Wait for the events to quiesce before re-instating them.
-        */
-       synchronize_rcu();
+ static void __perf_pmu_install_event(struct pmu *pmu,
+                                    struct perf_event_context *ctx,
+                                    int cpu, struct perf_event *event)
+ {
+       struct perf_event_pmu_context *epc;
+       event->cpu = cpu;
+       epc = find_get_pmu_context(pmu, ctx, event);
+       event->pmu_ctx = epc;
+       if (event->state >= PERF_EVENT_STATE_OFF)
+               event->state = PERF_EVENT_STATE_INACTIVE;
+       account_event_cpu(event, cpu);
+       perf_install_in_context(ctx, event, cpu);
+ }
+
+ static void __perf_pmu_install(struct perf_event_context *ctx,
+                              int cpu, struct pmu *pmu, struct list_head *events)
+ {
+       struct perf_event *event, *tmp;
  
        /*
         * Re-instate events in 2 passes.
         * leader will enable its siblings, even if those are still on the old
         * context.
         */
-       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+       list_for_each_entry_safe(event, tmp, events, migrate_entry) {
                if (event->group_leader == event)
                        continue;
  
                list_del(&event->migrate_entry);
-               if (event->state >= PERF_EVENT_STATE_OFF)
-                       event->state = PERF_EVENT_STATE_INACTIVE;
-               account_event_cpu(event, dst_cpu);
-               perf_install_in_context(dst_ctx, event, dst_cpu);
-               get_ctx(dst_ctx);
+               __perf_pmu_install_event(pmu, ctx, cpu, event);
        }
  
        /*
         * Once all the siblings are setup properly, install the group leaders
         * to make it go.
         */
-       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+       list_for_each_entry_safe(event, tmp, events, migrate_entry) {
                list_del(&event->migrate_entry);
-               if (event->state >= PERF_EVENT_STATE_OFF)
-                       event->state = PERF_EVENT_STATE_INACTIVE;
-               account_event_cpu(event, dst_cpu);
-               perf_install_in_context(dst_ctx, event, dst_cpu);
-               get_ctx(dst_ctx);
+               __perf_pmu_install_event(pmu, ctx, cpu, event);
        }
+ }
+
+ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+ {
+       struct perf_event_context *src_ctx, *dst_ctx;
+       LIST_HEAD(events);
+       src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+       dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+       /*
+        * See perf_event_ctx_lock() for comments on the details
+        * of swizzling perf_event::ctx.
+        */
+       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+       __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+       __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+       /*
+        * Wait for the events to quiesce before re-instating them.
+        */
+       synchronize_rcu();
+       __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
  }
@@@ -12896,14 -12939,14 +12985,14 @@@ perf_event_exit_event(struct perf_even
        perf_event_wakeup(event);
  }
  
- static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+ static void perf_event_exit_task_context(struct task_struct *child)
  {
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;
  
        WARN_ON_ONCE(child != current);
  
-       child_ctx = perf_pin_task_context(child, ctxn);
+       child_ctx = perf_pin_task_context(child);
        if (!child_ctx)
                return;
  
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
-       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+       task_ctx_sched_out(child_ctx, EVENT_ALL);
  
        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
-       RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+       RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */
  void perf_event_exit_task(struct task_struct *child)
  {
        struct perf_event *event, *tmp;
-       int ctxn;
  
        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
        }
        mutex_unlock(&child->perf_event_mutex);
  
-       for_each_task_context_nr(ctxn)
-               perf_event_exit_task_context(child, ctxn);
+       perf_event_exit_task_context(child);
  
        /*
         * The perf_event_exit_task_context calls perf_event_task
@@@ -13026,56 -13067,51 +13113,51 @@@ void perf_event_free_task(struct task_s
  {
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;
-       int ctxn;
  
-       for_each_task_context_nr(ctxn) {
-               ctx = task->perf_event_ctxp[ctxn];
-               if (!ctx)
-                       continue;
+       ctx = rcu_access_pointer(task->perf_event_ctxp);
+       if (!ctx)
+               return;
  
-               mutex_lock(&ctx->mutex);
-               raw_spin_lock_irq(&ctx->lock);
-               /*
-                * Destroy the task <-> ctx relation and mark the context dead.
-                *
-                * This is important because even though the task hasn't been
-                * exposed yet the context has been (through child_list).
-                */
-               RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
-               WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
-               put_task_struct(task); /* cannot be last */
-               raw_spin_unlock_irq(&ctx->lock);
+       mutex_lock(&ctx->mutex);
+       raw_spin_lock_irq(&ctx->lock);
+       /*
+        * Destroy the task <-> ctx relation and mark the context dead.
+        *
+        * This is important because even though the task hasn't been
+        * exposed yet the context has been (through child_list).
+        */
+       RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+       WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+       put_task_struct(task); /* cannot be last */
+       raw_spin_unlock_irq(&ctx->lock);
  
-               list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
-                       perf_free_event(event, ctx);
  
-               mutex_unlock(&ctx->mutex);
+       list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+               perf_free_event(event, ctx);
  
-               /*
-                * perf_event_release_kernel() could've stolen some of our
-                * child events and still have them on its free_list. In that
-                * case we must wait for these events to have been freed (in
-                * particular all their references to this task must've been
-                * dropped).
-                *
-                * Without this copy_process() will unconditionally free this
-                * task (irrespective of its reference count) and
-                * _free_event()'s put_task_struct(event->hw.target) will be a
-                * use-after-free.
-                *
-                * Wait for all events to drop their context reference.
-                */
-               wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
-               put_ctx(ctx); /* must be last */
-       }
+       mutex_unlock(&ctx->mutex);
+       /*
+        * perf_event_release_kernel() could've stolen some of our
+        * child events and still have them on its free_list. In that
+        * case we must wait for these events to have been freed (in
+        * particular all their references to this task must've been
+        * dropped).
+        *
+        * Without this copy_process() will unconditionally free this
+        * task (irrespective of its reference count) and
+        * _free_event()'s put_task_struct(event->hw.target) will be a
+        * use-after-free.
+        *
+        * Wait for all events to drop their context reference.
+        */
+       wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+       put_ctx(ctx); /* must be last */
  }
  
  void perf_event_delayed_put(struct task_struct *task)
  {
-       int ctxn;
-       for_each_task_context_nr(ctxn)
-               WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+       WARN_ON_ONCE(task->perf_event_ctxp);
  }
  
  struct file *perf_event_get(unsigned int fd)
@@@ -13125,6 -13161,7 +13207,7 @@@ inherit_event(struct perf_event *parent
              struct perf_event_context *child_ctx)
  {
        enum perf_event_state parent_state = parent_event->state;
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *child_event;
        unsigned long flags;
  
        if (IS_ERR(child_event))
                return child_event;
  
-       if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
-           !child_ctx->task_ctx_data) {
-               struct pmu *pmu = child_event->pmu;
-               child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
-               if (!child_ctx->task_ctx_data) {
-                       free_event(child_event);
-                       return ERR_PTR(-ENOMEM);
-               }
+       pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+       if (IS_ERR(pmu_ctx)) {
+               free_event(child_event);
+               return NULL;
        }
+       child_event->pmu_ctx = pmu_ctx;
  
        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@@ -13278,11 -13310,11 +13356,11 @@@ static int inherit_group(struct perf_ev
  static int
  inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
-                  struct task_struct *child, int ctxn,
+                  struct task_struct *child,
                   u64 clone_flags, int *inherited_all)
  {
-       int ret;
        struct perf_event_context *child_ctx;
+       int ret;
  
        if (!event->attr.inherit ||
            (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
                return 0;
        }
  
-       child_ctx = child->perf_event_ctxp[ctxn];
+       child_ctx = child->perf_event_ctxp;
        if (!child_ctx) {
                /*
                 * This is executed from the parent task context, so
                 * First allocate and initialize a context for the
                 * child.
                 */
-               child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+               child_ctx = alloc_perf_context(child);
                if (!child_ctx)
                        return -ENOMEM;
  
-               child->perf_event_ctxp[ctxn] = child_ctx;
+               child->perf_event_ctxp = child_ctx;
        }
  
-       ret = inherit_group(event, parent, parent_ctx,
-                           child, child_ctx);
+       ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
        if (ret)
                *inherited_all = 0;
  
  /*
   * Initialize the perf_event context in task_struct
   */
- static int perf_event_init_context(struct task_struct *child, int ctxn,
-                                  u64 clone_flags)
+ static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
  {
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        unsigned long flags;
        int ret = 0;
  
-       if (likely(!parent->perf_event_ctxp[ctxn]))
+       if (likely(!parent->perf_event_ctxp))
                return 0;
  
        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
-       parent_ctx = perf_pin_task_context(parent, ctxn);
+       parent_ctx = perf_pin_task_context(parent);
        if (!parent_ctx)
                return 0;
  
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, clone_flags,
-                                        &inherited_all);
+                                        child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }
  
        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, clone_flags,
-                                        &inherited_all);
+                                        child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;
  
-       child_ctx = child->perf_event_ctxp[ctxn];
+       child_ctx = child->perf_event_ctxp;
  
        if (child_ctx && inherited_all) {
                /*
@@@ -13422,18 -13449,16 +13495,16 @@@ out_unlock
   */
  int perf_event_init_task(struct task_struct *child, u64 clone_flags)
  {
-       int ctxn, ret;
+       int ret;
  
-       memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+       child->perf_event_ctxp = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);
  
-       for_each_task_context_nr(ctxn) {
-               ret = perf_event_init_context(child, ctxn, clone_flags);
-               if (ret) {
-                       perf_event_free_task(child);
-                       return ret;
-               }
+       ret = perf_event_init_context(child, clone_flags);
+       if (ret) {
+               perf_event_free_task(child);
+               return ret;
        }
  
        return 0;
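
perf_event_init_task() above is the fork() half of inheritance, now cloning at most the one perf_event_ctxp. The userspace-visible behaviour is unchanged: with attr.inherit set, children created after the event is opened are counted too, and their totals are folded back into the parent event as they exit — this is what perf stat relies on. A sketch; the child workload is arbitrary:

/* gcc -o inherit_demo inherit_demo.c */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        pid_t pid;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.inherit = 1;               /* clone the event into children at fork() */

        fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        pid = fork();
        if (pid == 0) {
                /* The child inherited a clone of the event; burn some CPU. */
                volatile unsigned long spin;

                for (spin = 0; spin < 100 * 1000 * 1000; spin++)
                        ;
                _exit(0);
        }
        waitpid(pid, NULL, 0);

        /* The child's task-clock was added back to the parent event on exit. */
        read(fd, &count, sizeof(count));
        printf("task-clock incl. child: %llu ns\n", (unsigned long long)count);

        close(fd);
        return 0;
}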
  static void __init perf_event_init_all_cpus(void)
  {
        struct swevent_htable *swhash;
+       struct perf_cpu_context *cpuctx;
        int cpu;
  
        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
-               INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
  
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
  
- #ifdef CONFIG_CGROUP_PERF
-               INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
- #endif
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+               cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+               __perf_event_init_context(&cpuctx->ctx);
+               lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+               lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+               cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+               cpuctx->heap = cpuctx->heap_default;
        }
  }
  
@@@ -13479,12 -13509,12 +13555,12 @@@ static void perf_swevent_init_cpu(unsig
  #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
  static void __perf_event_exit_context(void *__info)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx = __info;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *event;
  
        raw_spin_lock(&ctx->lock);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
@@@ -13494,18 -13524,16 +13570,16 @@@ static void perf_event_exit_cpu_context
  {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
  
+       // XXX simplify cpuctx->online
        mutex_lock(&pmus_lock);
-       list_for_each_entry(pmu, &pmus, entry) {
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
+       cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+       ctx = &cpuctx->ctx;
  
-               mutex_lock(&ctx->mutex);
-               smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
-               cpuctx->online = 0;
-               mutex_unlock(&ctx->mutex);
-       }
+       mutex_lock(&ctx->mutex);
+       smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+       cpuctx->online = 0;
+       mutex_unlock(&ctx->mutex);
        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
  }
@@@ -13519,20 -13547,17 +13593,17 @@@ int perf_event_init_cpu(unsigned int cp
  {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
  
        perf_swevent_init_cpu(cpu);
  
        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
-       list_for_each_entry(pmu, &pmus, entry) {
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
+       cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+       ctx = &cpuctx->ctx;
  
-               mutex_lock(&ctx->mutex);
-               cpuctx->online = 1;
-               mutex_unlock(&ctx->mutex);
-       }
+       mutex_lock(&ctx->mutex);
+       cpuctx->online = 1;
+       mutex_unlock(&ctx->mutex);
        mutex_unlock(&pmus_lock);
  
        return 0;
@@@ -13669,9 -13694,12 +13740,12 @@@ static int perf_cgroup_css_online(struc
  static int __perf_cgroup_move(void *info)
  {
        struct task_struct *task = info;
-       rcu_read_lock();
-       perf_cgroup_switch(task);
-       rcu_read_unlock();
+       preempt_disable();
+       if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+               perf_cgroup_switch(task);
+       preempt_enable();
        return 0;
  }
  