Merge tag 'perf-core-2022-12-12' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <[email protected]>
Mon, 12 Dec 2022 23:19:38 +0000 (15:19 -0800)
committer Linus Torvalds <[email protected]>
Mon, 12 Dec 2022 23:19:38 +0000 (15:19 -0800)
Pull perf events updates from Ingo Molnar:

 - Thoroughly rewrite the data structures that implement perf task
   context handling, with the goal of fixing various quirks and
   unfeatures both in already-merged and in upcoming proposed code.

   The old data structure is the per task and per cpu
   perf_event_contexts:

         task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
              ^                                 |    ^     |           ^
              `---------------------------------'    |     `--> pmu ---'
                                                     v           ^
                                                perf_event ------'

   In this new design this is replaced with a single task context and a
   single CPU context, plus intermediate data-structures:

         task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
              ^                           |   ^ ^
              `---------------------------'   | |
                                              | |    perf_cpu_pmu_context <--.
                                              | `----.    ^                  |
                                              |      |    |                  |
                                              |      v    v                  |
                                              | ,--> perf_event_pmu_context  |
                                              | |                            |
                                              | |                            |
                                              v v                            |
                                         perf_event ---> pmu ----------------'

    [ See commit bd2756811766 for more details; a rough C outline of the new layout follows this list. ]

   This rewrite was developed by Peter Zijlstra and Ravi Bangoria.

 - Optimize perf_tp_event()

 - Update the Intel uncore PMU driver, extending it with UPI topology
   discovery on various hardware models.

 - Misc fixes & cleanups
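
 A rough C outline of the new context layout described in the first item
 above (illustrative only: the member names shown are the ones visible in
 the diff further down, everything else is elided; see commit bd2756811766
 and include/linux/perf_event.h for the authoritative definitions):

       /* One instance per {perf_event_context, pmu} pair. */
       struct perf_event_pmu_context {
               struct pmu                      *pmu;
               struct perf_event_context       *ctx;
               struct list_head                pmu_ctx_entry;  /* on ctx->pmu_ctx_list */
               struct list_head                pinned_active;
               struct list_head                flexible_active;
               unsigned int                    nr_events;
               int                             rotate_necessary;
               /* ... */
       };

       /* One instance per {cpu, pmu} pair; embeds the CPU context's epc. */
       struct perf_cpu_pmu_context {
               struct perf_event_pmu_context   epc;
               struct perf_event_pmu_context   *task_epc;      /* current task's epc for this pmu */
               struct hrtimer                  hrtimer;        /* event multiplexing / rotation timer */
               /* ... */
       };

       /* One per task (task_struct::perf_event_ctxp) and one per CPU. */
       struct perf_event_context {
               struct list_head                pmu_ctx_list;   /* list of perf_event_pmu_context */
               /* ... */
       };

       /* The single per-CPU context (per-CPU variable perf_cpu_context). */
       struct perf_cpu_context {
               struct perf_event_context       ctx;
               struct perf_event_context       *task_ctx;      /* currently scheduled task context */
               /* ... */
       };

       struct perf_event {
               struct perf_event_context       *ctx;
               struct perf_event_pmu_context   *pmu_ctx;       /* the event's {ctx, pmu} intermediate */
               struct pmu                      *pmu;
               /* ... */
       };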

* tag 'perf-core-2022-12-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (25 commits)
  perf/x86/intel/uncore: Fix reference count leak in __uncore_imc_init_box()
  perf/x86/intel/uncore: Fix reference count leak in snr_uncore_mmio_map()
  perf/x86/intel/uncore: Fix reference count leak in hswep_has_limit_sbox()
  perf/x86/intel/uncore: Fix reference count leak in sad_cfg_iio_topology()
  perf/x86/intel/uncore: Make set_mapping() procedure void
  perf/x86/intel/uncore: Update sysfs-devices-mapping file
  perf/x86/intel/uncore: Enable UPI topology discovery for Sapphire Rapids
  perf/x86/intel/uncore: Enable UPI topology discovery for Icelake Server
  perf/x86/intel/uncore: Get UPI NodeID and GroupID
  perf/x86/intel/uncore: Enable UPI topology discovery for Skylake Server
  perf/x86/intel/uncore: Generalize get_topology() for SKX PMUs
  perf/x86/intel/uncore: Disable I/O stacks to PMU mapping on ICX-D
  perf/x86/intel/uncore: Clear attr_update properly
  perf/x86/intel/uncore: Introduce UPI topology type
  perf/x86/intel/uncore: Generalize IIO topology support
  perf/core: Don't allow grouping events from different hw pmus
  perf/amd/ibs: Make IBS a core pmu
  perf: Fix function pointer case
  perf/x86/amd: Remove the repeated declaration
  perf: Fix possible memleak in pmu_dev_alloc()
  ...

arch/arm64/kernel/perf_event.c
arch/s390/kernel/perf_pai_crypto.c
arch/s390/kernel/perf_pai_ext.c
arch/x86/events/amd/ibs.c
arch/x86/events/intel/core.c
arch/x86/events/intel/ds.c
drivers/perf/arm_pmu.c
include/linux/perf/arm_pmu.h
kernel/events/core.c

diff --combined arch/arm64/kernel/perf_event.c
index a15b3c1d15d91e8b310a9b71355505e97d16927b,54186542a969e49d234c85168bb3cf35da587394..a5193f2146a689b2621c5ea548039de353c656c3
@@@ -806,10 -806,14 +806,14 @@@ static void armv8pmu_disable_event(stru
  
  static void armv8pmu_start(struct arm_pmu *cpu_pmu)
  {
-       struct perf_event_context *task_ctx =
-               this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
+       struct perf_event_context *ctx;
+       int nr_user = 0;
  
-       if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
+       ctx = perf_cpu_task_ctx();
+       if (ctx)
+               nr_user = ctx->nr_user;
+       if (sysctl_perf_user_access && nr_user)
                armv8pmu_enable_user_access(cpu_pmu);
        else
                armv8pmu_disable_user_access();
@@@ -1019,10 -1023,10 +1023,10 @@@ static int armv8pmu_set_event_filter(st
        return 0;
  }
  
- static int armv8pmu_filter_match(struct perf_event *event)
+ static bool armv8pmu_filter(struct pmu *pmu, int cpu)
  {
-       unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
-       return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
+       struct arm_pmu *armpmu = to_arm_pmu(pmu);
+       return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
  }
  
  static void armv8pmu_reset(void *info)
@@@ -1146,8 -1150,7 +1150,8 @@@ static void __armv8pmu_probe_pmu(void *
        dfr0 = read_sysreg(id_aa64dfr0_el1);
        pmuver = cpuid_feature_extract_unsigned_field(dfr0,
                        ID_AA64DFR0_EL1_PMUVer_SHIFT);
 -      if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF || pmuver == 0)
 +      if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF ||
 +          pmuver == ID_AA64DFR0_EL1_PMUVer_NI)
                return;
  
        cpu_pmu->pmuver = pmuver;
@@@ -1254,7 -1257,7 +1258,7 @@@ static int armv8_pmu_init(struct arm_pm
        cpu_pmu->stop                   = armv8pmu_stop;
        cpu_pmu->reset                  = armv8pmu_reset;
        cpu_pmu->set_event_filter       = armv8pmu_set_event_filter;
-       cpu_pmu->filter_match           = armv8pmu_filter_match;
+       cpu_pmu->filter                 = armv8pmu_filter;
  
        cpu_pmu->pmu.event_idx          = armv8pmu_user_event_idx;
  
diff --combined arch/s390/kernel/perf_pai_crypto.c
index 529a2fee4ea511c719ad43e565aa40b854cc0049,f747137f39ae84632eab41792eb3d6e9a55dfa82..985e243a2ed83b80f3f3a24ad6910ce46c83695a
@@@ -35,9 -35,9 +35,9 @@@ struct pai_userdata 
  struct paicrypt_map {
        unsigned long *page;            /* Page for CPU to store counters */
        struct pai_userdata *save;      /* Page to store no-zero counters */
 -      unsigned int users;             /* # of PAI crypto users */
 -      unsigned int sampler;           /* # of PAI crypto samplers */
 -      unsigned int counter;           /* # of PAI crypto counters */
 +      unsigned int active_events;     /* # of PAI crypto users */
 +      unsigned int refcnt;            /* Reference count mapped buffers */
 +      enum paievt_mode mode;          /* Type of event */
        struct perf_event *event;       /* Perf event for sampling */
  };
  
@@@ -56,11 -56,15 +56,11 @@@ static void paicrypt_event_destroy(stru
        cpump->event = NULL;
        static_branch_dec(&pai_key);
        mutex_lock(&pai_reserve_mutex);
 -      if (event->attr.sample_period)
 -              cpump->sampler -= 1;
 -      else
 -              cpump->counter -= 1;
 -      debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d"
 -                          " sampler %d counter %d\n", __func__,
 -                          event->attr.config, event->cpu, cpump->sampler,
 -                          cpump->counter);
 -      if (!cpump->counter && !cpump->sampler) {
 +      debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d"
 +                          " mode %d refcnt %d\n", __func__,
 +                          event->attr.config, event->cpu,
 +                          cpump->active_events, cpump->mode, cpump->refcnt);
 +      if (!--cpump->refcnt) {
                debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
                                    __func__, (unsigned long)cpump->page,
                                    cpump->save);
@@@ -68,7 -72,6 +68,7 @@@
                cpump->page = NULL;
                kvfree(cpump->save);
                cpump->save = NULL;
 +              cpump->mode = PAI_MODE_NONE;
        }
        mutex_unlock(&pai_reserve_mutex);
  }
@@@ -133,14 -136,17 +133,14 @@@ static u64 paicrypt_getall(struct perf_
   */
  static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump)
  {
 -      unsigned int *use_ptr;
        int rc = 0;
  
        mutex_lock(&pai_reserve_mutex);
        if (a->sample_period) {         /* Sampling requested */
 -              use_ptr = &cpump->sampler;
 -              if (cpump->counter || cpump->sampler)
 +              if (cpump->mode != PAI_MODE_NONE)
                        rc = -EBUSY;    /* ... sampling/counting active */
        } else {                        /* Counting requested */
 -              use_ptr = &cpump->counter;
 -              if (cpump->sampler)
 +              if (cpump->mode == PAI_MODE_SAMPLING)
                        rc = -EBUSY;    /* ... and sampling active */
        }
        if (rc)
        rc = 0;
  
  unlock:
 -      /* If rc is non-zero, do not increment counter/sampler. */
 -      if (!rc)
 -              *use_ptr += 1;
 -      debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx sampler %d"
 -                          " counter %d page %#lx save %p rc %d\n", __func__,
 -                          a->sample_period, cpump->sampler, cpump->counter,
 +      /* If rc is non-zero, do not set mode and reference count */
 +      if (!rc) {
 +              cpump->refcnt++;
 +              cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
 +                                             : PAI_MODE_COUNTING;
 +      }
 +      debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d"
 +                          " mode %d refcnt %d page %#lx save %p rc %d\n",
 +                          __func__, a->sample_period, cpump->active_events,
 +                          cpump->mode, cpump->refcnt,
                            (unsigned long)cpump->page, cpump->save, rc);
        mutex_unlock(&pai_reserve_mutex);
        return rc;
@@@ -260,7 -262,7 +260,7 @@@ static int paicrypt_add(struct perf_eve
        struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map);
        unsigned long ccd;
  
 -      if (cpump->users++ == 0) {
 +      if (++cpump->active_events == 1) {
                ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET;
                WRITE_ONCE(S390_lowcore.ccd, ccd);
                __ctl_set_bit(0, 50);
@@@ -291,7 -293,7 +291,7 @@@ static void paicrypt_del(struct perf_ev
        if (!event->attr.sample_period)
                /* Only counting needs to read counter */
                paicrypt_stop(event, PERF_EF_UPDATE);
 -      if (cpump->users-- == 1) {
 +      if (--cpump->active_events == 0) {
                __ctl_clear_bit(0, 50);
                WRITE_ONCE(S390_lowcore.ccd, 0);
        }
@@@ -377,7 -379,7 +377,7 @@@ static int paicrypt_push_sample(void
  /* Called on schedule-in and schedule-out. No access to event structure,
   * but for sampling only event CRYPTO_ALL is allowed.
   */
- static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
+ static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        /* We started with a clean page on event installation. So read out
         * results on schedule_out and if page was dirty, clear values.
diff --combined arch/s390/kernel/perf_pai_ext.c
index a46cd7406b20ca611d977d6073c040b70a9cbe44,9547798594f9fa7744f68926c9698c9ae33fa987..1138f57baae3f0b7e3b14577a9229b1ef6ca3e8d
  static debug_info_t *paiext_dbg;
  static unsigned int paiext_cnt;       /* Extracted with QPACI instruction */
  
 -enum paiext_mode {
 -      PAI_MODE_NONE,
 -      PAI_MODE_SAMPLING,
 -      PAI_MODE_COUNTER,
 -};
 -
  struct pai_userdata {
        u16 num;
        u64 value;
@@@ -48,7 -54,7 +48,7 @@@ struct paiext_cb {            /* PAI extension 1 
  struct paiext_map {
        unsigned long *area;            /* Area for CPU to store counters */
        struct pai_userdata *save;      /* Area to store non-zero counters */
 -      enum paiext_mode mode;          /* Type of event */
 +      enum paievt_mode mode;          /* Type of event */
        unsigned int active_events;     /* # of PAI Extension users */
        unsigned int refcnt;
        struct perf_event *event;       /* Perf event for sampling */
@@@ -186,14 -192,14 +186,14 @@@ static int paiext_alloc(struct perf_eve
                        goto unlock;
                }
                cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
 -                                             : PAI_MODE_COUNTER;
 +                                             : PAI_MODE_COUNTING;
        } else {
                /* Multiple invocation, check whats active.
                 * Supported are multiple counter events or only one sampling
                 * event concurrently at any one time.
                 */
                if (cpump->mode == PAI_MODE_SAMPLING ||
 -                  (cpump->mode == PAI_MODE_COUNTER && a->sample_period)) {
 +                  (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
                        rc = -EBUSY;
                        goto unlock;
                }
@@@ -453,7 -459,6 +453,7 @@@ static int paiext_push_sample(void
                raw.frag.data = cpump->save;
                raw.size = raw.frag.size;
                data.raw = &raw;
 +              data.sample_flags |= PERF_SAMPLE_RAW;
        }
  
        overflow = perf_event_overflow(event, &data, &regs);
  /* Called on schedule-in and schedule-out. No access to event structure,
   * but for sampling only event NNPA_ALL is allowed.
   */
- static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
+ static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        /* We started with a clean page on event installation. So read out
         * results on schedule_out and if page was dirty, clear values.
diff --combined arch/x86/events/amd/ibs.c
index 4cb710efbdd9ad7e43ed1448cb471b8cd9fde975,fbc2ce86f4b81525c2e8ed71683c2705e8ac80e7..da3f5ebac4e1ca9d1dd24a4bc77b9a37113cbeef
@@@ -631,7 -631,7 +631,7 @@@ static const struct attribute_group *op
  
  static struct perf_ibs perf_ibs_fetch = {
        .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
+               .task_ctx_nr    = perf_hw_context,
  
                .event_init     = perf_ibs_init,
                .add            = perf_ibs_add,
  
  static struct perf_ibs perf_ibs_op = {
        .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
+               .task_ctx_nr    = perf_hw_context,
  
                .event_init     = perf_ibs_init,
                .add            = perf_ibs_add,
@@@ -801,7 -801,7 +801,7 @@@ static void perf_ibs_get_mem_lvl(union 
        /* Extension Memory */
        if (ibs_caps & IBS_CAPS_ZEN4 &&
            ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
 -              data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM;
 +              data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
                if (op_data2->rmt_node) {
                        data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
                        /* IBS doesn't provide Remote socket detail */
diff --combined arch/x86/events/intel/core.c
index 1b92bf05fd652a7b92c158ed1b9641809bec234d,d8af75466ee90c16f57103fb467b4e90a2856521..dfd2c124cdf80a819570340caa2b886887d57890
@@@ -4536,8 -4536,6 +4536,6 @@@ end
        cpumask_set_cpu(cpu, &pmu->supported_cpus);
        cpuc->pmu = &pmu->pmu;
  
-       x86_pmu_update_cpu_context(&pmu->pmu, cpu);
        return true;
  }
  
@@@ -4671,17 -4669,17 +4669,17 @@@ static void intel_pmu_cpu_dead(int cpu
                cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
  }
  
- static void intel_pmu_sched_task(struct perf_event_context *ctx,
+ static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
                                 bool sched_in)
  {
-       intel_pmu_pebs_sched_task(ctx, sched_in);
-       intel_pmu_lbr_sched_task(ctx, sched_in);
+       intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
+       intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
  }
  
- static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
-                                   struct perf_event_context *next)
+ static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+                                   struct perf_event_pmu_context *next_epc)
  {
-       intel_pmu_lbr_swap_task_ctx(prev, next);
+       intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
  }
  
  static int intel_pmu_check_period(struct perf_event *event, u64 value)
@@@ -4705,12 -4703,11 +4703,11 @@@ static int intel_pmu_aux_output_match(s
        return is_intel_pt_event(event);
  }
  
- static int intel_pmu_filter_match(struct perf_event *event)
+ static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
  {
-       struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
-       unsigned int cpu = smp_processor_id();
+       struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
  
-       return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+       *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
  }
  
  PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@@ -4911,7 -4908,6 +4908,7 @@@ static const struct x86_cpu_desc isolat
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,             5, 0x00000000),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,             6, 0x00000000),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,             7, 0x00000000),
 +      INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,            11, 0x00000000),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L,             3, 0x0000007c),
        INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE,               3, 0x0000007c),
        INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE,              9, 0x0000004e),
@@@ -6413,7 -6409,7 +6410,7 @@@ __init int intel_pmu_init(void
                static_call_update(intel_pmu_set_topdown_event_period,
                                   &adl_set_topdown_event_period);
  
-               x86_pmu.filter_match = intel_pmu_filter_match;
+               x86_pmu.filter = intel_pmu_filter;
                x86_pmu.get_event_constraints = adl_get_event_constraints;
                x86_pmu.hw_config = adl_hw_config;
                x86_pmu.limit_period = spr_limit_period;
diff --combined arch/x86/events/intel/ds.c
index 446d2833efa768bcc7dd2b63cca574ba29e8cb44,f141cc7b88479c16d8628f351d836d801a4269a2..88e58b6ee73c016cd55a7eb887751778a6c5d086
@@@ -982,13 -982,8 +982,13 @@@ struct event_constraint intel_icl_pebs_
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),  /* SLOTS */
  
        INTEL_PLD_CONSTRAINT(0x1cd, 0xff),                      /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),    /* MEM_INST_RETIRED.LOAD */
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),    /* MEM_INST_RETIRED.STORE */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),   /* MEM_INST_RETIRED.LOCK_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),   /* MEM_INST_RETIRED.ALL_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),   /* MEM_INST_RETIRED.ALL_STORES */
  
        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
  
@@@ -1009,13 -1004,8 +1009,13 @@@ struct event_constraint intel_spr_pebs_
        INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
        INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
        INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
 -      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),   /* MEM_INST_RETIRED.STLB_MISS_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),   /* MEM_INST_RETIRED.LOCK_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),   /* MEM_INST_RETIRED.SPLIT_STORES */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),   /* MEM_INST_RETIRED.ALL_LOADS */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),   /* MEM_INST_RETIRED.ALL_STORES */
  
        INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
  
@@@ -1069,7 -1059,7 +1069,7 @@@ static inline bool pebs_needs_sched_cb(
        return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
  }
  
- void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  
@@@ -1177,7 -1167,7 +1177,7 @@@ static voi
  pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
                  struct perf_event *event, bool add)
  {
-       struct pmu *pmu = event->ctx->pmu;
+       struct pmu *pmu = event->pmu;
        /*
         * Make sure we get updated with the first PEBS
         * event. It will trigger also during removal, but
diff --combined drivers/perf/arm_pmu.c
index bb56676f50ef98acfc15381d2c463925b70862b9,5ece3f132d80b593821bc12f6355f358f7b1b8c0..9b593f985805eb782ee6e9ec612ff10b2003a8bf
@@@ -514,6 -514,9 +514,6 @@@ static int armpmu_event_init(struct per
        if (has_branch_stack(event))
                return -EOPNOTSUPP;
  
 -      if (armpmu->map_event(event) == -ENOENT)
 -              return -ENOENT;
 -
        return __hw_perf_event_init(event);
  }
  
@@@ -547,15 -550,14 +547,14 @@@ static void armpmu_disable(struct pmu *
   * microarchitecture, and aren't suitable for another. Thus, only match CPUs of
   * the same microarchitecture.
   */
- static int armpmu_filter_match(struct perf_event *event)
+ static bool armpmu_filter(struct pmu *pmu, int cpu)
  {
-       struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-       unsigned int cpu = smp_processor_id();
-       int ret;
+       struct arm_pmu *armpmu = to_arm_pmu(pmu);
+       bool ret;
  
        ret = cpumask_test_cpu(cpu, &armpmu->supported_cpus);
-       if (ret && armpmu->filter_match)
-               return armpmu->filter_match(event);
+       if (ret && armpmu->filter)
+               return armpmu->filter(pmu, cpu);
  
        return ret;
  }
@@@ -858,16 -860,16 +857,16 @@@ static void cpu_pmu_destroy(struct arm_
                                            &cpu_pmu->node);
  }
  
 -static struct arm_pmu *__armpmu_alloc(gfp_t flags)
 +struct arm_pmu *armpmu_alloc(void)
  {
        struct arm_pmu *pmu;
        int cpu;
  
 -      pmu = kzalloc(sizeof(*pmu), flags);
 +      pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
        if (!pmu)
                goto out;
  
 -      pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, flags);
 +      pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL);
        if (!pmu->hw_events) {
                pr_info("failed to allocate per-cpu PMU data.\n");
                goto out_free_pmu;
                .start          = armpmu_start,
                .stop           = armpmu_stop,
                .read           = armpmu_read,
-               .filter_match   = armpmu_filter_match,
+               .filter         = armpmu_filter,
                .attr_groups    = pmu->attr_groups,
                /*
                 * This is a CPU PMU potentially in a heterogeneous
                 * configuration (e.g. big.LITTLE). This is not an uncore PMU,
                 * and we have taken ctx sharing into account (e.g. with our
-                * pmu::filter_match callback and pmu::event_init group
-                * validation).
+                * pmu::filter callback and pmu::event_init group validation).
                 */
                .capabilities   = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
        };
@@@ -913,6 -914,17 +911,6 @@@ out
        return NULL;
  }
  
 -struct arm_pmu *armpmu_alloc(void)
 -{
 -      return __armpmu_alloc(GFP_KERNEL);
 -}
 -
 -struct arm_pmu *armpmu_alloc_atomic(void)
 -{
 -      return __armpmu_alloc(GFP_ATOMIC);
 -}
 -
 -
  void armpmu_free(struct arm_pmu *pmu)
  {
        free_percpu(pmu->hw_events);
diff --combined include/linux/perf/arm_pmu.h
index 0c15c5b7f801ec2684a23b292a078b1295a52f5f,725968095ea9dd0c1b9d9d91c460fd4e808e2a63..ef914a600087eaf2c5bb755f9bb354d58349469c
@@@ -100,7 -100,7 +100,7 @@@ struct arm_pmu 
        void            (*stop)(struct arm_pmu *);
        void            (*reset)(void *);
        int             (*map_event)(struct perf_event *event);
-       int             (*filter_match)(struct perf_event *event);
+       bool            (*filter)(struct pmu *pmu, int cpu);
        int             num_events;
        bool            secure_access; /* 32-bit ARM only */
  #define ARMV8_PMUV3_MAX_COMMON_EVENTS         0x40
@@@ -174,6 -174,7 +174,6 @@@ void kvm_host_pmu_init(struct arm_pmu *
  
  /* Internal functions only for core arm_pmu code */
  struct arm_pmu *armpmu_alloc(void);
 -struct arm_pmu *armpmu_alloc_atomic(void);
  void armpmu_free(struct arm_pmu *pmu);
  int armpmu_register(struct arm_pmu *pmu);
  int armpmu_request_irq(int irq, int cpu);
diff --combined kernel/events/core.c
index 7f04f995c9754891042abf3d0f695d0ba0cc890f,65e20c5c3c44e38ac18645e29fe32e1b13275633..e47914ac8732325572830cf3dee5886bca21022f
@@@ -155,12 -155,6 +155,6 @@@ static int cpu_function_call(int cpu, r
        return data.ret;
  }
  
- static inline struct perf_cpu_context *
- __get_cpu_context(struct perf_event_context *ctx)
- {
-       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
- }
  static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
  {
@@@ -184,6 -178,14 +178,14 @@@ static bool is_kernel_event(struct perf
        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
  }
  
+ static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+ struct perf_event_context *perf_cpu_task_ctx(void)
+ {
+       lockdep_assert_irqs_disabled();
+       return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+ }
  /*
   * On task ctx scheduling...
   *
@@@ -217,7 -219,7 +219,7 @@@ static int event_function(void *info
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;
  
@@@ -314,7 -316,7 +316,7 @@@ again
  static void event_function_local(struct perf_event *event, event_f func, void *data)
  {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;
  
@@@ -388,7 -390,6 +390,6 @@@ static DEFINE_MUTEX(perf_sched_mutex)
  static atomic_t perf_sched_count;
  
  static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
- static DEFINE_PER_CPU(int, perf_sched_cb_usages);
  static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
  
  static atomic_t nr_mmap_events __read_mostly;
@@@ -448,7 -449,7 +449,7 @@@ static void update_perf_cpu_limits(void
        WRITE_ONCE(perf_sample_allowed_ns, tmp);
  }
  
- static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
  
  int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
@@@ -571,12 -572,6 +572,6 @@@ void perf_sample_event_took(u64 sample_
  
  static atomic64_t perf_event_id;
  
- static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type);
- static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-                            enum event_type_t event_type);
  static void update_context_time(struct perf_event_context *ctx);
  static u64 perf_event_time(struct perf_event *event);
  
@@@ -691,13 -686,31 +686,31 @@@ do {                                                                    
        ___p;                                                           \
  })
  
+ static void perf_ctx_disable(struct perf_event_context *ctx)
+ {
+       struct perf_event_pmu_context *pmu_ctx;
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               perf_pmu_disable(pmu_ctx->pmu);
+ }
+ static void perf_ctx_enable(struct perf_event_context *ctx)
+ {
+       struct perf_event_pmu_context *pmu_ctx;
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               perf_pmu_enable(pmu_ctx->pmu);
+ }
+ static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+ static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
  #ifdef CONFIG_CGROUP_PERF
  
  static inline bool
  perf_cgroup_match(struct perf_event *event)
  {
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
  
        /* @event doesn't care about cgroup */
        if (!event->cgrp)
@@@ -823,54 -836,39 +836,39 @@@ perf_cgroup_set_timestamp(struct perf_c
        }
  }
  
- static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
  /*
   * reschedule events based on the cgroup constraint of task.
   */
  static void perf_cgroup_switch(struct task_struct *task)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cgroup *cgrp;
-       struct perf_cpu_context *cpuctx, *tmp;
-       struct list_head *list;
-       unsigned long flags;
-       /*
-        * Disable interrupts and preemption to avoid this CPU's
-        * cgrp_cpuctx_entry to change under us.
-        */
-       local_irq_save(flags);
  
        cgrp = perf_cgroup_from_task(task, NULL);
  
-       list = this_cpu_ptr(&cgrp_cpuctx_list);
-       list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
-               WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-               if (READ_ONCE(cpuctx->cgrp) == cgrp)
-                       continue;
-               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-               perf_pmu_disable(cpuctx->ctx.pmu);
+       WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+       if (READ_ONCE(cpuctx->cgrp) == cgrp)
+               return;
  
-               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-               /*
-                * must not be done before ctxswout due
-                * to update_cgrp_time_from_cpuctx() in
-                * ctx_sched_out()
-                */
-               cpuctx->cgrp = cgrp;
-               /*
-                * set cgrp before ctxsw in to allow
-                * perf_cgroup_set_timestamp() in ctx_sched_in()
-                * to not have to pass task around
-                */
-               cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+       perf_ctx_disable(&cpuctx->ctx);
  
-               perf_pmu_enable(cpuctx->ctx.pmu);
-               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-       }
+       ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+       /*
+        * must not be done before ctxswout due
+        * to update_cgrp_time_from_cpuctx() in
+        * ctx_sched_out()
+        */
+       cpuctx->cgrp = cgrp;
+       /*
+        * set cgrp before ctxsw in to allow
+        * perf_cgroup_set_timestamp() in ctx_sched_in()
+        * to not have to pass task around
+        */
+       ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
  
-       local_irq_restore(flags);
+       perf_ctx_enable(&cpuctx->ctx);
+       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  }
  
  static int perf_cgroup_ensure_storage(struct perf_event *event,
                heap_size++;
  
        for_each_possible_cpu(cpu) {
-               cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+               cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                if (heap_size <= cpuctx->heap_size)
                        continue;
  
@@@ -972,8 -970,6 +970,6 @@@ perf_cgroup_event_enable(struct perf_ev
                return;
  
        cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
-       list_add(&cpuctx->cgrp_cpuctx_entry,
-                       per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
  }
  
  static inline void
@@@ -994,7 -990,6 +990,6 @@@ perf_cgroup_event_disable(struct perf_e
                return;
  
        cpuctx->cgrp = NULL;
-       list_del(&cpuctx->cgrp_cpuctx_entry);
  }
  
  #else /* !CONFIG_CGROUP_PERF */
@@@ -1069,34 -1064,30 +1064,30 @@@ static void perf_cgroup_switch(struct t
   */
  static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_pmu_context *cpc;
        bool rotations;
  
        lockdep_assert_irqs_disabled();
  
-       cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-       rotations = perf_rotate_context(cpuctx);
+       cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+       rotations = perf_rotate_context(cpc);
  
-       raw_spin_lock(&cpuctx->hrtimer_lock);
+       raw_spin_lock(&cpc->hrtimer_lock);
        if (rotations)
-               hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+               hrtimer_forward_now(hr, cpc->hrtimer_interval);
        else
-               cpuctx->hrtimer_active = 0;
-       raw_spin_unlock(&cpuctx->hrtimer_lock);
+               cpc->hrtimer_active = 0;
+       raw_spin_unlock(&cpc->hrtimer_lock);
  
        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
  }
  
- static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
  {
-       struct hrtimer *timer = &cpuctx->hrtimer;
-       struct pmu *pmu = cpuctx->ctx.pmu;
+       struct hrtimer *timer = &cpc->hrtimer;
+       struct pmu *pmu = cpc->epc.pmu;
        u64 interval;
  
-       /* no multiplexing needed for SW PMU */
-       if (pmu->task_ctx_nr == perf_sw_context)
-               return;
        /*
         * check default is sane, if not set then force to
         * default interval (1/tick)
        if (interval < 1)
                interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
  
-       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+       cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
-       raw_spin_lock_init(&cpuctx->hrtimer_lock);
+       raw_spin_lock_init(&cpc->hrtimer_lock);
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
  }
  
- static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+ static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
  {
-       struct hrtimer *timer = &cpuctx->hrtimer;
-       struct pmu *pmu = cpuctx->ctx.pmu;
+       struct hrtimer *timer = &cpc->hrtimer;
        unsigned long flags;
  
-       /* not for SW PMU */
-       if (pmu->task_ctx_nr == perf_sw_context)
-               return 0;
-       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
-       if (!cpuctx->hrtimer_active) {
-               cpuctx->hrtimer_active = 1;
-               hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+       raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+       if (!cpc->hrtimer_active) {
+               cpc->hrtimer_active = 1;
+               hrtimer_forward_now(timer, cpc->hrtimer_interval);
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
        }
-       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+       raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
  
        return 0;
  }
  
+ static int perf_mux_hrtimer_restart_ipi(void *arg)
+ {
+       return perf_mux_hrtimer_restart(arg);
+ }
  void perf_pmu_disable(struct pmu *pmu)
  {
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@@ -1147,32 -1138,9 +1138,9 @@@ void perf_pmu_enable(struct pmu *pmu
                pmu->pmu_enable(pmu);
  }
  
- static DEFINE_PER_CPU(struct list_head, active_ctx_list);
- /*
-  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
-  * perf_event_task_tick() are fully serialized because they're strictly cpu
-  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
-  * disabled, while perf_event_task_tick is called from IRQ context.
-  */
- static void perf_event_ctx_activate(struct perf_event_context *ctx)
- {
-       struct list_head *head = this_cpu_ptr(&active_ctx_list);
-       lockdep_assert_irqs_disabled();
-       WARN_ON(!list_empty(&ctx->active_ctx_list));
-       list_add(&ctx->active_ctx_list, head);
- }
- static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+ static void perf_assert_pmu_disabled(struct pmu *pmu)
  {
-       lockdep_assert_irqs_disabled();
-       WARN_ON(list_empty(&ctx->active_ctx_list));
-       list_del_init(&ctx->active_ctx_list);
+       WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
  }
  
  static void get_ctx(struct perf_event_context *ctx)
@@@ -1199,7 -1167,6 +1167,6 @@@ static void free_ctx(struct rcu_head *h
        struct perf_event_context *ctx;
  
        ctx = container_of(head, struct perf_event_context, rcu_head);
-       free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
        kfree(ctx);
  }
  
@@@ -1384,7 -1351,7 +1351,7 @@@ static u64 primary_event_id(struct perf
   * the context could get moved to another task.
   */
  static struct perf_event_context *
- perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
  {
        struct perf_event_context *ctx;
  
@@@ -1400,7 -1367,7 +1367,7 @@@ retry
         */
        local_irq_save(*flags);
        rcu_read_lock();
-       ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+       ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
-               if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+               if (ctx != rcu_dereference(task->perf_event_ctxp)) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
   * reference count so that the context can't get freed.
   */
  static struct perf_event_context *
- perf_pin_task_context(struct task_struct *task, int ctxn)
+ perf_pin_task_context(struct task_struct *task)
  {
        struct perf_event_context *ctx;
        unsigned long flags;
  
-       ctx = perf_lock_task_context(task, ctxn, &flags);
+       ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@@ -1593,14 -1560,22 +1560,22 @@@ static inline struct cgroup *event_cgro
   * which provides ordering when rotating groups for the same CPU.
   */
  static __always_inline int
- perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
-                     const u64 left_group_index, const struct perf_event *right)
+ perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+                     const struct cgroup *left_cgroup, const u64 left_group_index,
+                     const struct perf_event *right)
  {
        if (left_cpu < right->cpu)
                return -1;
        if (left_cpu > right->cpu)
                return 1;
  
+       if (left_pmu) {
+               if (left_pmu < right->pmu_ctx->pmu)
+                       return -1;
+               if (left_pmu > right->pmu_ctx->pmu)
+                       return 1;
+       }
  #ifdef CONFIG_CGROUP_PERF
        {
                const struct cgroup *right_cgroup = event_cgroup(right);
  static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
  {
        struct perf_event *e = __node_2_pe(a);
-       return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
-                                    __node_2_pe(b)) < 0;
+       return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+                                    e->group_index, __node_2_pe(b)) < 0;
  }
  
  struct __group_key {
        int cpu;
+       struct pmu *pmu;
        struct cgroup *cgroup;
  };
  
@@@ -1657,14 -1633,25 +1633,25 @@@ static inline int __group_cmp(const voi
        const struct __group_key *a = key;
        const struct perf_event *b = __node_2_pe(node);
  
-       /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
-       return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+       /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+       return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+ }
+ static inline int
+ __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+ {
+       const struct __group_key *a = key;
+       const struct perf_event *b = __node_2_pe(node);
+       /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+       return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+                                    b->group_index, b);
  }
  
  /*
-  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
-  * key (see perf_event_groups_less). This places it last inside the CPU
-  * subtree.
+  * Insert @event into @groups' tree; using
+  *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+  * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
   */
  static void
  perf_event_groups_insert(struct perf_event_groups *groups,
@@@ -1714,14 -1701,15 +1701,15 @@@ del_event_from_groups(struct perf_even
  }
  
  /*
-  * Get the leftmost event in the cpu/cgroup subtree.
+  * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
   */
  static struct perf_event *
  perf_event_groups_first(struct perf_event_groups *groups, int cpu,
-                       struct cgroup *cgrp)
+                       struct pmu *pmu, struct cgroup *cgrp)
  {
        struct __group_key key = {
                .cpu = cpu,
+               .pmu = pmu,
                .cgroup = cgrp,
        };
        struct rb_node *node;
        return NULL;
  }
  
- /*
-  * Like rb_entry_next_safe() for the @cpu subtree.
-  */
  static struct perf_event *
- perf_event_groups_next(struct perf_event *event)
+ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
  {
        struct __group_key key = {
                .cpu = event->cpu,
+               .pmu = pmu,
                .cgroup = event_cgroup(event),
        };
        struct rb_node *next;
        return NULL;
  }
  
+ #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)                \
+       for (event = perf_event_groups_first(groups, cpu, pmu, NULL);   \
+            event; event = perf_event_groups_next(event, pmu))
  /*
   * Iterate through the whole groups tree.
   */
@@@ -1796,6 -1786,7 +1786,7 @@@ list_add_event(struct perf_event *event
                perf_cgroup_event_enable(event, ctx);
  
        ctx->generation++;
+       event->pmu_ctx->nr_events++;
  }
  
  /*
@@@ -1941,7 -1932,8 +1932,8 @@@ static void perf_group_attach(struct pe
        lockdep_assert_held(&event->ctx->lock);
  
        /*
-        * We can have double attach due to group movement in perf_event_open.
+        * We can have double attach due to group movement (move_group) in
+        * perf_event_open().
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;
@@@ -2006,6 -1998,7 +1998,7 @@@ list_del_event(struct perf_event *event
        }
  
        ctx->generation++;
+       event->pmu_ctx->nr_events--;
  }
  
  static int
@@@ -2022,13 -2015,11 +2015,11 @@@ perf_aux_output_match(struct perf_even
  
  static void put_event(struct perf_event *event);
  static void event_sched_out(struct perf_event *event,
-                           struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx);
  
  static void perf_put_aux_event(struct perf_event *event)
  {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *iter;
  
        /*
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 */
-               event_sched_out(iter, cpuctx, ctx);
+               event_sched_out(iter, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
        }
  }
@@@ -2108,8 -2099,8 +2099,8 @@@ static int perf_get_aux_event(struct pe
  
  static inline struct list_head *get_event_list(struct perf_event *event)
  {
-       struct perf_event_context *ctx = event->ctx;
-       return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+       return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+                                   &event->pmu_ctx->flexible_active;
  }
  
  /*
   */
  static inline void perf_remove_sibling_event(struct perf_event *event)
  {
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-       event_sched_out(event, cpuctx, ctx);
+       event_sched_out(event, event->ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
  }
  
@@@ -2212,53 -2200,22 +2200,22 @@@ static bool is_orphaned_event(struct pe
        return event->state == PERF_EVENT_STATE_DEAD;
  }
  
- static inline int __pmu_filter_match(struct perf_event *event)
- {
-       struct pmu *pmu = event->pmu;
-       return pmu->filter_match ? pmu->filter_match(event) : 1;
- }
- /*
-  * Check whether we should attempt to schedule an event group based on
-  * PMU-specific filtering. An event group can consist of HW and SW events,
-  * potentially with a SW leader, so we must check all the filters, to
-  * determine whether a group is schedulable:
-  */
- static inline int pmu_filter_match(struct perf_event *event)
- {
-       struct perf_event *sibling;
-       unsigned long flags;
-       int ret = 1;
-       if (!__pmu_filter_match(event))
-               return 0;
-       local_irq_save(flags);
-       for_each_sibling_event(sibling, event) {
-               if (!__pmu_filter_match(sibling)) {
-                       ret = 0;
-                       break;
-               }
-       }
-       local_irq_restore(flags);
-       return ret;
- }
  static inline int
  event_filter_match(struct perf_event *event)
  {
        return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
-              perf_cgroup_match(event) && pmu_filter_match(event);
+              perf_cgroup_match(event);
  }
  
  static void
- event_sched_out(struct perf_event *event,
-                 struct perf_cpu_context *cpuctx,
-                 struct perf_event_context *ctx)
+ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
  {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
  
+       // XXX cpc serialization, probably per-cpu IRQ disabled
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
  
                    !event->pending_work) {
                        event->pending_work = 1;
                        dec = false;
 +                      WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
                        task_work_add(current, &event->pending_task, TWA_RESUME);
                }
                if (dec)
        perf_event_set_state(event, state);
  
        if (!is_software_event(event))
-               cpuctx->active_oncpu--;
-       if (!--ctx->nr_active)
-               perf_event_ctx_deactivate(ctx);
+               cpc->active_oncpu--;
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
-       if (event->attr.exclusive || !cpuctx->active_oncpu)
-               cpuctx->exclusive = 0;
+       if (event->attr.exclusive || !cpc->active_oncpu)
+               cpc->exclusive = 0;
  
        perf_pmu_enable(event->pmu);
  }
  
  static void
- group_sched_out(struct perf_event *group_event,
-               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx)
+ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
  {
        struct perf_event *event;
  
        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
-       perf_pmu_disable(ctx->pmu);
+       perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
  
-       event_sched_out(group_event, cpuctx, ctx);
+       event_sched_out(group_event, ctx);
  
        /*
         * Schedule out siblings (if any):
         */
        for_each_sibling_event(event, group_event)
-               event_sched_out(event, cpuctx, ctx);
-       perf_pmu_enable(ctx->pmu);
+               event_sched_out(event, ctx);
  }
  
  #define DETACH_GROUP  0x01UL
  #define DETACH_CHILD  0x02UL
 +#define DETACH_DEAD   0x04UL
  
  /*
   * Cross CPU call to remove a performance event
@@@ -2351,6 -2300,7 +2302,7 @@@ __perf_remove_from_context(struct perf_
                           struct perf_event_context *ctx,
                           void *info)
  {
+       struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
        unsigned long flags = (unsigned long)info;
  
        if (ctx->is_active & EVENT_TIME) {
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }
  
-       event_sched_out(event, cpuctx, ctx);
 +      /*
 +       * Ensure event_sched_out() switches to OFF, at the very least
 +       * this avoids raising perf_pending_task() at this time.
 +       */
 +      if (flags & DETACH_DEAD)
 +              event->pending_disable = 1;
+       event_sched_out(event, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
 +      if (flags & DETACH_DEAD)
 +              event->state = PERF_EVENT_STATE_DEAD;
  
+       if (!pmu_ctx->nr_events) {
+               pmu_ctx->rotate_necessary = 0;
+               if (ctx->task && ctx->is_active) {
+                       struct perf_cpu_pmu_context *cpc;
+                       cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+                       WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+                       cpc->task_epc = NULL;
+               }
+       }
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);
  
                ctx->is_active = 0;
-               ctx->rotate_necessary = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
@@@ -2408,12 -2361,8 +2371,8 @@@ static void perf_remove_from_context(st
         * event_function_call() user.
         */
        raw_spin_lock_irq(&ctx->lock);
-       /*
-        * Cgroup events are per-cpu events, and must IPI because of
-        * cgrp_cpuctx_list.
-        */
-       if (!ctx->is_active && !is_cgroup_event(event)) {
-               __perf_remove_from_context(event, __get_cpu_context(ctx),
+       if (!ctx->is_active) {
+               __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
                                           ctx, (void *)flags);
                raw_spin_unlock_irq(&ctx->lock);
                return;
@@@ -2439,13 -2388,17 +2398,17 @@@ static void __perf_event_disable(struc
                update_cgrp_time_from_event(event);
        }
  
+       perf_pmu_disable(event->pmu_ctx->pmu);
        if (event == event->group_leader)
-               group_sched_out(event, cpuctx, ctx);
+               group_sched_out(event, ctx);
        else
-               event_sched_out(event, cpuctx, ctx);
+               event_sched_out(event, ctx);
  
        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);
+       perf_pmu_enable(event->pmu_ctx->pmu);
  }
  
  /*
@@@ -2507,10 -2460,10 +2470,10 @@@ static void perf_log_throttle(struct pe
  static void perf_log_itrace_start(struct perf_event *event);
  
  static int
- event_sched_in(struct perf_event *event,
-                struct perf_cpu_context *cpuctx,
-                struct perf_event_context *ctx)
+ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
  {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        int ret = 0;
  
        WARN_ON_ONCE(event->ctx != ctx);
        }
  
        if (!is_software_event(event))
-               cpuctx->active_oncpu++;
-       if (!ctx->nr_active++)
-               perf_event_ctx_activate(ctx);
+               cpc->active_oncpu++;
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;
  
        if (event->attr.exclusive)
-               cpuctx->exclusive = 1;
+               cpc->exclusive = 1;
  
  out:
        perf_pmu_enable(event->pmu);
  }
  
  static int
- group_sched_in(struct perf_event *group_event,
-              struct perf_cpu_context *cpuctx,
-              struct perf_event_context *ctx)
+ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
  {
        struct perf_event *event, *partial_group = NULL;
-       struct pmu *pmu = ctx->pmu;
+       struct pmu *pmu = group_event->pmu_ctx->pmu;
  
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
  
        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
-       if (event_sched_in(group_event, cpuctx, ctx))
+       if (event_sched_in(group_event, ctx))
                goto error;
  
        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
-               if (event_sched_in(event, cpuctx, ctx)) {
+               if (event_sched_in(event, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
@@@ -2605,9 -2554,9 +2564,9 @@@ group_error
                if (event == partial_group)
                        break;
  
-               event_sched_out(event, cpuctx, ctx);
+               event_sched_out(event, ctx);
        }
-       event_sched_out(group_event, cpuctx, ctx);
+       event_sched_out(group_event, ctx);
  
  error:
        pmu->cancel_txn(pmu);
  /*
   * Work out whether we can put this event group on the CPU now.
   */
- static int group_can_go_on(struct perf_event *event,
-                          struct perf_cpu_context *cpuctx,
-                          int can_add_hw)
+ static int group_can_go_on(struct perf_event *event, int can_add_hw)
  {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        /*
         * Groups consisting entirely of software events can always go on.
         */
         * If an exclusive group is already on, no other hardware
         * events can go on.
         */
-       if (cpuctx->exclusive)
+       if (cpc->exclusive)
                return 0;
        /*
         * If this group is exclusive and there are already
@@@ -2652,36 -2602,29 +2612,29 @@@ static void add_event_to_ctx(struct per
        perf_group_attach(event);
  }
  
- static void ctx_sched_out(struct perf_event_context *ctx,
-                         struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type);
- static void
- ctx_sched_in(struct perf_event_context *ctx,
-            struct perf_cpu_context *cpuctx,
-            enum event_type_t event_type);
- static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                              struct perf_event_context *ctx,
-                              enum event_type_t event_type)
+ static void task_ctx_sched_out(struct perf_event_context *ctx,
+                               enum event_type_t event_type)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        if (!cpuctx->task_ctx)
                return;
  
        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;
  
-       ctx_sched_out(ctx, cpuctx, event_type);
+       ctx_sched_out(ctx, event_type);
  }
  
  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
  {
-       cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+       ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, EVENT_PINNED);
+       ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, EVENT_FLEXIBLE);
  }
  
  /*
   * event_type is a bit mask of the types of events involved. For CPU events,
   * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
   */
+ /*
+  * XXX: ctx_resched() reschedules the entire perf_event_context while adding
+  * a new event to the context or enabling an existing event in it. We can
+  * probably optimize this by rescheduling only the affected pmu_ctx.
+  */
  static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
  {
-       enum event_type_t ctx_event_type;
        bool cpu_event = !!(event_type & EVENT_CPU);
  
        /*
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;
  
-       ctx_event_type = event_type & EVENT_ALL;
+       event_type &= EVENT_ALL;
  
-       perf_pmu_disable(cpuctx->ctx.pmu);
-       if (task_ctx)
-               task_ctx_sched_out(cpuctx, task_ctx, event_type);
+       perf_ctx_disable(&cpuctx->ctx);
+       if (task_ctx) {
+               perf_ctx_disable(task_ctx);
+               task_ctx_sched_out(task_ctx, event_type);
+       }
  
        /*
         * Decide which cpu ctx groups to schedule out based on the types
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, ctx_event_type);
-       else if (ctx_event_type & EVENT_PINNED)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+               ctx_sched_out(&cpuctx->ctx, event_type);
+       else if (event_type & EVENT_PINNED)
+               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
  
        perf_event_sched_in(cpuctx, task_ctx);
-       perf_pmu_enable(cpuctx->ctx.pmu);
+       perf_ctx_enable(&cpuctx->ctx);
+       if (task_ctx)
+               perf_ctx_enable(task_ctx);
  }
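
A small standalone model of the event_type handling in ctx_resched() above; the flag values are invented for illustration (they are not the kernel's definitions). The point is only that touching pinned state forces the flexible groups to be rescheduled as well, while EVENT_CPU merely records that the CPU context is affected.

    #include <stdio.h>

    enum event_type_t {
            EVENT_FLEXIBLE  = 0x1,
            EVENT_PINNED    = 0x2,
            EVENT_CPU       = 0x4,          /* placeholder bit */
            EVENT_ALL       = EVENT_FLEXIBLE | EVENT_PINNED,
    };

    static enum event_type_t normalize(enum event_type_t event_type)
    {
            /* pinned groups preempt flexible ones, so both must be redone */
            if (event_type & EVENT_PINNED)
                    event_type |= EVENT_FLEXIBLE;
            return event_type & EVENT_ALL;
    }

    int main(void)
    {
            printf("FLEXIBLE     -> %#x\n", normalize(EVENT_FLEXIBLE));
            printf("PINNED       -> %#x\n", normalize(EVENT_PINNED));
            printf("PINNED|CPU   -> %#x\n", normalize(EVENT_PINNED | EVENT_CPU));
            return 0;
    }
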
  
  void perf_pmu_resched(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
  
        perf_ctx_lock(cpuctx, task_ctx);
@@@ -2755,7 -2707,7 +2717,7 @@@ static int  __perf_install_in_context(v
  {
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;
  #endif
  
        if (reprogram) {
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
@@@ -2830,7 -2782,7 +2792,7 @@@ perf_install_in_context(struct perf_eve
        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
  
        if (event->cpu != -1)
-               event->cpu = cpu;
+               WARN_ON_ONCE(event->cpu != cpu);
  
        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
-        * Similarly, cgroup events for the context also needs the IPI to
-        * manipulate the cgrp_cpuctx_list.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
@@@ -2945,7 -2895,7 +2905,7 @@@ static void __perf_event_enable(struct 
                return;
  
        if (ctx->is_active)
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, EVENT_TIME);
  
        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);
                return;
  
        if (!event_filter_match(event)) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
                return;
        }
  
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
                return;
        }
  
        return err;
  }
  
- static void ctx_sched_out(struct perf_event_context *ctx,
-                         struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type)
+ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+                               enum event_type_t event_type)
  {
+       struct perf_event_context *ctx = pmu_ctx->ctx;
        struct perf_event *event, *tmp;
+       struct pmu *pmu = pmu_ctx->pmu;
+       if (ctx->task && !ctx->is_active) {
+               struct perf_cpu_pmu_context *cpc;
+               cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+               WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+               cpc->task_epc = NULL;
+       }
+       if (!event_type)
+               return;
+       perf_pmu_disable(pmu);
+       if (event_type & EVENT_PINNED) {
+               list_for_each_entry_safe(event, tmp,
+                                        &pmu_ctx->pinned_active,
+                                        active_list)
+                       group_sched_out(event, ctx);
+       }
+       if (event_type & EVENT_FLEXIBLE) {
+               list_for_each_entry_safe(event, tmp,
+                                        &pmu_ctx->flexible_active,
+                                        active_list)
+                       group_sched_out(event, ctx);
+               /*
+                * Since we cleared EVENT_FLEXIBLE, also clear
+                * rotate_necessary; it will be reset by
+                * ctx_flexible_sched_in() when needed.
+                */
+               pmu_ctx->rotate_necessary = 0;
+       }
+       perf_pmu_enable(pmu);
+ }
+ static void
+ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_pmu_context *pmu_ctx;
        int is_active = ctx->is_active;
  
        lockdep_assert_held(&ctx->lock);
  
        is_active ^= ctx->is_active; /* changed bits */
  
-       if (!ctx->nr_active || !(is_active & EVENT_ALL))
-               return;
-       perf_pmu_disable(ctx->pmu);
-       if (is_active & EVENT_PINNED) {
-               list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
-                       group_sched_out(event, cpuctx, ctx);
-       }
-       if (is_active & EVENT_FLEXIBLE) {
-               list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
-                       group_sched_out(event, cpuctx, ctx);
-               /*
-                * Since we cleared EVENT_FLEXIBLE, also clear
-                * rotate_necessary, is will be reset by
-                * ctx_flexible_sched_in() when needed.
-                */
-               ctx->rotate_necessary = 0;
-       }
-       perf_pmu_enable(ctx->pmu);
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               __pmu_ctx_sched_out(pmu_ctx, is_active);
  }
  
  /*
@@@ -3409,26 -3381,68 +3391,68 @@@ static void perf_event_sync_stat(struc
        }
  }
  
- static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
-                                        struct task_struct *next)
+ #define double_list_for_each_entry(pos1, pos2, head1, head2, member)  \
+       for (pos1 = list_first_entry(head1, typeof(*pos1), member),     \
+            pos2 = list_first_entry(head2, typeof(*pos2), member);     \
+            !list_entry_is_head(pos1, head1, member) &&                \
+            !list_entry_is_head(pos2, head2, member);                  \
+            pos1 = list_next_entry(pos1, member),                      \
+            pos2 = list_next_entry(pos2, member))
+ static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+                                         struct perf_event_context *next_ctx)
+ {
+       struct perf_event_pmu_context *prev_epc, *next_epc;
+       if (!prev_ctx->nr_task_data)
+               return;
+       double_list_for_each_entry(prev_epc, next_epc,
+                                  &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+                                  pmu_ctx_entry) {
+               if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+                       continue;
+               /*
+                * PMU specific parts of task perf context can require
+                * additional synchronization. As an example of such
+                * synchronization see implementation details of Intel
+                * LBR call stack data profiling;
+                */
+               if (prev_epc->pmu->swap_task_ctx)
+                       prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+               else
+                       swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+       }
+ }
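
The double_list_for_each_entry() macro above walks two lists in lockstep and stops as soon as either cursor wraps back to its head. Here is a self-contained userspace sketch of the same idea; the node type and hand-rolled circular list are simplified stand-ins for struct list_head and the container_of() machinery.

    #include <stdio.h>

    struct node {
            struct node *next;
            int pmu_id;                     /* stands in for ...->pmu */
    };

    #define double_for_each(p1, p2, h1, h2)                         \
            for ((p1) = (h1)->next, (p2) = (h2)->next;              \
                 (p1) != (h1) && (p2) != (h2);                      \
                 (p1) = (p1)->next, (p2) = (p2)->next)

    int main(void)
    {
            struct node head1, head2;
            struct node a = { .pmu_id = 1 }, b = { .pmu_id = 2 };
            struct node c = { .pmu_id = 1 }, d = { .pmu_id = 2 };
            struct node *p1, *p2;

            /* head1 -> a -> b -> head1 and head2 -> c -> d -> head2 */
            head1.next = &a; a.next = &b; b.next = &head1;
            head2.next = &c; c.next = &d; d.next = &head2;

            double_for_each(p1, p2, &head1, &head2) {
                    /* mirrors the WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu) */
                    printf("pair %d <-> %d%s\n", p1->pmu_id, p2->pmu_id,
                           p1->pmu_id != p2->pmu_id ? " (mismatch!)" : "");
            }
            return 0;
    }
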
+ static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+ {
+       struct perf_event_pmu_context *pmu_ctx;
+       struct perf_cpu_pmu_context *cpc;
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+               if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+                       pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+       }
+ }
+ static void
+ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
  {
-       struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+       struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
-       struct perf_cpu_context *cpuctx;
        int do_switch = 1;
-       struct pmu *pmu;
  
        if (likely(!ctx))
                return;
  
-       pmu = ctx->pmu;
-       cpuctx = __get_cpu_context(ctx);
-       if (!cpuctx->task_ctx)
-               return;
        rcu_read_lock();
-       next_ctx = next->perf_event_ctxp[ctxn];
+       next_ctx = rcu_dereference(next->perf_event_ctxp);
        if (!next_ctx)
                goto unlock;
  
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {
  
-                       perf_pmu_disable(pmu);
+                       perf_ctx_disable(ctx);
  
                        /* PMIs are disabled; ctx->nr_pending is stable. */
                        if (local_read(&ctx->nr_pending) ||
                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);
  
-                       if (cpuctx->sched_cb_usage && pmu->sched_task)
-                               pmu->sched_task(ctx, false);
-                       /*
-                        * PMU specific parts of task perf context can require
-                        * additional synchronization. As an example of such
-                        * synchronization see implementation details of Intel
-                        * LBR call stack data profiling;
-                        */
-                       if (pmu->swap_task_ctx)
-                               pmu->swap_task_ctx(ctx, next_ctx);
-                       else
-                               swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+                       perf_ctx_sched_task_cb(ctx, false);
+                       perf_event_swap_task_ctx_data(ctx, next_ctx);
  
-                       perf_pmu_enable(pmu);
+                       perf_ctx_enable(ctx);
  
                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
-                       RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
-                       RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+                       RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+                       RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
  
                        do_switch = 0;
  
@@@ -3508,38 -3511,40 +3521,40 @@@ unlock
  
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
-               perf_pmu_disable(pmu);
+               perf_ctx_disable(ctx);
  
  inside_switch:
-               if (cpuctx->sched_cb_usage && pmu->sched_task)
-                       pmu->sched_task(ctx, false);
-               task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+               perf_ctx_sched_task_cb(ctx, false);
+               task_ctx_sched_out(ctx, EVENT_ALL);
  
-               perf_pmu_enable(pmu);
+               perf_ctx_enable(ctx);
                raw_spin_unlock(&ctx->lock);
        }
  }
  
  static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+ static DEFINE_PER_CPU(int, perf_sched_cb_usages);
  
  void perf_sched_cb_dec(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
  
        this_cpu_dec(perf_sched_cb_usages);
+       barrier();
  
-       if (!--cpuctx->sched_cb_usage)
-               list_del(&cpuctx->sched_cb_entry);
+       if (!--cpc->sched_cb_usage)
+               list_del(&cpc->sched_cb_entry);
  }
  
  
  void perf_sched_cb_inc(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
  
-       if (!cpuctx->sched_cb_usage++)
-               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+       if (!cpc->sched_cb_usage++)
+               list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
  
+       barrier();
        this_cpu_inc(perf_sched_cb_usages);
  }
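
The per-CPU perf_sched_cb_usages counter exists so the context-switch path can skip the callback list walk entirely when no PMU has registered a sched_task() callback. Below is a toy single-CPU model of that fast path; per-CPU storage and the barrier() around the update are kernel details deliberately left out of the sketch.

    #include <stdio.h>

    static int sched_cb_usages;             /* perf_sched_cb_usages analogue */

    static void sched_cb_inc(void) { sched_cb_usages++; }
    static void sched_cb_dec(void) { sched_cb_usages--; }

    static void context_switch_hook(void)
    {
            if (!sched_cb_usages)           /* common case: nothing registered */
                    return;
            printf("walk sched_cb_list and call pmu->sched_task()\n");
    }

    int main(void)
    {
            context_switch_hook();          /* skipped */
            sched_cb_inc();
            context_switch_hook();          /* walks the list */
            sched_cb_dec();
            context_switch_hook();          /* skipped again */
            return 0;
    }
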
  
   * PEBS requires this to provide PID/TID information. This requires we flush
   * all queued PEBS records before we context switch to a new task.
   */
- static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu;
  
-       pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+       pmu = cpc->epc.pmu;
  
+       /* software PMUs will not have sched_task */
        if (WARN_ON_ONCE(!pmu->sched_task))
                return;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);
  
-       pmu->sched_task(cpuctx->task_ctx, sched_in);
+       pmu->sched_task(cpc->task_epc, sched_in);
  
        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@@ -3573,26 -3580,20 +3590,20 @@@ static void perf_pmu_sched_task(struct 
                                struct task_struct *next,
                                bool sched_in)
  {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_cpu_pmu_context *cpc;
  
-       if (prev == next)
+       /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+       if (prev == next || cpuctx->task_ctx)
                return;
  
-       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-               /* will be handled in perf_event_context_sched_in/out */
-               if (cpuctx->task_ctx)
-                       continue;
-               __perf_pmu_sched_task(cpuctx, sched_in);
-       }
+       list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+               __perf_pmu_sched_task(cpc, sched_in);
  }
  
  static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);
  
- #define for_each_task_context_nr(ctxn)                                        \
-       for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
  /*
   * Called from scheduler to remove the events of the current task,
   * with interrupts disabled.
  void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
  {
-       int ctxn;
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);
  
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);
  
-       for_each_task_context_nr(ctxn)
-               perf_event_context_sched_out(task, ctxn, next);
+       perf_event_context_sched_out(task, next);
  
        /*
         * if cgroup events exist on this CPU, then we need
                perf_cgroup_switch(next);
  }
  
- /*
-  * Called with IRQs disabled
-  */
- static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type)
- {
-       ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
- }
  static bool perf_less_group_idx(const void *l, const void *r)
  {
        const struct perf_event *le = *(const struct perf_event **)l;
@@@ -3667,21 -3656,39 +3666,39 @@@ static void __heap_add(struct min_heap 
        }
  }
  
- static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+ static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+ {
+       struct perf_cpu_pmu_context *cpc;
+       if (!pmu_ctx->ctx->task)
+               return;
+       cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+       WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+       cpc->task_epc = pmu_ctx;
+ }
+ static noinline int visit_groups_merge(struct perf_event_context *ctx,
                                struct perf_event_groups *groups, int cpu,
+                               struct pmu *pmu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
  {
  #ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
  #endif
+       struct perf_cpu_context *cpuctx = NULL;
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;
  
-       if (cpuctx) {
+       if (pmu->filter && pmu->filter(pmu, cpu))
+               return 0;
+       if (!ctx->task) {
+               cpuctx = this_cpu_ptr(&perf_cpu_context);
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
-               __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+               __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
        }
        evt = event_heap.data;
  
-       __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+       __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
  
  #ifdef CONFIG_CGROUP_PERF
        for (; css; css = css->parent)
-               __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+               __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
  #endif
  
+       if (event_heap.nr) {
+               __link_epc((*evt)->pmu_ctx);
+               perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+       }
        min_heapify_all(&event_heap, &perf_min_heap);
  
        while (event_heap.nr) {
                if (ret)
                        return ret;
  
-               *evt = perf_event_groups_next(*evt);
+               *evt = perf_event_groups_next(*evt, pmu);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
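
The loop above is a k-way merge: the any-CPU iterator, the this-CPU iterator and one iterator per nested cgroup are each already sorted, and the globally smallest event is consumed on every step. An illustrative userspace version follows; the kernel keeps the iterator heads in a min-heap (min_heapify_all()/min_heapify()), while this sketch picks the smallest head by linear scan, which behaves the same for the handful of iterators involved.

    #include <stdio.h>

    struct iter {
            const int *ev;                  /* sorted "event ids", -1 terminated */
    };

    static int visit_merge(struct iter *it, int n,
                           int (*func)(int, void *), void *data)
    {
            for (;;) {
                    int best = -1;

                    for (int i = 0; i < n; i++) {
                            if (*it[i].ev < 0)
                                    continue;       /* iterator exhausted */
                            if (best < 0 || *it[i].ev < *it[best].ev)
                                    best = i;
                    }
                    if (best < 0)
                            return 0;               /* everything visited */

                    int ret = func(*it[best].ev, data);
                    if (ret)
                            return ret;             /* merge callback may stop early */
                    it[best].ev++;                  /* advance only the consumed iterator */
            }
    }

    static int print_event(int ev, void *data)
    {
            (void)data;
            printf("sched in event %d\n", ev);
            return 0;
    }

    int main(void)
    {
            const int any_cpu[]  = { 1, 4, 9, -1 };         /* cpu == -1 groups */
            const int this_cpu[] = { 2, 3, 7, -1 };         /* this CPU's groups */
            struct iter it[] = { { any_cpu }, { this_cpu } };

            return visit_merge(it, 2, print_event, NULL);
    }
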
@@@ -3761,7 -3773,6 +3783,6 @@@ static inline void group_update_userpag
  static int merge_sched_in(struct perf_event *event, void *data)
  {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int *can_add_hw = data;
  
        if (event->state <= PERF_EVENT_STATE_OFF)
        if (!event_filter_match(event))
                return 0;
  
-       if (group_can_go_on(event, cpuctx, *can_add_hw)) {
-               if (!group_sched_in(event, cpuctx, ctx))
+       if (group_can_go_on(event, *can_add_hw)) {
+               if (!group_sched_in(event, ctx))
                        list_add_tail(&event->active_list, get_event_list(event));
        }
  
                        perf_cgroup_event_disable(event, ctx);
                        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
                } else {
-                       ctx->rotate_necessary = 1;
-                       perf_mux_hrtimer_restart(cpuctx);
+                       struct perf_cpu_pmu_context *cpc;
+                       event->pmu_ctx->rotate_necessary = 1;
+                       cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+                       perf_mux_hrtimer_restart(cpc);
                        group_update_userpage(event);
                }
        }
        return 0;
  }
  
- static void
- ctx_pinned_sched_in(struct perf_event_context *ctx,
-                   struct perf_cpu_context *cpuctx)
+ static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
  {
+       struct perf_event_pmu_context *pmu_ctx;
        int can_add_hw = 1;
  
-       if (ctx != &cpuctx->ctx)
-               cpuctx = NULL;
-       visit_groups_merge(cpuctx, &ctx->pinned_groups,
-                          smp_processor_id(),
-                          merge_sched_in, &can_add_hw);
+       if (pmu) {
+               visit_groups_merge(ctx, &ctx->pinned_groups,
+                                  smp_processor_id(), pmu,
+                                  merge_sched_in, &can_add_hw);
+       } else {
+               list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+                       can_add_hw = 1;
+                       visit_groups_merge(ctx, &ctx->pinned_groups,
+                                          smp_processor_id(), pmu_ctx->pmu,
+                                          merge_sched_in, &can_add_hw);
+               }
+       }
  }
  
- static void
- ctx_flexible_sched_in(struct perf_event_context *ctx,
-                     struct perf_cpu_context *cpuctx)
+ static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
  {
+       struct perf_event_pmu_context *pmu_ctx;
        int can_add_hw = 1;
  
-       if (ctx != &cpuctx->ctx)
-               cpuctx = NULL;
+       if (pmu) {
+               visit_groups_merge(ctx, &ctx->flexible_groups,
+                                  smp_processor_id(), pmu,
+                                  merge_sched_in, &can_add_hw);
+       } else {
+               list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+                       can_add_hw = 1;
+                       visit_groups_merge(ctx, &ctx->flexible_groups,
+                                          smp_processor_id(), pmu_ctx->pmu,
+                                          merge_sched_in, &can_add_hw);
+               }
+       }
+ }
  
-       visit_groups_merge(cpuctx, &ctx->flexible_groups,
-                          smp_processor_id(),
-                          merge_sched_in, &can_add_hw);
+ static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+ {
+       ctx_flexible_sched_in(ctx, pmu);
  }
  
  static void
- ctx_sched_in(struct perf_event_context *ctx,
-            struct perf_cpu_context *cpuctx,
-            enum event_type_t event_type)
+ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        int is_active = ctx->is_active;
  
        lockdep_assert_held(&ctx->lock);
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
-               ctx_pinned_sched_in(ctx, cpuctx);
+               ctx_pinned_sched_in(ctx, NULL);
  
        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
-               ctx_flexible_sched_in(ctx, cpuctx);
+               ctx_flexible_sched_in(ctx, NULL);
  }
  
- static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-                            enum event_type_t event_type)
+ static void perf_event_context_sched_in(struct task_struct *task)
  {
-       struct perf_event_context *ctx = &cpuctx->ctx;
-       ctx_sched_in(ctx, cpuctx, event_type);
- }
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_context *ctx;
  
- static void perf_event_context_sched_in(struct perf_event_context *ctx,
-                                       struct task_struct *task)
- {
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
+       rcu_read_lock();
+       ctx = rcu_dereference(task->perf_event_ctxp);
+       if (!ctx)
+               goto rcu_unlock;
  
-       cpuctx = __get_cpu_context(ctx);
+       if (cpuctx->task_ctx == ctx) {
+               perf_ctx_lock(cpuctx, ctx);
+               perf_ctx_disable(ctx);
  
-       /*
-        * HACK: for HETEROGENEOUS the task context might have switched to a
-        * different PMU, force (re)set the context,
-        */
-       pmu = ctx->pmu = cpuctx->ctx.pmu;
+               perf_ctx_sched_task_cb(ctx, true);
  
-       if (cpuctx->task_ctx == ctx) {
-               if (cpuctx->sched_cb_usage)
-                       __perf_pmu_sched_task(cpuctx, true);
-               return;
+               perf_ctx_enable(ctx);
+               perf_ctx_unlock(cpuctx, ctx);
+               goto rcu_unlock;
        }
  
        perf_ctx_lock(cpuctx, ctx);
        if (!ctx->nr_events)
                goto unlock;
  
-       perf_pmu_disable(pmu);
+       perf_ctx_disable(ctx);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
-       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+               perf_ctx_disable(&cpuctx->ctx);
+               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+       }
        perf_event_sched_in(cpuctx, ctx);
  
-       if (cpuctx->sched_cb_usage && pmu->sched_task)
-               pmu->sched_task(cpuctx->task_ctx, true);
+       perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
  
-       perf_pmu_enable(pmu);
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+               perf_ctx_enable(&cpuctx->ctx);
+       perf_ctx_enable(ctx);
  
  unlock:
        perf_ctx_unlock(cpuctx, ctx);
+ rcu_unlock:
+       rcu_read_unlock();
  }
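
perf_event_context_sched_in() above preserves the priority order CPU pinned, task pinned, CPU flexible, task flexible, which is why the CPU flexible groups are flipped out first whenever the incoming task context carries pinned groups. A toy model of that ordering with an invented counter budget:

    #include <stdio.h>

    int main(void)
    {
            const char *class[] = {
                    "cpu pinned", "task pinned", "cpu flexible", "task flexible",
            };
            const int want[] = { 1, 2, 2, 2 };      /* groups wanting counters */
            int budget = 4;                         /* pretend PMU counters */

            for (int i = 0; i < 4; i++) {
                    int got = want[i] < budget ? want[i] : budget;

                    budget -= got;
                    printf("%-13s gets %d of %d group(s)\n", class[i], got, want[i]);
            }
            return 0;
    }
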
  
  /*
  void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
  {
-       struct perf_event_context *ctx;
-       int ctxn;
-       for_each_task_context_nr(ctxn) {
-               ctx = task->perf_event_ctxp[ctxn];
-               if (likely(!ctx))
-                       continue;
-               perf_event_context_sched_in(ctx, task);
-       }
+       perf_event_context_sched_in(task);
  
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);
@@@ -4063,8 -4082,8 +4092,8 @@@ static void perf_adjust_period(struct p
   * events. At the same time, make sure, having freq events does not change
   * the rate of unthrottling as that would introduce bias.
   */
 - static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 -                                            int needs_unthr)
 + static void
 + perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
  {
        struct perf_event *event;
        struct hw_perf_event *hwc;
         * - context have events in frequency mode (needs freq adjust)
         * - there are events to unthrottle on this cpu
         */
-       if (!(ctx->nr_freq || needs_unthr))
+       if (!(ctx->nr_freq || unthrottle))
                return;
  
        raw_spin_lock(&ctx->lock);
-       perf_pmu_disable(ctx->pmu);
  
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
  
+               // XXX use visit thingy to avoid the -1,cpu match
                if (!event_filter_match(event))
                        continue;
  
                perf_pmu_enable(event->pmu);
        }
  
-       perf_pmu_enable(ctx->pmu);
        raw_spin_unlock(&ctx->lock);
  }
  
@@@ -4148,72 -4166,109 +4176,109 @@@ static void rotate_ctx(struct perf_even
  
  /* pick an event from the flexible_groups to rotate */
  static inline struct perf_event *
- ctx_event_to_rotate(struct perf_event_context *ctx)
+ ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
  {
        struct perf_event *event;
+       struct rb_node *node;
+       struct rb_root *tree;
+       struct __group_key key = {
+               .pmu = pmu_ctx->pmu,
+       };
  
        /* pick the first active flexible event */
-       event = list_first_entry_or_null(&ctx->flexible_active,
+       event = list_first_entry_or_null(&pmu_ctx->flexible_active,
                                         struct perf_event, active_list);
+       if (event)
+               goto out;
  
        /* if no active flexible event, pick the first event */
-       if (!event) {
-               event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
-                                     typeof(*event), group_node);
-       }
+       tree = &pmu_ctx->ctx->flexible_groups.tree;
  
+       if (!pmu_ctx->ctx->task) {
+               key.cpu = smp_processor_id();
+               node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+               if (node)
+                       event = __node_2_pe(node);
+               goto out;
+       }
+       key.cpu = -1;
+       node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+       if (node) {
+               event = __node_2_pe(node);
+               goto out;
+       }
+       key.cpu = smp_processor_id();
+       node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+       if (node)
+               event = __node_2_pe(node);
+ out:
        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
-       ctx->rotate_necessary = 0;
+       pmu_ctx->rotate_necessary = 0;
  
        return event;
  }
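
Once an event is picked, rotate_ctx() requeues its group at the end of the ordering so groups that lost out on counters this time are tried first on the next rotation. The kernel rotates an rb-tree keyed by {pmu, cpu, group_index}; the plain array below is only meant to show the round-robin effect.

    #include <stdio.h>

    static void rotate(int *groups, int n)
    {
            int first = groups[0];

            for (int i = 1; i < n; i++)     /* shift the rest up ... */
                    groups[i - 1] = groups[i];
            groups[n - 1] = first;          /* ... and requeue the old head last */
    }

    int main(void)
    {
            int flexible[] = { 10, 20, 30, 40 };
            const int n = sizeof(flexible) / sizeof(flexible[0]);

            for (int round = 0; round < 3; round++) {
                    printf("round %d schedules group %d first\n",
                           round, flexible[0]);
                    rotate(flexible, n);
            }
            return 0;
    }
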
  
- static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
        struct perf_event *cpu_event = NULL, *task_event = NULL;
-       struct perf_event_context *task_ctx = NULL;
        int cpu_rotate, task_rotate;
+       struct pmu *pmu;
  
        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */
  
-       cpu_rotate = cpuctx->ctx.rotate_necessary;
-       task_ctx = cpuctx->task_ctx;
-       task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+       cpu_epc = &cpc->epc;
+       pmu = cpu_epc->pmu;
+       task_epc = cpc->task_epc;
+       cpu_rotate = cpu_epc->rotate_necessary;
+       task_rotate = task_epc ? task_epc->rotate_necessary : 0;
  
        if (!(cpu_rotate || task_rotate))
                return false;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-       perf_pmu_disable(cpuctx->ctx.pmu);
+       perf_pmu_disable(pmu);
  
        if (task_rotate)
-               task_event = ctx_event_to_rotate(task_ctx);
+               task_event = ctx_event_to_rotate(task_epc);
        if (cpu_rotate)
-               cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+               cpu_event = ctx_event_to_rotate(cpu_epc);
  
        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
-       if (task_event || (task_ctx && cpu_event))
-               ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
-       if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (task_event || (task_epc && cpu_event)) {
+               update_context_time(task_epc->ctx);
+               __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+       }
  
-       if (task_event)
-               rotate_ctx(task_ctx, task_event);
-       if (cpu_event)
+       if (cpu_event) {
+               update_context_time(&cpuctx->ctx);
+               __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                rotate_ctx(&cpuctx->ctx, cpu_event);
+               __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+       }
  
-       perf_event_sched_in(cpuctx, task_ctx);
+       if (task_event)
+               rotate_ctx(task_epc->ctx, task_event);
  
-       perf_pmu_enable(cpuctx->ctx.pmu);
+       if (task_event || (task_epc && cpu_event))
+               __pmu_ctx_sched_in(task_epc->ctx, pmu);
+       perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  
        return true;
  
  void perf_event_task_tick(void)
  {
-       struct list_head *head = this_cpu_ptr(&active_ctx_list);
-       struct perf_event_context *ctx, *tmp;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_context *ctx;
        int throttled;
  
        lockdep_assert_irqs_disabled();
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
  
-       list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
-               perf_adjust_freq_unthr_context(ctx, throttled);
+       perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+       rcu_read_lock();
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_adjust_freq_unthr_context(ctx, !!throttled);
+       rcu_read_unlock();
  }
  
  static int event_enable_on_exec(struct perf_event *event,
   * Enable all of a task's events that have been marked enable-on-exec.
   * This expects task == current.
   */
- static void perf_event_enable_on_exec(int ctxn)
+ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
  {
-       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event_context *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        int enabled = 0;
  
        local_irq_save(flags);
-       ctx = current->perf_event_ctxp[ctxn];
-       if (!ctx || !ctx->nr_events)
+       if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
+               goto out;
+       if (!ctx->nr_events)
                goto out;
  
-       cpuctx = __get_cpu_context(ctx);
+       cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
        }
        perf_ctx_unlock(cpuctx, ctx);
  
@@@ -4302,17 -4365,13 +4375,13 @@@ static void perf_event_exit_event(struc
   * Removes all events from the current task that have been marked
   * remove-on-exec, and feeds their values back to parent events.
   */
- static void perf_event_remove_on_exec(int ctxn)
+ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
  {
-       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event_context *clone_ctx = NULL;
        struct perf_event *event, *next;
        unsigned long flags;
        bool modified = false;
  
-       ctx = perf_pin_task_context(current, ctxn);
-       if (!ctx)
-               return;
        mutex_lock(&ctx->mutex);
  
        if (WARN_ON_ONCE(ctx->task != current))
        raw_spin_lock_irqsave(&ctx->lock, flags);
        if (modified)
                clone_ctx = unclone_ctx(ctx);
-       --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
  
  unlock:
        mutex_unlock(&ctx->mutex);
  
-       put_ctx(ctx);
        if (clone_ctx)
                put_ctx(clone_ctx);
  }
@@@ -4375,7 -4432,7 +4442,7 @@@ static void __perf_event_read(void *inf
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = event->pmu;
  
        /*
@@@ -4601,17 -4658,25 +4668,25 @@@ static void __perf_event_init_context(s
  {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
-       INIT_LIST_HEAD(&ctx->active_ctx_list);
+       INIT_LIST_HEAD(&ctx->pmu_ctx_list);
        perf_event_groups_init(&ctx->pinned_groups);
        perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
-       INIT_LIST_HEAD(&ctx->pinned_active);
-       INIT_LIST_HEAD(&ctx->flexible_active);
        refcount_set(&ctx->refcount, 1);
  }
  
+ static void
+ __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+ {
+       epc->pmu = pmu;
+       INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+       INIT_LIST_HEAD(&epc->pinned_active);
+       INIT_LIST_HEAD(&epc->flexible_active);
+       atomic_set(&epc->refcount, 1);
+ }
  static struct perf_event_context *
- alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+ alloc_perf_context(struct task_struct *task)
  {
        struct perf_event_context *ctx;
  
        __perf_event_init_context(ctx);
        if (task)
                ctx->task = get_task_struct(task);
-       ctx->pmu = pmu;
  
        return ctx;
  }
@@@ -4651,15 -4715,12 +4725,12 @@@ find_lively_task_by_vpid(pid_t vpid
   * Returns a matching context with refcount and pincount.
   */
  static struct perf_event_context *
- find_get_context(struct pmu *pmu, struct task_struct *task,
-               struct perf_event *event)
+ find_get_context(struct task_struct *task, struct perf_event *event)
  {
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
-       void *task_ctx_data = NULL;
        unsigned long flags;
-       int ctxn, err;
-       int cpu = event->cpu;
+       int err;
  
        if (!task) {
                /* Must be root to operate on a CPU event: */
                if (err)
                        return ERR_PTR(err);
  
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                raw_spin_lock_irqsave(&ctx->lock, flags);
        }
  
        err = -EINVAL;
-       ctxn = pmu->task_ctx_nr;
-       if (ctxn < 0)
-               goto errout;
-       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-               task_ctx_data = alloc_task_ctx_data(pmu);
-               if (!task_ctx_data) {
-                       err = -ENOMEM;
-                       goto errout;
-               }
-       }
  retry:
-       ctx = perf_lock_task_context(task, ctxn, &flags);
+       ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;
  
-               if (task_ctx_data && !ctx->task_ctx_data) {
-                       ctx->task_ctx_data = task_ctx_data;
-                       task_ctx_data = NULL;
-               }
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
  
                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
-               ctx = alloc_perf_context(pmu, task);
+               ctx = alloc_perf_context(task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
  
-               if (task_ctx_data) {
-                       ctx->task_ctx_data = task_ctx_data;
-                       task_ctx_data = NULL;
-               }
                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
-               else if (task->perf_event_ctxp[ctxn])
+               else if (task->perf_event_ctxp)
                        err = -EAGAIN;
                else {
                        get_ctx(ctx);
                        ++ctx->pin_count;
-                       rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+                       rcu_assign_pointer(task->perf_event_ctxp, ctx);
                }
                mutex_unlock(&task->perf_event_mutex);
  
                }
        }
  
-       free_task_ctx_data(pmu, task_ctx_data);
        return ctx;
  
  errout:
-       free_task_ctx_data(pmu, task_ctx_data);
        return ERR_PTR(err);
  }
  
+ static struct perf_event_pmu_context *
+ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+                    struct perf_event *event)
+ {
+       struct perf_event_pmu_context *new = NULL, *epc;
+       void *task_ctx_data = NULL;
+       if (!ctx->task) {
+               struct perf_cpu_pmu_context *cpc;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+               epc = &cpc->epc;
+               if (!epc->ctx) {
+                       atomic_set(&epc->refcount, 1);
+                       epc->embedded = 1;
+                       raw_spin_lock_irq(&ctx->lock);
+                       list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+                       epc->ctx = ctx;
+                       raw_spin_unlock_irq(&ctx->lock);
+               } else {
+                       WARN_ON_ONCE(epc->ctx != ctx);
+                       atomic_inc(&epc->refcount);
+               }
+               return epc;
+       }
+       new = kzalloc(sizeof(*epc), GFP_KERNEL);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+               task_ctx_data = alloc_task_ctx_data(pmu);
+               if (!task_ctx_data) {
+                       kfree(new);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+       __perf_init_event_pmu_context(new, pmu);
+       /*
+        * XXX
+        *
+        * lockdep_assert_held(&ctx->mutex);
+        *
+        * can't because perf_event_init_task() doesn't actually hold the
+        * child_ctx->mutex.
+        */
+       raw_spin_lock_irq(&ctx->lock);
+       list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               if (epc->pmu == pmu) {
+                       WARN_ON_ONCE(epc->ctx != ctx);
+                       atomic_inc(&epc->refcount);
+                       goto found_epc;
+               }
+       }
+       epc = new;
+       new = NULL;
+       list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+       epc->ctx = ctx;
+ found_epc:
+       if (task_ctx_data && !epc->task_ctx_data) {
+               epc->task_ctx_data = task_ctx_data;
+               task_ctx_data = NULL;
+               ctx->nr_task_data++;
+       }
+       raw_spin_unlock_irq(&ctx->lock);
+       free_task_ctx_data(pmu, task_ctx_data);
+       kfree(new);
+       return epc;
+ }
+ static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+ {
+       WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+ }
+ static void free_epc_rcu(struct rcu_head *head)
+ {
+       struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+       kfree(epc->task_ctx_data);
+       kfree(epc);
+ }
+ static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+ {
+       unsigned long flags;
+       if (!atomic_dec_and_test(&epc->refcount))
+               return;
+       if (epc->ctx) {
+               struct perf_event_context *ctx = epc->ctx;
+               /*
+                * XXX
+                *
+                * lockdep_assert_held(&ctx->mutex);
+                *
+                * can't because of the call-site in _free_event()/put_event()
+                * which isn't always called under ctx->mutex.
+                */
+               WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+               raw_spin_lock_irqsave(&ctx->lock, flags);
+               list_del_init(&epc->pmu_ctx_entry);
+               epc->ctx = NULL;
+               raw_spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+       WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+       WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+       if (epc->embedded)
+               return;
+       call_rcu(&epc->rcu_head, free_epc_rcu);
+ }
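
A userspace analogue of the get_pmu_ctx()/put_pmu_ctx() lifetime rules above (a sketch, not kernel code): the last put detaches the object from its context, and only dynamically allocated instances are freed, since "embedded" ones live inside the per-CPU perf_cpu_pmu_context and must outlive the refcount. The kernel additionally defers the free through call_rcu(), which a plain free() cannot model.

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct epc {
            atomic_int refcount;
            int embedded;                   /* 1: storage owned by someone else */
    };

    static struct epc *epc_alloc(void)
    {
            struct epc *epc = calloc(1, sizeof(*epc));

            atomic_store(&epc->refcount, 1);
            return epc;
    }

    static void epc_get(struct epc *epc)
    {
            atomic_fetch_add(&epc->refcount, 1);
    }

    static void epc_put(struct epc *epc)
    {
            if (atomic_fetch_sub(&epc->refcount, 1) != 1)
                    return;                 /* not the last reference */
            printf("last put: unlink from ctx\n");
            if (epc->embedded)
                    return;                 /* never freed, only detached */
            free(epc);                      /* kernel: call_rcu(..., free_epc_rcu) */
    }

    int main(void)
    {
            struct epc *epc = epc_alloc();

            epc_get(epc);                   /* a second event shares this epc */
            epc_put(epc);                   /* still referenced */
            epc_put(epc);                   /* last put: unlink and free */
            return 0;
    }
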
  static void perf_event_free_filter(struct perf_event *event);
  
  static void free_event_rcu(struct rcu_head *head)
  {
-       struct perf_event *event;
+       struct perf_event *event = container_of(head, typeof(*event), rcu_head);
  
-       event = container_of(head, struct perf_event, rcu_head);
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
@@@ -4893,7 -5058,7 +5068,7 @@@ static void perf_sched_delayed(struct w
   *
   *  1) cpu-wide events in the presence of per-task events,
   *  2) per-task events in the presence of cpu-wide events,
-  *  3) two matching events on the same context.
+  *  3) two matching events on the same perf_event_context.
   *
   * The former two cases are handled in the allocation path (perf_event_alloc(),
   * _free_event()), the latter -- before the first perf_install_in_context().
@@@ -5017,6 -5182,9 +5192,9 @@@ static void _free_event(struct perf_eve
        if (event->hw.target)
                put_task_struct(event->hw.target);
  
+       if (event->pmu_ctx)
+               put_pmu_ctx(event->pmu_ctx);
        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
@@@ -5117,8 -5285,8 +5295,8 @@@ int perf_event_release_kernel(struct pe
        LIST_HEAD(free_list);
  
        /*
-        * If we got here through err_file: fput(event_file); we will not have
-        * attached to a context yet.
+        * If we got here through err_alloc: free_event(event); we will not
+        * have attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
  
        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);
 -      perf_remove_from_context(event, DETACH_GROUP);
  
 -      raw_spin_lock_irq(&ctx->lock);
        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
 -      event->state = PERF_EVENT_STATE_DEAD;
 -      raw_spin_unlock_irq(&ctx->lock);
 +      perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
  
        perf_event_ctx_unlock(event, ctx);
  
@@@ -5550,7 -5721,7 +5728,7 @@@ static void __perf_event_period(struct 
  
        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
-               perf_pmu_disable(ctx->pmu);
+               perf_pmu_disable(event->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
  
        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
-               perf_pmu_enable(ctx->pmu);
+               perf_pmu_enable(event->pmu);
        }
  }
  
@@@ -6584,8 -6755,6 +6762,8 @@@ static void perf_pending_task(struct ca
        if (rctx >= 0)
                perf_swevent_put_recursion_context(rctx);
        preempt_enable_notrace();
 +
 +      put_event(event);
  }
  
  #ifdef CONFIG_GUEST_PERF_EVENTS
@@@ -7729,7 -7898,6 +7907,6 @@@ perf_iterate_sb(perf_iterate_f output, 
               struct perf_event_context *task_ctx)
  {
        struct perf_event_context *ctx;
-       int ctxn;
  
        rcu_read_lock();
        preempt_disable();
  
        perf_iterate_sb_cpu(output, data);
  
-       for_each_task_context_nr(ctxn) {
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx)
-                       perf_iterate_ctx(ctx, output, data, false);
-       }
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_iterate_ctx(ctx, output, data, false);
  done:
        preempt_enable();
        rcu_read_unlock();
@@@ -7792,20 -7958,17 +7967,17 @@@ static void perf_event_addr_filters_exe
  void perf_event_exec(void)
  {
        struct perf_event_context *ctx;
-       int ctxn;
  
-       for_each_task_context_nr(ctxn) {
-               perf_event_enable_on_exec(ctxn);
-               perf_event_remove_on_exec(ctxn);
+       ctx = perf_pin_task_context(current);
+       if (!ctx)
+               return;
  
-               rcu_read_lock();
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx) {
-                       perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
-                                        NULL, true);
-               }
-               rcu_read_unlock();
-       }
+       perf_event_enable_on_exec(ctx);
+       perf_event_remove_on_exec(ctx);
+       perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
  }
  
  struct remote_output {
@@@ -7845,8 -8008,7 +8017,7 @@@ static void __perf_event_output_stop(st
  static int __perf_pmu_output_stop(void *info)
  {
        struct perf_event *event = info;
-       struct pmu *pmu = event->ctx->pmu;
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct remote_output ro = {
                .rb     = event->rb,
        };
@@@ -8635,7 -8797,6 +8806,6 @@@ static void __perf_addr_filters_adjust(
  static void perf_addr_filters_adjust(struct vm_area_struct *vma)
  {
        struct perf_event_context *ctx;
-       int ctxn;
  
        /*
         * Data tracing isn't supported yet and as such there is no need
                return;
  
        rcu_read_lock();
-       for_each_task_context_nr(ctxn) {
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (!ctx)
-                       continue;
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
-       }
        rcu_read_unlock();
  }
  
@@@ -9039,7 -9196,7 +9205,7 @@@ static void perf_event_bpf_emit_ksymbol
                                PERF_RECORD_KSYMBOL_TYPE_BPF,
                                (u64)(unsigned long)subprog->bpf_func,
                                subprog->jited_len, unregister,
 -                              prog->aux->ksym.name);
 +                              subprog->aux->ksym.name);
                }
        }
  }
@@@ -9282,19 -9439,6 +9448,19 @@@ int perf_event_account_interrupt(struc
        return __perf_event_account_interrupt(event, 1);
  }
  
 +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
 +{
 +      /*
 +       * Due to interrupt latency (AKA "skid"), we may enter the
 +       * kernel before taking an overflow, even if the PMU is only
 +       * counting user events.
 +       */
 +      if (event->attr.exclude_kernel && !user_mode(regs))
 +              return false;
 +
 +      return true;
 +}
 +
  /*
   * Generic event overflow handling, sampling.
   */
@@@ -9329,38 -9473,15 +9495,38 @@@ static int __perf_event_overflow(struc
  
        if (event->attr.sigtrap) {
                /*
 -               * Should not be able to return to user space without processing
 -               * pending_sigtrap (kernel events can overflow multiple times).
 +               * The desired behaviour of sigtrap vs invalid samples is a bit
 +               * tricky; on the one hand, one should not lose the SIGTRAP if
 +               * it is the first event, on the other hand, we should also not
 +               * trigger the WARN or override the data address.
                 */
 -              WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
 +              bool valid_sample = sample_is_allowed(event, regs);
 +              unsigned int pending_id = 1;
 +
 +              if (regs)
 +                      pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
                if (!event->pending_sigtrap) {
 -                      event->pending_sigtrap = 1;
 +                      event->pending_sigtrap = pending_id;
                        local_inc(&event->ctx->nr_pending);
 +              } else if (event->attr.exclude_kernel && valid_sample) {
 +                      /*
 +                       * Should not be able to return to user space without
 +                       * consuming pending_sigtrap; with exceptions:
 +                       *
 +                       *  1. Where !exclude_kernel, events can overflow again
 +                       *     in the kernel without returning to user space.
 +                       *
 +                       *  2. Events that can overflow again before the IRQ-
 +                       *     work without user space progress (e.g. hrtimer).
 +                       *     To approximate progress (with false negatives),
 +                       *     check 32-bit hash of the current IP.
 +                       */
 +                      WARN_ON_ONCE(event->pending_sigtrap != pending_id);
                }
 -              event->pending_addr = data->addr;
 +
 +              event->pending_addr = 0;
 +              if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
 +                      event->pending_addr = data->addr;
                irq_work_queue(&event->pending_irq);
        }
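
The sigtrap handling above is what guarantees a task neither loses nor double-receives its SIGTRAP when the overflow skids into the kernel: pending_sigtrap now stores a hash of the interrupted IP as a cheap forward-progress check, and the data address is only reported for valid samples. Userspace sees none of this machinery; it just gets a SIGTRAP with si_code TRAP_PERF per overflow. A hedged sketch of such a consumer — the event choice, period and spin loop are arbitrary, and the attr.sigtrap/remove_on_exec bits need uapi headers from v5.13 or later:

/* gcc -o sigtrap_demo sigtrap_demo.c */
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef TRAP_PERF
#define TRAP_PERF 6     /* fallback for older libc headers */
#endif

static volatile sig_atomic_t hits;

static void on_sigtrap(int sig, siginfo_t *info, void *uc)
{
        (void)sig; (void)uc;
        if (info->si_code == TRAP_PERF)
                hits++;
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = on_sigtrap, .sa_flags = SA_SIGINFO };
        struct perf_event_attr attr;
        volatile unsigned long spin = 0;
        int fd;

        sigaction(SIGTRAP, &sa, NULL);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.sample_period = 10 * 1000 * 1000;  /* arbitrary */
        attr.exclude_kernel = 1;                /* the "skid" case handled above */
        attr.sigtrap = 1;
        attr.remove_on_exec = 1;                /* required together with sigtrap */

        fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        while (spin < 200 * 1000 * 1000)
                spin++;

        printf("received %d SIGTRAPs\n", (int)hits);
        close(fd);
        return 0;
}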
  
@@@ -9826,6 -9947,44 +9992,44 @@@ static struct pmu perf_swevent = 
  
  #ifdef CONFIG_EVENT_TRACING
  
+ static void tp_perf_event_destroy(struct perf_event *event)
+ {
+       perf_trace_destroy(event);
+ }
+
+ static int perf_tp_event_init(struct perf_event *event)
+ {
+       int err;
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -ENOENT;
+       /*
+        * no branch sampling for tracepoint events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+       err = perf_trace_init(event);
+       if (err)
+               return err;
+       event->destroy = tp_perf_event_destroy;
+       return 0;
+ }
+
+ static struct pmu perf_tracepoint = {
+       .task_ctx_nr    = perf_sw_context,
+       .event_init     = perf_tp_event_init,
+       .add            = perf_trace_add,
+       .del            = perf_trace_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+ };
+
  static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
  {
@@@ -9875,6 -10034,44 +10079,44 @@@ void perf_trace_run_bpf_submit(void *ra
  }
  EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
  
+ static void __perf_tp_event_target_task(u64 count, void *record,
+                                       struct pt_regs *regs,
+                                       struct perf_sample_data *data,
+                                       struct perf_event *event)
+ {
+       struct trace_entry *entry = record;
+       if (event->attr.config != entry->type)
+               return;
+       /* Cannot deliver synchronous signal to other task. */
+       if (event->attr.sigtrap)
+               return;
+       if (perf_tp_event_match(event, data, regs))
+               perf_swevent_event(event, count, data, regs);
+ }
+
+ static void perf_tp_event_target_task(u64 count, void *record,
+                                     struct pt_regs *regs,
+                                     struct perf_sample_data *data,
+                                     struct perf_event_context *ctx)
+ {
+       unsigned int cpu = smp_processor_id();
+       struct pmu *pmu = &perf_tracepoint;
+       struct perf_event *event, *sibling;
+       perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+               __perf_tp_event_target_task(count, record, regs, data, event);
+               for_each_sibling_event(sibling, event)
+                       __perf_tp_event_target_task(count, record, regs, data, sibling);
+       }
+       perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+               __perf_tp_event_target_task(count, record, regs, data, event);
+               for_each_sibling_event(sibling, event)
+                       __perf_tp_event_target_task(count, record, regs, data, sibling);
+       }
+ }
+
  void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
  
        perf_sample_data_init(&data, 0, 0);
        data.raw = &raw;
 +      data.sample_flags |= PERF_SAMPLE_RAW;
  
        perf_trace_buf_update(record, event_type);
  
         */
        if (task && task != current) {
                struct perf_event_context *ctx;
-               struct trace_entry *entry = record;
  
                rcu_read_lock();
-               ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+               ctx = rcu_dereference(task->perf_event_ctxp);
                if (!ctx)
                        goto unlock;
  
-               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-                       if (event->cpu != smp_processor_id())
-                               continue;
-                       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-                               continue;
-                       if (event->attr.config != entry->type)
-                               continue;
-                       /* Cannot deliver synchronous signal to other task. */
-                       if (event->attr.sigtrap)
-                               continue;
-                       if (perf_tp_event_match(event, &data, regs))
-                               perf_swevent_event(event, count, &data, regs);
-               }
+               raw_spin_lock(&ctx->lock);
+               perf_tp_event_target_task(count, record, regs, &data, ctx);
+               raw_spin_unlock(&ctx->lock);
  unlock:
                rcu_read_unlock();
        }
  }
  EXPORT_SYMBOL_GPL(perf_tp_event);
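
Besides moving the perf_tracepoint pmu definition up, the task != current path of perf_tp_event() above now takes ctx->lock and walks only the tracepoint events for this CPU via the group trees, instead of filtering the whole event_list. The userspace side of that path is simply a PERF_TYPE_TRACEPOINT event opened on another pid; a sketch, assuming the usual tracefs mount point and using sched:sched_switch purely as an example tracepoint:

/* gcc -o tp_count tp_count.c ; run as root: ./tp_count <pid> */
#include <linux/perf_event.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static long long tracepoint_id(const char *path)
{
        long long id = -1;
        FILE *f = fopen(path, "r");

        if (f) {
                fscanf(f, "%lld", &id);
                fclose(f);
        }
        return id;
}

int main(int argc, char **argv)
{
        /* tracefs may also be mounted under /sys/kernel/debug/tracing */
        const char *id_file = "/sys/kernel/tracing/events/sched/sched_switch/id";
        struct perf_event_attr attr;
        long long count, id = tracepoint_id(id_file);
        pid_t pid = argc > 1 ? atoi(argv[1]) : getpid();
        int fd;

        if (id < 0) {
                fprintf(stderr, "cannot read %s\n", id_file);
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_TRACEPOINT;
        attr.config = id;               /* tracepoint id from tracefs */

        fd = syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        sleep(1);
        read(fd, &count, sizeof(count));
        printf("pid %d: %lld sched_switch events in ~1s\n", pid, count);
        close(fd);
        return 0;
}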
  
- static void tp_perf_event_destroy(struct perf_event *event)
- {
-       perf_trace_destroy(event);
- }
- static int perf_tp_event_init(struct perf_event *event)
- {
-       int err;
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-               return -ENOENT;
-       /*
-        * no branch sampling for tracepoint events
-        */
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-       err = perf_trace_init(event);
-       if (err)
-               return err;
-       event->destroy = tp_perf_event_destroy;
-       return 0;
- }
- static struct pmu perf_tracepoint = {
-       .task_ctx_nr    = perf_sw_context,
-       .event_init     = perf_tp_event_init,
-       .add            = perf_trace_add,
-       .del            = perf_trace_del,
-       .start          = perf_swevent_start,
-       .stop           = perf_swevent_stop,
-       .read           = perf_swevent_read,
- };
  #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
  /*
   * Flags in config, used by dynamic PMU kprobe and uprobe
@@@ -11058,46 -11205,19 +11251,19 @@@ static int perf_event_idx_default(struc
        return 0;
  }
  
+ static void free_pmu_context(struct pmu *pmu)
+ {
+       free_percpu(pmu->cpu_pmu_context);
+ }
+
  /*
-  * Ensures all contexts with the same task_ctx_nr have the same
-  * pmu_cpu_context too.
+  * Let userspace know that this PMU supports address range filtering:
   */
- static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
+ static ssize_t nr_addr_filters_show(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *page)
  {
-       struct pmu *pmu;
-       if (ctxn < 0)
-               return NULL;
-       list_for_each_entry(pmu, &pmus, entry) {
-               if (pmu->task_ctx_nr == ctxn)
-                       return pmu->pmu_cpu_context;
-       }
-       return NULL;
- }
- static void free_pmu_context(struct pmu *pmu)
- {
-       /*
-        * Static contexts such as perf_sw_context have a global lifetime
-        * and may be shared between different PMUs. Avoid freeing them
-        * when a single PMU is going away.
-        */
-       if (pmu->task_ctx_nr > perf_invalid_context)
-               return;
-       free_percpu(pmu->pmu_cpu_context);
- }
- /*
-  * Let userspace know that this PMU supports address range filtering:
-  */
- static ssize_t nr_addr_filters_show(struct device *dev,
-                                   struct device_attribute *attr,
-                                   char *page)
- {
-       struct pmu *pmu = dev_get_drvdata(dev);
+       struct pmu *pmu = dev_get_drvdata(dev);
  
        return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
  }
@@@ -11151,12 -11271,11 +11317,11 @@@ perf_event_mux_interval_ms_store(struc
        /* update all cpuctx for this PMU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+               struct perf_cpu_pmu_context *cpc;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+               cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  
-               cpu_function_call(cpu,
-                       (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+               cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);
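
perf_event_mux_interval_ms_store() above now updates the per-CPU perf_cpu_pmu_context and re-arms each hrtimer through the typed perf_mux_hrtimer_restart_ipi() helper instead of casting the function pointer for cpu_function_call(). The sysfs knob itself is unchanged; a sketch that doubles the multiplexing interval of the core PMU — the PMU name "cpu" is the common x86 one and may be cpu_core/cpu_atom or an arch-specific name elsewhere:

/* gcc -o mux_interval mux_interval.c ; writing needs root */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        const char *path =
                "/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
        char buf[32];
        ssize_t n;
        int ms, fd = open(path, O_RDWR);

        if (fd < 0) {
                perror(path);
                return 1;
        }

        n = read(fd, buf, sizeof(buf) - 1);
        if (n <= 0) {
                perror("read");
                return 1;
        }
        buf[n] = '\0';
        ms = atoi(buf);
        printf("current mux interval: %d ms\n", ms);

        /* Double it; the store path re-arms the hrtimer on every online CPU. */
        n = snprintf(buf, sizeof(buf), "%d", ms * 2);
        lseek(fd, 0, SEEK_SET);
        if (write(fd, buf, n) != n)
                perror("write");

        close(fd);
        return 0;
}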
@@@ -11193,13 -11312,15 +11358,15 @@@ static int pmu_dev_alloc(struct pmu *pm
  
        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);
-       ret = dev_set_name(pmu->dev, "%s", pmu->name);
-       if (ret)
-               goto free_dev;
  
        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->release = pmu_dev_release;
+       ret = dev_set_name(pmu->dev, "%s", pmu->name);
+       if (ret)
+               goto free_dev;
        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;
@@@ -11267,47 -11388,19 +11434,19 @@@ int perf_pmu_register(struct pmu *pmu, 
        }
  
  skip_type:
-       if (pmu->task_ctx_nr == perf_hw_context) {
-               static int hw_context_taken = 0;
-               /*
-                * Other than systems with heterogeneous CPUs, it never makes
-                * sense for two PMUs to share perf_hw_context. PMUs which are
-                * uncore must use perf_invalid_context.
-                */
-               if (WARN_ON_ONCE(hw_context_taken &&
-                   !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
-                       pmu->task_ctx_nr = perf_invalid_context;
-               hw_context_taken = 1;
-       }
-       pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
-       if (pmu->pmu_cpu_context)
-               goto got_cpu_context;
        ret = -ENOMEM;
-       pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
-       if (!pmu->pmu_cpu_context)
+       pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+       if (!pmu->cpu_pmu_context)
                goto free_dev;
  
        for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               __perf_event_init_context(&cpuctx->ctx);
-               lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
-               lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
-               cpuctx->ctx.pmu = pmu;
-               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+               struct perf_cpu_pmu_context *cpc;
  
-               __perf_mux_hrtimer_init(cpuctx, cpu);
-               cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
-               cpuctx->heap = cpuctx->heap_default;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+               __perf_init_event_pmu_context(&cpc->epc, pmu);
+               __perf_mux_hrtimer_init(cpc, cpu);
        }
  
- got_cpu_context:
        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
@@@ -11786,10 -11879,11 +11925,11 @@@ perf_event_alloc(struct perf_event_att
        }
  
        /*
-        * Disallow uncore-cgroup events, they don't make sense as the cgroup will
-        * be different on other CPUs in the uncore mask.
+        * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+        * events (they don't make sense as the cgroup will be different
+        * on other CPUs in the uncore mask).
         */
-       if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+       if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
                err = -EINVAL;
                goto err_pmu;
        }
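
The added task check means an uncore-style PMU (task_ctx_nr == perf_invalid_context) now rejects per-task events up front with -EINVAL, alongside the existing cgroup restriction. A sketch of the visible effect; the "uncore" name prefix is an Intel convention, and config 0 is only a placeholder that a given driver may itself refuse, so treat the exact errno of the first open as informational:

/* gcc -o uncore_task uncore_task.c ; needs root and an uncore PMU */
#include <dirent.h>
#include <errno.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int find_uncore_type(void)
{
        const char *base = "/sys/bus/event_source/devices";
        char path[512];
        struct dirent *de;
        DIR *dir = opendir(base);
        int type = -1;

        while (dir && (de = readdir(dir)) != NULL) {
                FILE *f;

                if (strncmp(de->d_name, "uncore", 6))
                        continue;
                snprintf(path, sizeof(path), "%s/%s/type", base, de->d_name);
                f = fopen(path, "r");
                if (f && fscanf(f, "%d", &type) == 1) {
                        printf("using PMU %s (type %d)\n", de->d_name, type);
                        fclose(f);
                        break;
                }
                if (f)
                        fclose(f);
        }
        if (dir)
                closedir(dir);
        return type;
}

int main(void)
{
        struct perf_event_attr attr;
        int type = find_uncore_type();
        int fd;

        if (type < 0) {
                fprintf(stderr, "no uncore PMU found\n");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0;        /* placeholder event code, see the PMU's events/ dir */

        /* CPU-wide open: the supported way to use an uncore PMU. */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        printf("cpu-wide open:   %s\n", fd >= 0 ? "ok" : strerror(errno));
        if (fd >= 0)
                close(fd);

        /* Task-bound open: now rejected in perf_event_alloc(). */
        fd = syscall(__NR_perf_event_open, &attr, getpid(), 0, -1, 0);
        printf("task-bound open: %s\n", fd >= 0 ? "unexpectedly ok" : strerror(errno));
        if (fd >= 0)
                close(fd);
        return 0;
}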
@@@ -12136,37 -12230,6 +12276,6 @@@ static int perf_event_set_clock(struct 
        return 0;
  }
  
- /*
-  * Variation on perf_event_ctx_lock_nested(), except we take two context
-  * mutexes.
-  */
- static struct perf_event_context *
- __perf_event_ctx_lock_double(struct perf_event *group_leader,
-                            struct perf_event_context *ctx)
- {
-       struct perf_event_context *gctx;
- again:
-       rcu_read_lock();
-       gctx = READ_ONCE(group_leader->ctx);
-       if (!refcount_inc_not_zero(&gctx->refcount)) {
-               rcu_read_unlock();
-               goto again;
-       }
-       rcu_read_unlock();
-       mutex_lock_double(&gctx->mutex, &ctx->mutex);
-       if (group_leader->ctx != gctx) {
-               mutex_unlock(&ctx->mutex);
-               mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
-               goto again;
-       }
-       return gctx;
- }
  static bool
  perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
  {
@@@ -12212,9 -12275,10 +12321,10 @@@ SYSCALL_DEFINE5(perf_event_open
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
  {
        struct perf_event *group_leader = NULL, *output_event = NULL;
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
-       struct perf_event_context *ctx, *gctx;
+       struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;
  
-       if (group_leader) {
-               if (is_software_event(event) &&
-                   !in_software_context(group_leader)) {
-                       /*
-                        * If the event is a sw event, but the group_leader
-                        * is on hw context.
-                        *
-                        * Allow the addition of software events to hw
-                        * groups, this is safe because software events
-                        * never fail to schedule.
-                        */
-                       pmu = group_leader->ctx->pmu;
-               } else if (!is_software_event(event) &&
-                          is_software_event(group_leader) &&
-                          (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
-                       /*
-                        * In case the group is a pure software group, and we
-                        * try to add a hardware event, move the whole group to
-                        * the hardware context.
-                        */
-                       move_group = 1;
-               }
+       if (task) {
+               err = down_read_interruptible(&task->signal->exec_update_lock);
+               if (err)
+                       goto err_alloc;
+               /*
+                * We must hold exec_update_lock across this and any potential
+                * perf_install_in_context() call for this new event to
+                * serialize against exec() altering our credentials (and the
+                * perf_event_exit_task() that could imply).
+                */
+               err = -EACCES;
+               if (!perf_check_permission(&attr, task))
+                       goto err_cred;
        }
  
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(pmu, task, event);
+       ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_alloc;
+               goto err_cred;
+       }
+       mutex_lock(&ctx->mutex);
+       if (ctx->task == TASK_TOMBSTONE) {
+               err = -ESRCH;
+               goto err_locked;
+       }
+       if (!task) {
+               /*
+                * Check if the @cpu we're creating an event for is online.
+                *
+                * We use the perf_cpu_context::ctx::mutex to serialize against
+                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+                */
+               struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+               if (!cpuctx->online) {
+                       err = -ENODEV;
+                       goto err_locked;
+               }
        }
  
-       /*
-        * Look up the group leader (we will attach this event to it):
-        */
        if (group_leader) {
                err = -EINVAL;
  
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
-                       goto err_context;
+                       goto err_locked;
  
                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
-                       goto err_context;
+                       goto err_locked;
  
                /*
                 * Make sure we're both events for the same CPU;
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
-                       goto err_context;
-               /*
-                * Make sure we're both on the same task, or both
-                * per-CPU events.
-                */
-               if (group_leader->ctx->task != ctx->task)
-                       goto err_context;
+                       goto err_locked;
  
                /*
-                * Do not allow to attach to a group in a different task
-                * or CPU context. If we're moving SW events, we'll fix
-                * this up later, so allow that.
-                *
-                * Racy, not holding group_leader->ctx->mutex, see comment with
-                * perf_event_ctx_lock().
+                * Make sure we're both on the same context; either task or cpu.
                 */
-               if (!move_group && group_leader->ctx != ctx)
-                       goto err_context;
+               if (group_leader->ctx != ctx)
+                       goto err_locked;
  
                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
-                       goto err_context;
-       }
-       if (output_event) {
-               err = perf_event_set_output(event, output_event);
-               if (err)
-                       goto err_context;
-       }
-       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
-                                       f_flags);
-       if (IS_ERR(event_file)) {
-               err = PTR_ERR(event_file);
-               event_file = NULL;
-               goto err_context;
-       }
-       if (task) {
-               err = down_read_interruptible(&task->signal->exec_update_lock);
-               if (err)
-                       goto err_file;
-               /*
-                * We must hold exec_update_lock across this and any potential
-                * perf_install_in_context() call for this new event to
-                * serialize against exec() altering our credentials (and the
-                * perf_event_exit_task() that could imply).
-                */
-               err = -EACCES;
-               if (!perf_check_permission(&attr, task))
-                       goto err_cred;
-       }
-       if (move_group) {
-               gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-               if (gctx->task == TASK_TOMBSTONE) {
-                       err = -ESRCH;
                        goto err_locked;
-               }
  
-               /*
-                * Check if we raced against another sys_perf_event_open() call
-                * moving the software group underneath us.
-                */
-               if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+               if (is_software_event(event) &&
+                   !in_software_context(group_leader)) {
                        /*
-                        * If someone moved the group out from under us, check
-                        * if this new event wound up on the same ctx, if so
-                        * its the regular !move_group case, otherwise fail.
+                        * If the event is a sw event, but the group_leader
+                        * is on hw context.
+                        *
+                        * Allow the addition of software events to hw
+                        * groups, this is safe because software events
+                        * never fail to schedule.
+                        *
+                        * Note the comment that goes with struct
+                        * perf_event_pmu_context.
                         */
-                       if (gctx != ctx) {
-                               err = -EINVAL;
-                               goto err_locked;
-                       } else {
-                               perf_event_ctx_unlock(group_leader, gctx);
-                               move_group = 0;
-                               goto not_move_group;
+                       pmu = group_leader->pmu_ctx->pmu;
+               } else if (!is_software_event(event)) {
+                       if (is_software_event(group_leader) &&
+                           (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+                               /*
+                                * In case the group is a pure software group, and we
+                                * try to add a hardware event, move the whole group to
+                                * the hardware context.
+                                */
+                               move_group = 1;
                        }
-               }
-               /*
-                * Failure to create exclusive events returns -EBUSY.
-                */
-               err = -EBUSY;
-               if (!exclusive_event_installable(group_leader, ctx))
-                       goto err_locked;
  
-               for_each_sibling_event(sibling, group_leader) {
-                       if (!exclusive_event_installable(sibling, ctx))
+                       /* Don't allow a group of multiple hw events from different pmus */
+                       if (!in_software_context(group_leader) &&
+                           group_leader->pmu_ctx->pmu != pmu)
                                goto err_locked;
                }
-       } else {
-               mutex_lock(&ctx->mutex);
-               /*
-                * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
-                * see the group_leader && !move_group test earlier.
-                */
-               if (group_leader && group_leader->ctx != ctx) {
-                       err = -EINVAL;
-                       goto err_locked;
-               }
        }
- not_move_group:
  
-       if (ctx->task == TASK_TOMBSTONE) {
-               err = -ESRCH;
+       /*
+        * Now that we're certain of the pmu; find the pmu_ctx.
+        */
+       pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+       if (IS_ERR(pmu_ctx)) {
+               err = PTR_ERR(pmu_ctx);
                goto err_locked;
        }
+       event->pmu_ctx = pmu_ctx;
  
-       if (!perf_event_validate_size(event)) {
-               err = -E2BIG;
-               goto err_locked;
+       if (output_event) {
+               err = perf_event_set_output(event, output_event);
+               if (err)
+                       goto err_context;
        }
  
-       if (!task) {
-               /*
-                * Check if the @cpu we're creating an event for is online.
-                *
-                * We use the perf_cpu_context::ctx::mutex to serialize against
-                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
-                */
-               struct perf_cpu_context *cpuctx =
-                       container_of(ctx, struct perf_cpu_context, ctx);
-               if (!cpuctx->online) {
-                       err = -ENODEV;
-                       goto err_locked;
-               }
+       if (!perf_event_validate_size(event)) {
+               err = -E2BIG;
+               goto err_context;
        }
  
        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
-               goto err_locked;
+               goto err_context;
        }
  
        /*
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
-               goto err_locked;
+               goto err_context;
        }
  
        WARN_ON_ONCE(ctx->parent_ctx);
  
+       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+       if (IS_ERR(event_file)) {
+               err = PTR_ERR(event_file);
+               event_file = NULL;
+               goto err_context;
+       }
        /*
         * This is the point on no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */
  
        if (move_group) {
-               /*
-                * See perf_event_ctx_lock() for comments on the details
-                * of swizzling perf_event::ctx.
-                */
                perf_remove_from_context(group_leader, 0);
-               put_ctx(gctx);
+               put_pmu_ctx(group_leader->pmu_ctx);
  
                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
-                       put_ctx(gctx);
+                       put_pmu_ctx(sibling->pmu_ctx);
                }
  
-               /*
-                * Wait for everybody to stop referencing the events through
-                * the old lists, before installing it on new lists.
-                */
-               synchronize_rcu();
                /*
                 * Install the group siblings before the group leader.
                 *
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
+                       sibling->pmu_ctx = pmu_ctx;
+                       get_pmu_ctx(pmu_ctx);
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
-                       get_ctx(ctx);
                }
  
                /*
                 * event. What we want here is event in the initial
                 * startup state, ready to be add into new context.
                 */
+               group_leader->pmu_ctx = pmu_ctx;
+               get_pmu_ctx(pmu_ctx);
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
-               get_ctx(ctx);
        }
  
        /*
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
  
-       if (move_group)
-               perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
  
        if (task) {
        fd_install(event_fd, event_file);
        return event_fd;
  
+ err_context:
+       /* event->pmu_ctx freed by free_event() */
  err_locked:
-       if (move_group)
-               perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
  err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
- err_file:
-       fput(event_file);
- err_context:
-       perf_unpin_context(ctx);
-       put_ctx(ctx);
  err_alloc:
-       /*
-        * If event_file is set, the fput() above will have called ->release()
-        * and that will take care of freeing the event.
-        */
-       if (!event_file)
-               free_event(event);
+       free_event(event);
  err_task:
        if (task)
                put_task_struct(task);
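
With the single-context design the group checks above all run under one ctx->mutex, the __perf_event_ctx_lock_double() dance is gone, and hardware events from different PMUs can no longer be grouped, while software events may still join a hardware group (they never fail to schedule). A sketch of the still-supported combination — one hardware leader plus one software sibling, read atomically with PERF_FORMAT_GROUP; the event choices and spin loop are arbitrary:

/* gcc -o hw_sw_group hw_sw_group.c */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_event(struct perf_event_attr *attr, int group_fd)
{
        return syscall(__NR_perf_event_open, attr, 0 /* self */, -1, group_fd, 0);
}

int main(void)
{
        struct perf_event_attr hw, sw;
        struct { uint64_t nr, vals[2]; } out;   /* PERF_FORMAT_GROUP layout */
        volatile unsigned long spin;
        int leader, member;

        memset(&hw, 0, sizeof(hw));
        hw.size = sizeof(hw);
        hw.type = PERF_TYPE_HARDWARE;
        hw.config = PERF_COUNT_HW_CPU_CYCLES;
        hw.disabled = 1;
        hw.read_format = PERF_FORMAT_GROUP;

        memset(&sw, 0, sizeof(sw));
        sw.size = sizeof(sw);
        sw.type = PERF_TYPE_SOFTWARE;           /* sw event in a hw group: allowed */
        sw.config = PERF_COUNT_SW_PAGE_FAULTS;
        sw.disabled = 1;

        leader = open_event(&hw, -1);
        member = open_event(&sw, leader);       /* a second, different hw PMU here would get -EINVAL */
        if (leader < 0 || member < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
        for (spin = 0; spin < 50 * 1000 * 1000; spin++)
                ;
        ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

        read(leader, &out, sizeof(out));
        printf("group of %llu: cycles=%llu page-faults=%llu\n",
               (unsigned long long)out.nr,
               (unsigned long long)out.vals[0], (unsigned long long)out.vals[1]);

        close(member);
        close(leader);
        return 0;
}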
@@@ -12683,8 -12678,10 +12724,10 @@@ perf_event_create_kernel_counter(struc
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
  {
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event_context *ctx;
        struct perf_event *event;
+       struct pmu *pmu;
        int err;
  
        /*
  
        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
+       pmu = event->pmu;
+       if (pmu->task_ctx_nr == perf_sw_context)
+               event->event_caps |= PERF_EV_CAP_SOFTWARE;
  
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(event->pmu, task, event);
+       ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_free;
+               goto err_alloc;
        }
  
        WARN_ON_ONCE(ctx->parent_ctx);
                goto err_unlock;
        }
  
+       pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+       if (IS_ERR(pmu_ctx)) {
+               err = PTR_ERR(pmu_ctx);
+               goto err_unlock;
+       }
+       event->pmu_ctx = pmu_ctx;
        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
-                       goto err_unlock;
+                       goto err_pmu_ctx;
                }
        }
  
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
-               goto err_unlock;
+               goto err_pmu_ctx;
        }
  
        perf_install_in_context(ctx, event, event->cpu);
  
        return event;
  
+ err_pmu_ctx:
+       put_pmu_ctx(pmu_ctx);
  err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
- err_free:
+ err_alloc:
        free_event(event);
  err:
        return ERR_PTR(err);
  }
  EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
  
- void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+ static void __perf_pmu_remove(struct perf_event_context *ctx,
+                             int cpu, struct pmu *pmu,
+                             struct perf_event_groups *groups,
+                             struct list_head *events)
  {
-       struct perf_event_context *src_ctx;
-       struct perf_event_context *dst_ctx;
-       struct perf_event *event, *tmp;
-       LIST_HEAD(events);
-       src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
-       dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+       struct perf_event *event, *sibling;
  
-       /*
-        * See perf_event_ctx_lock() for comments on the details
-        * of swizzling perf_event::ctx.
-        */
-       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
-       list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
-                                event_entry) {
+       perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
                perf_remove_from_context(event, 0);
-               unaccount_event_cpu(event, src_cpu);
-               put_ctx(src_ctx);
-               list_add(&event->migrate_entry, &events);
+               unaccount_event_cpu(event, cpu);
+               put_pmu_ctx(event->pmu_ctx);
+               list_add(&event->migrate_entry, events);
+               for_each_sibling_event(sibling, event) {
+                       perf_remove_from_context(sibling, 0);
+                       unaccount_event_cpu(sibling, cpu);
+                       put_pmu_ctx(sibling->pmu_ctx);
+                       list_add(&sibling->migrate_entry, events);
+               }
        }
+ }
  
-       /*
-        * Wait for the events to quiesce before re-instating them.
-        */
-       synchronize_rcu();
+ static void __perf_pmu_install_event(struct pmu *pmu,
+                                    struct perf_event_context *ctx,
+                                    int cpu, struct perf_event *event)
+ {
+       struct perf_event_pmu_context *epc;
+       event->cpu = cpu;
+       epc = find_get_pmu_context(pmu, ctx, event);
+       event->pmu_ctx = epc;
+       if (event->state >= PERF_EVENT_STATE_OFF)
+               event->state = PERF_EVENT_STATE_INACTIVE;
+       account_event_cpu(event, cpu);
+       perf_install_in_context(ctx, event, cpu);
+ }
+
+ static void __perf_pmu_install(struct perf_event_context *ctx,
+                              int cpu, struct pmu *pmu, struct list_head *events)
+ {
+       struct perf_event *event, *tmp;
  
        /*
         * Re-instate events in 2 passes.
         * leader will enable its siblings, even if those are still on the old
         * context.
         */
-       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+       list_for_each_entry_safe(event, tmp, events, migrate_entry) {
                if (event->group_leader == event)
                        continue;
  
                list_del(&event->migrate_entry);
-               if (event->state >= PERF_EVENT_STATE_OFF)
-                       event->state = PERF_EVENT_STATE_INACTIVE;
-               account_event_cpu(event, dst_cpu);
-               perf_install_in_context(dst_ctx, event, dst_cpu);
-               get_ctx(dst_ctx);
+               __perf_pmu_install_event(pmu, ctx, cpu, event);
        }
  
        /*
         * Once all the siblings are setup properly, install the group leaders
         * to make it go.
         */
-       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+       list_for_each_entry_safe(event, tmp, events, migrate_entry) {
                list_del(&event->migrate_entry);
-               if (event->state >= PERF_EVENT_STATE_OFF)
-                       event->state = PERF_EVENT_STATE_INACTIVE;
-               account_event_cpu(event, dst_cpu);
-               perf_install_in_context(dst_ctx, event, dst_cpu);
-               get_ctx(dst_ctx);
+               __perf_pmu_install_event(pmu, ctx, cpu, event);
        }
+ }
+
+ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+ {
+       struct perf_event_context *src_ctx, *dst_ctx;
+       LIST_HEAD(events);
+       src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+       dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+       /*
+        * See perf_event_ctx_lock() for comments on the details
+        * of swizzling perf_event::ctx.
+        */
+       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+       __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+       __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+       /*
+        * Wait for the events to quiesce before re-instating them.
+        */
+       synchronize_rcu();
+       __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
  }
@@@ -12896,14 -12939,14 +12985,14 @@@ perf_event_exit_event(struct perf_even
        perf_event_wakeup(event);
  }
  
- static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+ static void perf_event_exit_task_context(struct task_struct *child)
  {
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;
  
        WARN_ON_ONCE(child != current);
  
-       child_ctx = perf_pin_task_context(child, ctxn);
+       child_ctx = perf_pin_task_context(child);
        if (!child_ctx)
                return;
  
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
-       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+       task_ctx_sched_out(child_ctx, EVENT_ALL);
  
        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
-       RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+       RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */
  void perf_event_exit_task(struct task_struct *child)
  {
        struct perf_event *event, *tmp;
-       int ctxn;
  
        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
        }
        mutex_unlock(&child->perf_event_mutex);
  
-       for_each_task_context_nr(ctxn)
-               perf_event_exit_task_context(child, ctxn);
+       perf_event_exit_task_context(child);
  
        /*
         * The perf_event_exit_task_context calls perf_event_task
@@@ -13026,56 -13067,51 +13113,51 @@@ void perf_event_free_task(struct task_s
  {
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;
-       int ctxn;
  
-       for_each_task_context_nr(ctxn) {
-               ctx = task->perf_event_ctxp[ctxn];
-               if (!ctx)
-                       continue;
+       ctx = rcu_access_pointer(task->perf_event_ctxp);
+       if (!ctx)
+               return;
  
-               mutex_lock(&ctx->mutex);
-               raw_spin_lock_irq(&ctx->lock);
-               /*
-                * Destroy the task <-> ctx relation and mark the context dead.
-                *
-                * This is important because even though the task hasn't been
-                * exposed yet the context has been (through child_list).
-                */
-               RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
-               WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
-               put_task_struct(task); /* cannot be last */
-               raw_spin_unlock_irq(&ctx->lock);
+       mutex_lock(&ctx->mutex);
+       raw_spin_lock_irq(&ctx->lock);
+       /*
+        * Destroy the task <-> ctx relation and mark the context dead.
+        *
+        * This is important because even though the task hasn't been
+        * exposed yet the context has been (through child_list).
+        */
+       RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+       WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+       put_task_struct(task); /* cannot be last */
+       raw_spin_unlock_irq(&ctx->lock);
  
-               list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
-                       perf_free_event(event, ctx);
  
-               mutex_unlock(&ctx->mutex);
+       list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+               perf_free_event(event, ctx);
  
-               /*
-                * perf_event_release_kernel() could've stolen some of our
-                * child events and still have them on its free_list. In that
-                * case we must wait for these events to have been freed (in
-                * particular all their references to this task must've been
-                * dropped).
-                *
-                * Without this copy_process() will unconditionally free this
-                * task (irrespective of its reference count) and
-                * _free_event()'s put_task_struct(event->hw.target) will be a
-                * use-after-free.
-                *
-                * Wait for all events to drop their context reference.
-                */
-               wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
-               put_ctx(ctx); /* must be last */
-       }
+       mutex_unlock(&ctx->mutex);
+       /*
+        * perf_event_release_kernel() could've stolen some of our
+        * child events and still have them on its free_list. In that
+        * case we must wait for these events to have been freed (in
+        * particular all their references to this task must've been
+        * dropped).
+        *
+        * Without this copy_process() will unconditionally free this
+        * task (irrespective of its reference count) and
+        * _free_event()'s put_task_struct(event->hw.target) will be a
+        * use-after-free.
+        *
+        * Wait for all events to drop their context reference.
+        */
+       wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+       put_ctx(ctx); /* must be last */
  }
  
  void perf_event_delayed_put(struct task_struct *task)
  {
-       int ctxn;
-       for_each_task_context_nr(ctxn)
-               WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+       WARN_ON_ONCE(task->perf_event_ctxp);
  }
  
  struct file *perf_event_get(unsigned int fd)
@@@ -13125,6 -13161,7 +13207,7 @@@ inherit_event(struct perf_event *parent
              struct perf_event_context *child_ctx)
  {
        enum perf_event_state parent_state = parent_event->state;
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *child_event;
        unsigned long flags;
  
        if (IS_ERR(child_event))
                return child_event;
  
-       if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
-           !child_ctx->task_ctx_data) {
-               struct pmu *pmu = child_event->pmu;
-               child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
-               if (!child_ctx->task_ctx_data) {
-                       free_event(child_event);
-                       return ERR_PTR(-ENOMEM);
-               }
+       pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+       if (IS_ERR(pmu_ctx)) {
+               free_event(child_event);
+               return NULL;
        }
+       child_event->pmu_ctx = pmu_ctx;
  
        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@@ -13278,11 -13310,11 +13356,11 @@@ static int inherit_group(struct perf_ev
  static int
  inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
-                  struct task_struct *child, int ctxn,
+                  struct task_struct *child,
                   u64 clone_flags, int *inherited_all)
  {
-       int ret;
        struct perf_event_context *child_ctx;
+       int ret;
  
        if (!event->attr.inherit ||
            (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
                return 0;
        }
  
-       child_ctx = child->perf_event_ctxp[ctxn];
+       child_ctx = child->perf_event_ctxp;
        if (!child_ctx) {
                /*
                 * This is executed from the parent task context, so
                 * First allocate and initialize a context for the
                 * child.
                 */
-               child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+               child_ctx = alloc_perf_context(child);
                if (!child_ctx)
                        return -ENOMEM;
  
-               child->perf_event_ctxp[ctxn] = child_ctx;
+               child->perf_event_ctxp = child_ctx;
        }
  
-       ret = inherit_group(event, parent, parent_ctx,
-                           child, child_ctx);
+       ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
        if (ret)
                *inherited_all = 0;
  
  /*
   * Initialize the perf_event context in task_struct
   */
- static int perf_event_init_context(struct task_struct *child, int ctxn,
-                                  u64 clone_flags)
+ static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
  {
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        unsigned long flags;
        int ret = 0;
  
-       if (likely(!parent->perf_event_ctxp[ctxn]))
+       if (likely(!parent->perf_event_ctxp))
                return 0;
  
        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
-       parent_ctx = perf_pin_task_context(parent, ctxn);
+       parent_ctx = perf_pin_task_context(parent);
        if (!parent_ctx)
                return 0;
  
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, clone_flags,
-                                        &inherited_all);
+                                        child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }
  
        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, clone_flags,
-                                        &inherited_all);
+                                        child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;
  
-       child_ctx = child->perf_event_ctxp[ctxn];
+       child_ctx = child->perf_event_ctxp;
  
        if (child_ctx && inherited_all) {
                /*
@@@ -13422,18 -13449,16 +13495,16 @@@ out_unlock
   */
  int perf_event_init_task(struct task_struct *child, u64 clone_flags)
  {
-       int ctxn, ret;
+       int ret;
  
-       memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+       child->perf_event_ctxp = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);
  
-       for_each_task_context_nr(ctxn) {
-               ret = perf_event_init_context(child, ctxn, clone_flags);
-               if (ret) {
-                       perf_event_free_task(child);
-                       return ret;
-               }
+       ret = perf_event_init_context(child, clone_flags);
+       if (ret) {
+               perf_event_free_task(child);
+               return ret;
        }
  
        return 0;
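
perf_event_init_task() above is the fork() half of inheritance, now cloning at most the one perf_event_ctxp. The userspace-visible behaviour is unchanged: with attr.inherit set, children created after the event is opened are counted too, and their totals are folded back into the parent event as they exit — this is what perf stat relies on. A sketch; the child workload is arbitrary:

/* gcc -o inherit_demo inherit_demo.c */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        pid_t pid;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.inherit = 1;               /* clone the event into children at fork() */

        fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        pid = fork();
        if (pid == 0) {
                /* The child inherited a clone of the event; burn some CPU. */
                volatile unsigned long spin;

                for (spin = 0; spin < 100 * 1000 * 1000; spin++)
                        ;
                _exit(0);
        }
        waitpid(pid, NULL, 0);

        /* The child's task-clock was added back to the parent event on exit. */
        read(fd, &count, sizeof(count));
        printf("task-clock incl. child: %llu ns\n", (unsigned long long)count);

        close(fd);
        return 0;
}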
  static void __init perf_event_init_all_cpus(void)
  {
        struct swevent_htable *swhash;
+       struct perf_cpu_context *cpuctx;
        int cpu;
  
        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
-               INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
  
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
  
- #ifdef CONFIG_CGROUP_PERF
-               INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
- #endif
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+               cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+               __perf_event_init_context(&cpuctx->ctx);
+               lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+               lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+               cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+               cpuctx->heap = cpuctx->heap_default;
        }
  }
  
@@@ -13479,12 -13509,12 +13555,12 @@@ static void perf_swevent_init_cpu(unsig
  #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
  static void __perf_event_exit_context(void *__info)
  {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx = __info;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *event;
  
        raw_spin_lock(&ctx->lock);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
@@@ -13494,18 -13524,16 +13570,16 @@@ static void perf_event_exit_cpu_context
  {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
  
+       // XXX simplify cpuctx->online
        mutex_lock(&pmus_lock);
-       list_for_each_entry(pmu, &pmus, entry) {
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
+       cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+       ctx = &cpuctx->ctx;
  
-               mutex_lock(&ctx->mutex);
-               smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
-               cpuctx->online = 0;
-               mutex_unlock(&ctx->mutex);
-       }
+       mutex_lock(&ctx->mutex);
+       smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+       cpuctx->online = 0;
+       mutex_unlock(&ctx->mutex);
        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
  }
@@@ -13519,20 -13547,17 +13593,17 @@@ int perf_event_init_cpu(unsigned int cp
  {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
  
        perf_swevent_init_cpu(cpu);
  
        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
-       list_for_each_entry(pmu, &pmus, entry) {
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
+       cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+       ctx = &cpuctx->ctx;
  
-               mutex_lock(&ctx->mutex);
-               cpuctx->online = 1;
-               mutex_unlock(&ctx->mutex);
-       }
+       mutex_lock(&ctx->mutex);
+       cpuctx->online = 1;
+       mutex_unlock(&ctx->mutex);
        mutex_unlock(&pmus_lock);
  
        return 0;
@@@ -13669,9 -13694,12 +13740,12 @@@ static int perf_cgroup_css_online(struc
  static int __perf_cgroup_move(void *info)
  {
        struct task_struct *task = info;
-       rcu_read_lock();
-       perf_cgroup_switch(task);
-       rcu_read_unlock();
+       preempt_disable();
+       if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+               perf_cgroup_switch(task);
+       preempt_enable();
        return 0;
  }
  