Merge branch 'smp-hotplug-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <[email protected]>
Tue, 4 Jul 2017 01:08:06 +0000 (18:08 -0700)
committer Linus Torvalds <[email protected]>
Tue, 4 Jul 2017 01:08:06 +0000 (18:08 -0700)
Pull SMP hotplug updates from Thomas Gleixner:
 "This update is primarily a cleanup of the CPU hotplug locking code.

  The hotplug locking mechanism is an open coded RWSEM, which allows
  recursive locking. The main problem with that is the recursive nature
  as it evades the full lockdep coverage and hides potential deadlocks.

  The rework replaces the open coded RWSEM with a percpu RWSEM and
  establishes full lockdep coverage that way.

  The bulk of the changes fix up recursive locking issues and address
  the now fully reported potential deadlocks all over the place. Some of
  these deadlocks have been observed in the RT tree, but on mainline the
  probability was low enough to hide them away."
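
The conversion pattern that repeats throughout the diff below boils down to swapping the old
self-locking reader API for the percpu-rwsem based one. A minimal sketch, assuming a hypothetical
driver helper (cpus_read_lock()/cpus_read_unlock() and for_each_online_cpu() are the real kernel
APIs; the function name is invented):

  /*
   * Illustrative only: protect a walk over the online CPUs against
   * CPU hotplug. Old code used get_online_cpus()/put_online_cpus();
   * this series maps the same read side onto a percpu rwsem via
   * cpus_read_lock()/cpus_read_unlock().
   */
  #include <linux/cpu.h>
  #include <linux/cpumask.h>

  static void example_walk_online_cpus(void)    /* hypothetical */
  {
          int cpu;

          cpus_read_lock();                     /* was: get_online_cpus() */
          for_each_online_cpu(cpu) {
                  /* per-CPU work that must not race with CPU hotplug */
          }
          cpus_read_unlock();                   /* was: put_online_cpus() */
  }

Because the read side is now a real percpu rwsem, every acquisition is visible to lockdep, which
is what exposes the recursive uses fixed by the individual patches listed below.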

* 'smp-hotplug-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
  cpu/hotplug: Constify attribute_group structures
  powerpc: Only obtain cpu_hotplug_lock if called by rtasd
  ARM/hw_breakpoint: Fix possible recursive locking for arch_hw_breakpoint_init
  cpu/hotplug: Remove unused check_for_tasks() function
  perf/core: Don't release cred_guard_mutex if not taken
  cpuhotplug: Link lock stacks for hotplug callbacks
  acpi/processor: Prevent cpu hotplug deadlock
  sched: Provide is_percpu_thread() helper
  cpu/hotplug: Convert hotplug locking to percpu rwsem
  s390: Prevent hotplug rwsem recursion
  arm: Prevent hotplug rwsem recursion
  arm64: Prevent cpu hotplug rwsem recursion
  kprobes: Cure hotplug lock ordering issues
  jump_label: Reorder hotplug lock and jump_label_lock
  perf/tracing/cpuhotplug: Fix locking order
  ACPI/processor: Use cpu_hotplug_disable() instead of get_online_cpus()
  PCI: Replace the racy recursion prevention
  PCI: Use cpu_hotplug_disable() instead of get_online_cpus()
  perf/x86/intel: Drop get_online_cpus() in intel_snb_check_microcode()
  x86/perf: Drop EXPORT of perf_check_microcode
  ...

12 files changed:

arch/powerpc/include/asm/topology.h
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/platforms/powernv/subcore.c
arch/x86/events/core.c
arch/x86/events/intel/core.c
drivers/cpufreq/cpufreq.c
include/linux/cpuhotplug.h
include/linux/pci.h
include/linux/sched.h
kernel/cpu.c
kernel/events/core.c
kernel/kprobes.c

index 329771559cbbb16048d67d27450865703a248c90,a2d36b7703ae2e0c8cf04c3798f08649452050a5..dc4e15937ccf847f4fa040d4b77cf0ae07a7ba8d
@@@ -43,23 -43,10 +43,24 @@@ extern void __init dump_numa_cpu_topolo
  
  extern int sysfs_add_device_to_node(struct device *dev, int nid);
  extern void sysfs_remove_device_from_node(struct device *dev, int nid);
+ extern int numa_update_cpu_topology(bool cpus_locked);
  
 +static inline int early_cpu_to_node(int cpu)
 +{
 +      int nid;
 +
 +      nid = numa_cpu_lookup_table[cpu];
 +
 +      /*
 +       * Fall back to node 0 if nid is unset (it should be, except bugs).
 +       * This allows callers to safely do NODE_DATA(early_cpu_to_node(cpu)).
 +       */
 +      return (nid < 0) ? 0 : nid;
 +}
  #else
  
 +static inline int early_cpu_to_node(int cpu) { return 0; }
 +
  static inline void dump_numa_cpu_topology(void) {}
  
  static inline int sysfs_add_device_to_node(struct device *dev, int nid)
@@@ -71,6 -58,11 +72,11 @@@ static inline void sysfs_remove_device_
                                                int nid)
  {
  }
+ static inline int numa_update_cpu_topology(bool cpus_locked)
+ {
+       return 0;
+ }
  #endif /* CONFIG_NUMA */
  
  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
index 8d1a365b8edc45fa9f655b77789a0d8602b462fd,48a6bd160011e0f232fa7a9021aacf5a4d31c008..773b35d16a0b61ddc3b13f02fd8a7eaca9d4976b
@@@ -1486,14 -1486,6 +1486,14 @@@ static int kvmppc_set_one_reg_hv(struc
                r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
                break;
        case KVM_REG_PPC_TB_OFFSET:
 +              /*
 +               * POWER9 DD1 has an erratum where writing TBU40 causes
 +               * the timebase to lose ticks.  So we don't let the
 +               * timebase offset be changed on P9 DD1.  (It is
 +               * initialized to zero.)
 +               */
 +              if (cpu_has_feature(CPU_FTR_POWER9_DD1))
 +                      break;
                /* round up to multiple of 2^24 */
                vcpu->arch.vcore->tb_offset =
                        ALIGN(set_reg_val(id, *val), 1UL << 24);
@@@ -2915,36 -2907,12 +2915,36 @@@ static int kvmppc_vcpu_run_hv(struct kv
  {
        int r;
        int srcu_idx;
 +      unsigned long ebb_regs[3] = {}; /* shut up GCC */
 +      unsigned long user_tar = 0;
 +      unsigned int user_vrsave;
  
        if (!vcpu->arch.sane) {
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                return -EINVAL;
        }
  
 +      /*
 +       * Don't allow entry with a suspended transaction, because
 +       * the guest entry/exit code will lose it.
 +       * If the guest has TM enabled, save away their TM-related SPRs
 +       * (they will get restored by the TM unavailable interrupt).
 +       */
 +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 +      if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
 +          (current->thread.regs->msr & MSR_TM)) {
 +              if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
 +                      run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 +                      run->fail_entry.hardware_entry_failure_reason = 0;
 +                      return -EINVAL;
 +              }
 +              current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
 +              current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
 +              current->thread.tm_texasr = mfspr(SPRN_TEXASR);
 +              current->thread.regs->msr &= ~MSR_TM;
 +      }
 +#endif
 +
        kvmppc_core_prepare_to_enter(vcpu);
  
        /* No need to go into the guest when all we'll do is come back out */
  
        flush_all_to_thread(current);
  
 +      /* Save userspace EBB and other register values */
 +      if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
 +              ebb_regs[0] = mfspr(SPRN_EBBHR);
 +              ebb_regs[1] = mfspr(SPRN_EBBRR);
 +              ebb_regs[2] = mfspr(SPRN_BESCR);
 +              user_tar = mfspr(SPRN_TAR);
 +      }
 +      user_vrsave = mfspr(SPRN_VRSAVE);
 +
        vcpu->arch.wqp = &vcpu->arch.vcore->wq;
        vcpu->arch.pgdir = current->mm->pgd;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
                }
        } while (is_kvmppc_resume_guest(r));
  
 +      /* Restore userspace EBB and other register values */
 +      if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
 +              mtspr(SPRN_EBBHR, ebb_regs[0]);
 +              mtspr(SPRN_EBBRR, ebb_regs[1]);
 +              mtspr(SPRN_BESCR, ebb_regs[2]);
 +              mtspr(SPRN_TAR, user_tar);
 +              mtspr(SPRN_FSCR, current->thread.fscr);
 +      }
 +      mtspr(SPRN_VRSAVE, user_vrsave);
 +
   out:
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
        atomic_dec(&vcpu->kvm->arch.vcpus_running);
@@@ -3368,7 -3317,7 +3368,7 @@@ void kvmppc_alloc_host_rm_ops(void
                return;
        }
  
-       get_online_cpus();
+       cpus_read_lock();
  
        for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
                if (!cpu_online(cpu))
        l_ops = (unsigned long) ops;
  
        if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
-               put_online_cpus();
+               cpus_read_unlock();
                kfree(ops->rm_core);
                kfree(ops);
                return;
        }
  
-       cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE,
-                                 "ppc/kvm_book3s:prepare",
-                                 kvmppc_set_host_core,
-                                 kvmppc_clear_host_core);
-       put_online_cpus();
+       cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
+                                            "ppc/kvm_book3s:prepare",
+                                            kvmppc_set_host_core,
+                                            kvmppc_clear_host_core);
+       cpus_read_unlock();
  }
  
  void kvmppc_free_host_rm_ops(void)
index 8c6119280c1306afd399d2c86ce88381882ab5df,e6230f104dd9cd545a40c2b4b92ca4a71c5cc9ba..309876d699e9474c660579eae166f01716580cae
@@@ -348,7 -348,7 +348,7 @@@ static int set_subcores_per_core(int ne
                state->master = 0;
        }
  
-       get_online_cpus();
+       cpus_read_lock();
  
        /* This cpu will update the globals before exiting stop machine */
        this_cpu_ptr(&split_state)->master = 1;
        /* Ensure state is consistent before we call the other cpus */
        mb();
  
-       stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask);
+       stop_machine_cpuslocked(cpu_update_split_mode, &new_mode,
+                               cpu_online_mask);
  
-       put_online_cpus();
+       cpus_read_unlock();
  
        return 0;
  }
@@@ -407,13 -408,7 +408,13 @@@ static DEVICE_ATTR(subcores_per_core, 0
  
  static int subcore_init(void)
  {
 -      if (!cpu_has_feature(CPU_FTR_SUBCORE))
 +      unsigned pvr_ver;
 +
 +      pvr_ver = PVR_VER(mfspr(SPRN_PVR));
 +
 +      if (pvr_ver != PVR_POWER8 &&
 +          pvr_ver != PVR_POWER8E &&
 +          pvr_ver != PVR_POWER8NVL)
                return 0;
  
        /*
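
The subcore.c hunk above switches to stop_machine_cpuslocked() because the function already holds
the hotplug read lock. Roughly, the pattern looks like this sketch (the callback and wrapper names
are invented; stop_machine_cpuslocked() and cpu_online_mask are the real interfaces used above):

  /*
   * Illustrative only: run a stop_machine callback while the caller
   * already holds the CPU hotplug read lock, so stop_machine() does
   * not have to acquire it again internally.
   */
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/stop_machine.h>

  static int example_stop_fn(void *data)        /* hypothetical callback */
  {
          /* runs while all other online CPUs spin in the stopper */
          return 0;
  }

  static int example_update(void *arg)          /* hypothetical wrapper */
  {
          int ret;

          cpus_read_lock();
          ret = stop_machine_cpuslocked(example_stop_fn, arg, cpu_online_mask);
          cpus_read_unlock();
          return ret;
  }
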
diff --combined arch/x86/events/core.c
index 2de0dd73830adb5983a15ad3e55c275f53026963,ac650d57ebf7e9ce14abaca85154ca56b0f3cc7b..ff1ea2fb97055e4d6a3282a18c16a0eea689c29a
@@@ -1750,8 -1750,6 +1750,8 @@@ ssize_t x86_event_sysfs_show(char *page
        return ret;
  }
  
 +static struct attribute_group x86_pmu_attr_group;
 +
  static int __init init_hw_perf_events(void)
  {
        struct x86_pmu_quirk *quirk;
                        x86_pmu_events_group.attrs = tmp;
        }
  
 +      if (x86_pmu.attrs) {
 +              struct attribute **tmp;
 +
 +              tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
 +              if (!WARN_ON(!tmp))
 +                      x86_pmu_attr_group.attrs = tmp;
 +      }
 +
        pr_info("... version:                %d\n",     x86_pmu.version);
        pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
        pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
@@@ -2111,7 -2101,8 +2111,7 @@@ static int x86_pmu_event_init(struct pe
  
  static void refresh_pce(void *ignored)
  {
 -      if (current->active_mm)
 -              load_mm_cr4(current->active_mm);
 +      load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
  }
  
  static void x86_pmu_event_mapped(struct perf_event *event)
@@@ -2233,7 -2224,6 +2233,6 @@@ void perf_check_microcode(void
        if (x86_pmu.check_microcode)
                x86_pmu.check_microcode();
  }
- EXPORT_SYMBOL_GPL(perf_check_microcode);
  
  static struct pmu pmu = {
        .pmu_enable             = x86_pmu_enable,
  void arch_perf_update_userpage(struct perf_event *event,
                               struct perf_event_mmap_page *userpg, u64 now)
  {
 -      struct cyc2ns_data *data;
 +      struct cyc2ns_data data;
        u64 offset;
  
        userpg->cap_user_time = 0;
        if (!using_native_sched_clock() || !sched_clock_stable())
                return;
  
 -      data = cyc2ns_read_begin();
 +      cyc2ns_read_begin(&data);
  
 -      offset = data->cyc2ns_offset + __sched_clock_offset;
 +      offset = data.cyc2ns_offset + __sched_clock_offset;
  
        /*
         * Internal timekeeping for enabled/running/stopped times
         * is always in the local_clock domain.
         */
        userpg->cap_user_time = 1;
 -      userpg->time_mult = data->cyc2ns_mul;
 -      userpg->time_shift = data->cyc2ns_shift;
 +      userpg->time_mult = data.cyc2ns_mul;
 +      userpg->time_shift = data.cyc2ns_shift;
        userpg->time_offset = offset - now;
  
        /*
                userpg->time_zero = offset;
        }
  
 -      cyc2ns_read_end(data);
 +      cyc2ns_read_end();
  }
  
  void
@@@ -2343,7 -2333,7 +2342,7 @@@ static unsigned long get_segment_base(u
  
                /* IRQs are off, so this synchronizes with smp_store_release */
                ldt = lockless_dereference(current->active_mm->context.ldt);
 -              if (!ldt || idx > ldt->size)
 +              if (!ldt || idx > ldt->nr_entries)
                        return 0;
  
                desc = &ldt->entries[idx];
index 31acf2a9839437979bc1e849a10dbdecbe270d13,b9174aacf42feefdc9f85b4e75f281576d2684ab..aa62437d1aa142a3960804d44ad48e6f08e44eb8
@@@ -431,11 -431,11 +431,11 @@@ static __initconst const u64 skl_hw_cac
   [ C(DTLB) ] = {
        [ C(OP_READ) ] = {
                [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
 -              [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
 +              [ C(RESULT_MISS)   ] = 0xe08,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
        },
        [ C(OP_WRITE) ] = {
                [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
 -              [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
 +              [ C(RESULT_MISS)   ] = 0xe49,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
        },
        [ C(OP_PREFETCH) ] = {
                [ C(RESULT_ACCESS) ] = 0x0,
        return -ENOMEM;
  }
  
 +static void flip_smm_bit(void *data)
 +{
 +      unsigned long set = *(unsigned long *)data;
 +
 +      if (set > 0) {
 +              msr_set_bit(MSR_IA32_DEBUGCTLMSR,
 +                          DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
 +      } else {
 +              msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
 +                            DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
 +      }
 +}
 +
  static void intel_pmu_cpu_starting(int cpu)
  {
        struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
  
        cpuc->lbr_sel = NULL;
  
 +      flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
 +
        if (!cpuc->shared_regs)
                return;
  
@@@ -3425,12 -3410,10 +3425,10 @@@ static void intel_snb_check_microcode(v
        int pebs_broken = 0;
        int cpu;
  
-       get_online_cpus();
        for_each_online_cpu(cpu) {
                if ((pebs_broken = intel_snb_pebs_broken(cpu)))
                        break;
        }
-       put_online_cpus();
  
        if (pebs_broken == x86_pmu.pebs_broken)
                return;
@@@ -3503,7 -3486,9 +3501,9 @@@ static bool check_msr(unsigned long msr
  static __init void intel_sandybridge_quirk(void)
  {
        x86_pmu.check_microcode = intel_snb_check_microcode;
+       cpus_read_lock();
        intel_snb_check_microcode();
+       cpus_read_unlock();
  }
  
  static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@@ -3610,52 -3595,6 +3610,52 @@@ static struct attribute *hsw_events_att
        NULL
  };
  
 +static ssize_t freeze_on_smi_show(struct device *cdev,
 +                                struct device_attribute *attr,
 +                                char *buf)
 +{
 +      return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
 +}
 +
 +static DEFINE_MUTEX(freeze_on_smi_mutex);
 +
 +static ssize_t freeze_on_smi_store(struct device *cdev,
 +                                 struct device_attribute *attr,
 +                                 const char *buf, size_t count)
 +{
 +      unsigned long val;
 +      ssize_t ret;
 +
 +      ret = kstrtoul(buf, 0, &val);
 +      if (ret)
 +              return ret;
 +
 +      if (val > 1)
 +              return -EINVAL;
 +
 +      mutex_lock(&freeze_on_smi_mutex);
 +
 +      if (x86_pmu.attr_freeze_on_smi == val)
 +              goto done;
 +
 +      x86_pmu.attr_freeze_on_smi = val;
 +
 +      get_online_cpus();
 +      on_each_cpu(flip_smm_bit, &val, 1);
 +      put_online_cpus();
 +done:
 +      mutex_unlock(&freeze_on_smi_mutex);
 +
 +      return count;
 +}
 +
 +static DEVICE_ATTR_RW(freeze_on_smi);
 +
 +static struct attribute *intel_pmu_attrs[] = {
 +      &dev_attr_freeze_on_smi.attr,
 +      NULL,
 +};
 +
  __init int intel_pmu_init(void)
  {
        union cpuid10_edx edx;
  
        x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
  
 +
 +      x86_pmu.attrs                   = intel_pmu_attrs;
        /*
         * Quirk: v2 perfmon does not report fixed-purpose events, so
         * assume at least 3 events, when not running in a hypervisor:
@@@ -4175,13 -4112,12 +4175,12 @@@ static __init int fixup_ht_bug(void
  
        lockup_detector_resume();
  
-       get_online_cpus();
+       cpus_read_lock();
  
-       for_each_online_cpu(c) {
+       for_each_online_cpu(c)
                free_excl_cntrs(c);
-       }
  
-       put_online_cpus();
+       cpus_read_unlock();
        pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
        return 0;
  }
index 26b643d57847de0fca4afc099f4bec49d6326be9,6001369f9aeb4d2b9e1b90401bb399a75d9cb406..29c5b0cbad967ecc3ef0cd8d6c34bc6d88131c2c
@@@ -887,7 -887,7 +887,7 @@@ static ssize_t store(struct kobject *ko
        struct freq_attr *fattr = to_attr(attr);
        ssize_t ret = -EINVAL;
  
-       get_online_cpus();
+       cpus_read_lock();
  
        if (cpu_online(policy->cpu)) {
                down_write(&policy->rwsem);
                up_write(&policy->rwsem);
        }
  
-       put_online_cpus();
+       cpus_read_unlock();
  
        return ret;
  }
@@@ -2441,7 -2441,7 +2441,7 @@@ int cpufreq_register_driver(struct cpuf
        pr_debug("trying to register driver %s\n", driver_data->name);
  
        /* Protect against concurrent CPU online/offline. */
-       get_online_cpus();
+       cpus_read_lock();
  
        write_lock_irqsave(&cpufreq_driver_lock, flags);
        if (cpufreq_driver) {
        if (!(cpufreq_driver->flags & CPUFREQ_STICKY) &&
            list_empty(&cpufreq_policy_list)) {
                /* if all ->init() calls failed, unregister */
 +              ret = -ENODEV;
                pr_debug("%s: No CPU initialized for driver %s\n", __func__,
                         driver_data->name);
                goto err_if_unreg;
        }
  
-       ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online",
-                                       cpuhp_cpufreq_online,
-                                       cpuhp_cpufreq_offline);
+       ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
+                                                  "cpufreq:online",
+                                                  cpuhp_cpufreq_online,
+                                                  cpuhp_cpufreq_offline);
        if (ret < 0)
                goto err_if_unreg;
        hp_online = ret;
@@@ -2494,7 -2494,7 +2495,7 @@@ err_null_driver
        cpufreq_driver = NULL;
        write_unlock_irqrestore(&cpufreq_driver_lock, flags);
  out:
-       put_online_cpus();
+       cpus_read_unlock();
        return ret;
  }
  EXPORT_SYMBOL_GPL(cpufreq_register_driver);
@@@ -2517,17 -2517,17 +2518,17 @@@ int cpufreq_unregister_driver(struct cp
        pr_debug("unregistering driver %s\n", driver->name);
  
        /* Protect against concurrent cpu hotplug */
-       get_online_cpus();
+       cpus_read_lock();
        subsys_interface_unregister(&cpufreq_interface);
        remove_boost_sysfs_file();
-       cpuhp_remove_state_nocalls(hp_online);
+       cpuhp_remove_state_nocalls_cpuslocked(hp_online);
  
        write_lock_irqsave(&cpufreq_driver_lock, flags);
  
        cpufreq_driver = NULL;
  
        write_unlock_irqrestore(&cpufreq_driver_lock, flags);
-       put_online_cpus();
+       cpus_read_unlock();
  
        return 0;
  }
index 7f815d91597765ff207d35129b356e0fe0716d31,df3d2719a796c1063de545970e26a85c251cf211..b56573bf440db4b85f8f11f678edb9e28d4e6cd8
@@@ -58,6 -58,7 +58,6 @@@ enum cpuhp_state 
        CPUHP_XEN_EVTCHN_PREPARE,
        CPUHP_ARM_SHMOBILE_SCU_PREPARE,
        CPUHP_SH_SH3X_PREPARE,
 -      CPUHP_BLK_MQ_PREPARE,
        CPUHP_NET_FLOW_PREPARE,
        CPUHP_TOPOLOGY_PREPARE,
        CPUHP_NET_IUCV_PREPARE,
        CPUHP_AP_ONLINE_IDLE,
        CPUHP_AP_SMPBOOT_THREADS,
        CPUHP_AP_X86_VDSO_VMA_ONLINE,
 +      CPUHP_AP_IRQ_AFFINITY_ONLINE,
        CPUHP_AP_PERF_ONLINE,
        CPUHP_AP_PERF_X86_ONLINE,
        CPUHP_AP_PERF_X86_UNCORE_ONLINE,
@@@ -153,6 -153,11 +153,11 @@@ int __cpuhp_setup_state(enum cpuhp_stat
                        int (*startup)(unsigned int cpu),
                        int (*teardown)(unsigned int cpu), bool multi_instance);
  
+ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name,
+                                  bool invoke,
+                                  int (*startup)(unsigned int cpu),
+                                  int (*teardown)(unsigned int cpu),
+                                  bool multi_instance);
  /**
   * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
   * @state:    The state for which the calls are installed
@@@ -171,6 -176,15 +176,15 @@@ static inline int cpuhp_setup_state(enu
        return __cpuhp_setup_state(state, name, true, startup, teardown, false);
  }
  
+ static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+                                              const char *name,
+                                              int (*startup)(unsigned int cpu),
+                                              int (*teardown)(unsigned int cpu))
+ {
+       return __cpuhp_setup_state_cpuslocked(state, name, true, startup,
+                                             teardown, false);
+ }
  /**
   * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
   *                           callbacks
@@@ -191,6 -205,15 +205,15 @@@ static inline int cpuhp_setup_state_noc
                                   false);
  }
  
+ static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
+                                                    const char *name,
+                                                    int (*startup)(unsigned int cpu),
+                                                    int (*teardown)(unsigned int cpu))
+ {
+       return __cpuhp_setup_state_cpuslocked(state, name, false, startup,
+                                           teardown, false);
+ }
  /**
   * cpuhp_setup_state_multi - Add callbacks for multi state
   * @state:    The state for which the calls are installed
@@@ -217,6 -240,8 +240,8 @@@ static inline int cpuhp_setup_state_mul
  
  int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
                               bool invoke);
+ int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+                                         struct hlist_node *node, bool invoke);
  
  /**
   * cpuhp_state_add_instance - Add an instance for a state and invoke startup
@@@ -249,7 -274,15 +274,15 @@@ static inline int cpuhp_state_add_insta
        return __cpuhp_state_add_instance(state, node, false);
  }
  
+ static inline int
+ cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state,
+                                           struct hlist_node *node)
+ {
+       return __cpuhp_state_add_instance_cpuslocked(state, node, false);
+ }
  void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
+ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);
  
  /**
   * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
@@@ -273,6 -306,11 +306,11 @@@ static inline void cpuhp_remove_state_n
        __cpuhp_remove_state(state, false);
  }
  
+ static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state)
+ {
+       __cpuhp_remove_state_cpuslocked(state, false);
+ }
  /**
   * cpuhp_remove_multi_state - Remove hotplug multi state callback
   * @state:    The state for which the calls are removed
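
The cpuhotplug.h changes above add *_cpuslocked variants for callers that already hold the hotplug
lock. A rough usage sketch, mirroring the KVM and cpufreq conversions elsewhere in this merge (the
state name and callbacks are invented; the API and CPUHP_AP_ONLINE_DYN are real):

  /*
   * Illustrative only: install hotplug callbacks from a code path that
   * already holds cpus_read_lock(), using the new _cpuslocked variant
   * instead of the self-locking cpuhp_setup_state_nocalls().
   */
  #include <linux/cpu.h>
  #include <linux/cpuhotplug.h>

  static int example_online(unsigned int cpu)  { return 0; }   /* hypothetical */
  static int example_offline(unsigned int cpu) { return 0; }   /* hypothetical */

  static int example_register(void)
  {
          int ret;

          cpus_read_lock();
          /* ... other setup that needs a stable set of online CPUs ... */
          ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
                                                     "example:online",
                                                     example_online,
                                                     example_offline);
          cpus_read_unlock();
          return ret < 0 ? ret : 0;
  }

For CPUHP_AP_ONLINE_DYN the call returns the dynamically allocated state number on success, hence
the ret < 0 check before folding it to zero.
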
diff --combined include/linux/pci.h
index 8039f9f0ca054ba20fd9992b13b7926859d029b5,5026f2ae86db2431df02a0a1e6e2519e317b0246..58f1ab06c4e8f645a757dca55a67bcb19ec36b62
@@@ -183,11 -183,6 +183,11 @@@ enum pci_dev_flags 
        PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9),
        /* Do not use FLR even if device advertises PCI_AF_CAP */
        PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10),
 +      /*
 +       * Resume before calling the driver's system suspend hooks, disabling
 +       * the direct_complete optimization.
 +       */
 +      PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11),
  };
  
  enum pci_irq_reroute_variant {
@@@ -376,6 -371,7 +376,7 @@@ struct pci_dev 
        unsigned int    irq_managed:1;
        unsigned int    has_secondary_link:1;
        unsigned int    non_compliant_bars:1;   /* broken BARs; ignore them */
+       unsigned int    is_probed:1;            /* device probing in progress */
        pci_dev_flags_t dev_flags;
        atomic_t        enable_cnt;     /* pci_enable_device has been called */
  
@@@ -1347,9 -1343,9 +1348,9 @@@ pci_alloc_irq_vectors_affinity(struct p
                               unsigned int max_vecs, unsigned int flags,
                               const struct irq_affinity *aff_desc)
  {
 -      if (min_vecs > 1)
 -              return -EINVAL;
 -      return 1;
 +      if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq)
 +              return 1;
 +      return -ENOSPC;
  }
  
  static inline void pci_free_irq_vectors(struct pci_dev *dev)
diff --combined include/linux/sched.h
index 1f0f427e0292f4ad1a7549bfc9d396353c7f97dd,3dfa5f99d6ee576630deb7d9644ef97e3f7458dd..9c4ca7433d9d6d89aa6bad857b5299ae1bbe5577
@@@ -421,8 -421,7 +421,8 @@@ struct sched_dl_entity 
        u64                             dl_runtime;     /* Maximum runtime for each instance    */
        u64                             dl_deadline;    /* Relative deadline of each instance   */
        u64                             dl_period;      /* Separation of two instances (period) */
 -      u64                             dl_bw;          /* dl_runtime / dl_deadline             */
 +      u64                             dl_bw;          /* dl_runtime / dl_period               */
 +      u64                             dl_density;     /* dl_runtime / dl_deadline             */
  
        /*
         * Actual scheduling parameters. Initialized with the values above,
         *
         * @dl_yielded tells if task gave up the CPU before consuming
         * all its available runtime during the last job.
 +       *
 +       * @dl_non_contending tells if the task is inactive while still
 +       * contributing to the active utilization. In other words, it
 +       * indicates if the inactive timer has been armed and its handler
 +       * has not been executed yet. This flag is useful to avoid race
 +       * conditions between the inactive timer handler and the wakeup
 +       * code.
         */
        int                             dl_throttled;
        int                             dl_boosted;
        int                             dl_yielded;
 +      int                             dl_non_contending;
  
        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                  dl_timer;
 +
 +      /*
 +       * Inactive timer, responsible for decreasing the active utilization
 +       * at the "0-lag time". When a -deadline task blocks, it contributes
 +       * to GRUB's active utilization until the "0-lag time", hence a
 +       * timer is needed to decrease the active utilization at the correct
 +       * time.
 +       */
 +      struct hrtimer inactive_timer;
  };
  
  union rcu_special {
@@@ -1114,6 -1096,8 +1114,6 @@@ static inline struct pid *task_session(
   *                     current.
   * task_xid_nr_ns()  : id seen from the ns specified;
   *
 - * set_task_vxid()   : assigns a virtual id to a task;
 - *
   * see also pid_nr() etc in include/linux/pid.h
   */
  pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
@@@ -1281,6 -1265,16 +1281,16 @@@ extern struct pid *cad_pid
  #define tsk_used_math(p)                      ((p)->flags & PF_USED_MATH)
  #define used_math()                           tsk_used_math(current)
  
+ static inline bool is_percpu_thread(void)
+ {
+ #ifdef CONFIG_SMP
+       return (current->flags & PF_NO_SETAFFINITY) &&
+               (current->nr_cpus_allowed  == 1);
+ #else
+       return true;
+ #endif
+ }
  /* Per-process atomic flags. */
  #define PFA_NO_NEW_PRIVS              0       /* May not gain new privileges. */
  #define PFA_SPREAD_PAGE                       1       /* Spread page cache over cpuset */
diff --combined kernel/cpu.c
index b86b32ebb3b2eeeb15a4cec3bbb65912d67809ec,b69c0588f8c9ccbece87abcd4419cf657a2c31e4..b03a32595cfebc34cf60e353b75028870477c4fe
@@@ -27,6 -27,7 +27,7 @@@
  #include <linux/smpboot.h>
  #include <linux/relay.h>
  #include <linux/slab.h>
+ #include <linux/percpu-rwsem.h>
  
  #include <trace/events/power.h>
  #define CREATE_TRACE_POINTS
@@@ -65,6 -66,12 +66,12 @@@ struct cpuhp_cpu_state 
  
  static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
  
+ #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
+ static struct lock_class_key cpuhp_state_key;
+ static struct lockdep_map cpuhp_state_lock_map =
+       STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+ #endif
  /**
   * cpuhp_step - Hotplug state machine step
   * @name:     Name of the step
@@@ -196,121 -203,41 +203,41 @@@ void cpu_maps_update_done(void
        mutex_unlock(&cpu_add_remove_lock);
  }
  
- /* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ /*
+  * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
   * Should always be manipulated under cpu_add_remove_lock
   */
  static int cpu_hotplug_disabled;
  
  #ifdef CONFIG_HOTPLUG_CPU
  
- static struct {
-       struct task_struct *active_writer;
-       /* wait queue to wake up the active_writer */
-       wait_queue_head_t wq;
-       /* verifies that no writer will get active while readers are active */
-       struct mutex lock;
-       /*
-        * Also blocks the new readers during
-        * an ongoing cpu hotplug operation.
-        */
-       atomic_t refcount;
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map dep_map;
- #endif
- } cpu_hotplug = {
-       .active_writer = NULL,
-       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
-       .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
-       .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
- #endif
- };
- /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
- #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
- #define cpuhp_lock_acquire_tryread() \
-                                 lock_map_acquire_tryread(&cpu_hotplug.dep_map)
- #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
- #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
+ DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
  
- void get_online_cpus(void)
+ void cpus_read_lock(void)
  {
-       might_sleep();
-       if (cpu_hotplug.active_writer == current)
-               return;
-       cpuhp_lock_acquire_read();
-       mutex_lock(&cpu_hotplug.lock);
-       atomic_inc(&cpu_hotplug.refcount);
-       mutex_unlock(&cpu_hotplug.lock);
+       percpu_down_read(&cpu_hotplug_lock);
  }
- EXPORT_SYMBOL_GPL(get_online_cpus);
+ EXPORT_SYMBOL_GPL(cpus_read_lock);
  
- void put_online_cpus(void)
+ void cpus_read_unlock(void)
  {
-       int refcount;
-       if (cpu_hotplug.active_writer == current)
-               return;
-       refcount = atomic_dec_return(&cpu_hotplug.refcount);
-       if (WARN_ON(refcount < 0)) /* try to fix things up */
-               atomic_inc(&cpu_hotplug.refcount);
-       if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
-               wake_up(&cpu_hotplug.wq);
-       cpuhp_lock_release();
+       percpu_up_read(&cpu_hotplug_lock);
  }
- EXPORT_SYMBOL_GPL(put_online_cpus);
+ EXPORT_SYMBOL_GPL(cpus_read_unlock);
  
- /*
-  * This ensures that the hotplug operation can begin only when the
-  * refcount goes to zero.
-  *
-  * Note that during a cpu-hotplug operation, the new readers, if any,
-  * will be blocked by the cpu_hotplug.lock
-  *
-  * Since cpu_hotplug_begin() is always called after invoking
-  * cpu_maps_update_begin(), we can be sure that only one writer is active.
-  *
-  * Note that theoretically, there is a possibility of a livelock:
-  * - Refcount goes to zero, last reader wakes up the sleeping
-  *   writer.
-  * - Last reader unlocks the cpu_hotplug.lock.
-  * - A new reader arrives at this moment, bumps up the refcount.
-  * - The writer acquires the cpu_hotplug.lock finds the refcount
-  *   non zero and goes to sleep again.
-  *
-  * However, this is very difficult to achieve in practice since
-  * get_online_cpus() not an api which is called all that often.
-  *
-  */
- void cpu_hotplug_begin(void)
+ void cpus_write_lock(void)
  {
-       DEFINE_WAIT(wait);
-       cpu_hotplug.active_writer = current;
-       cpuhp_lock_acquire();
+       percpu_down_write(&cpu_hotplug_lock);
+ }
  
-       for (;;) {
-               mutex_lock(&cpu_hotplug.lock);
-               prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
-               if (likely(!atomic_read(&cpu_hotplug.refcount)))
-                               break;
-               mutex_unlock(&cpu_hotplug.lock);
-               schedule();
-       }
-       finish_wait(&cpu_hotplug.wq, &wait);
+ void cpus_write_unlock(void)
+ {
+       percpu_up_write(&cpu_hotplug_lock);
  }
  
- void cpu_hotplug_done(void)
+ void lockdep_assert_cpus_held(void)
  {
-       cpu_hotplug.active_writer = NULL;
-       mutex_unlock(&cpu_hotplug.lock);
-       cpuhp_lock_release();
+       percpu_rwsem_assert_held(&cpu_hotplug_lock);
  }
  
  /*
@@@ -344,8 -271,6 +271,6 @@@ void cpu_hotplug_enable(void
  EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
  #endif        /* CONFIG_HOTPLUG_CPU */
  
- /* Notifier wrappers for transitioning to state machine */
  static int bringup_wait_for_ap(unsigned int cpu)
  {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@@ -484,6 -409,7 +409,7 @@@ static void cpuhp_thread_fun(unsigned i
  
        st->should_run = false;
  
+       lock_map_acquire(&cpuhp_state_lock_map);
        /* Single callback invocation for [un]install ? */
        if (st->single) {
                if (st->cb_state < CPUHP_AP_ONLINE) {
                else if (st->state > st->target)
                        ret = cpuhp_ap_offline(cpu, st);
        }
+       lock_map_release(&cpuhp_state_lock_map);
        st->result = ret;
        complete(&st->done);
  }
@@@ -524,6 -451,9 +451,9 @@@ cpuhp_invoke_ap_callback(int cpu, enum 
        if (!cpu_online(cpu))
                return 0;
  
+       lock_map_acquire(&cpuhp_state_lock_map);
+       lock_map_release(&cpuhp_state_lock_map);
        /*
         * If we are up and running, use the hotplug thread. For early calls
         * we invoke the thread function directly.
@@@ -567,6 -497,8 +497,8 @@@ static int cpuhp_kick_ap_work(unsigned 
        enum cpuhp_state state = st->state;
  
        trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
+       lock_map_acquire(&cpuhp_state_lock_map);
+       lock_map_release(&cpuhp_state_lock_map);
        __cpuhp_kick_ap_work(st);
        wait_for_completion(&st->done);
        trace_cpuhp_exit(cpu, st->state, state, st->result);
@@@ -630,30 -562,6 +562,6 @@@ void clear_tasks_mm_cpumask(int cpu
        rcu_read_unlock();
  }
  
- static inline void check_for_tasks(int dead_cpu)
- {
-       struct task_struct *g, *p;
-       read_lock(&tasklist_lock);
-       for_each_process_thread(g, p) {
-               if (!p->on_rq)
-                       continue;
-               /*
-                * We do the check with unlocked task_rq(p)->lock.
-                * Order the reading to do not warn about a task,
-                * which was running on this cpu in the past, and
-                * it's just been woken on another cpu.
-                */
-               rmb();
-               if (task_cpu(p) != dead_cpu)
-                       continue;
-               pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
-                       p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-       }
-       read_unlock(&tasklist_lock);
- }
  /* Take this CPU down. */
  static int take_cpu_down(void *_param)
  {
@@@ -701,7 -609,7 +609,7 @@@ static int takedown_cpu(unsigned int cp
        /*
         * So now all preempt/rcu users must observe !cpu_active().
         */
-       err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
+       err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
        if (err) {
                /* CPU refused to die */
                irq_unlock_sparse();
@@@ -773,7 -681,7 +681,7 @@@ static int __ref _cpu_down(unsigned in
        if (!cpu_present(cpu))
                return -EINVAL;
  
-       cpu_hotplug_begin();
+       cpus_write_lock();
  
        cpuhp_tasks_frozen = tasks_frozen;
  
        }
  
  out:
-       cpu_hotplug_done();
+       cpus_write_unlock();
        return ret;
  }
  
@@@ -893,7 -801,7 +801,7 @@@ static int _cpu_up(unsigned int cpu, in
        struct task_struct *idle;
        int ret = 0;
  
-       cpu_hotplug_begin();
+       cpus_write_lock();
  
        if (!cpu_present(cpu)) {
                ret = -EINVAL;
        target = min((int)target, CPUHP_BRINGUP_CPU);
        ret = cpuhp_up_callbacks(cpu, st, target);
  out:
-       cpu_hotplug_done();
+       cpus_write_unlock();
        return ret;
  }
  
@@@ -1252,11 -1160,6 +1160,11 @@@ static struct cpuhp_step cpuhp_ap_state
                .startup.single         = smpboot_unpark_threads,
                .teardown.single        = NULL,
        },
 +      [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
 +              .name                   = "irq/affinity:online",
 +              .startup.single         = irq_affinity_online_cpu,
 +              .teardown.single        = NULL,
 +      },
        [CPUHP_AP_PERF_ONLINE] = {
                .name                   = "perf:online",
                .startup.single         = perf_event_init_cpu,
@@@ -1418,18 -1321,20 +1326,20 @@@ static void cpuhp_rollback_install(int 
        }
  }
  
- int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
-                              bool invoke)
+ int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+                                         struct hlist_node *node,
+                                         bool invoke)
  {
        struct cpuhp_step *sp;
        int cpu;
        int ret;
  
+       lockdep_assert_cpus_held();
        sp = cpuhp_get_step(state);
        if (sp->multi_instance == false)
                return -EINVAL;
  
-       get_online_cpus();
        mutex_lock(&cpuhp_state_mutex);
  
        if (!invoke || !sp->startup.multi)
@@@ -1458,13 -1363,23 +1368,23 @@@ add_node
        hlist_add_head(node, &sp->list);
  unlock:
        mutex_unlock(&cpuhp_state_mutex);
-       put_online_cpus();
+       return ret;
+ }
+ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+                              bool invoke)
+ {
+       int ret;
+       cpus_read_lock();
+       ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
+       cpus_read_unlock();
        return ret;
  }
  EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
  
  /**
-  * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
+  * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
   * @state:            The state to setup
   * @invoke:           If true, the startup function is invoked for cpus where
   *                    cpu state >= @state
   * @multi_instance:   State is set up for multiple instances which get
   *                    added afterwards.
   *
+  * The caller needs to hold cpus read locked while calling this function.
   * Returns:
   *   On success:
   *      Positive state number if @state is CPUHP_AP_ONLINE_DYN
   *      0 for all other states
   *   On failure: proper (negative) error code
   */
- int __cpuhp_setup_state(enum cpuhp_state state,
-                       const char *name, bool invoke,
-                       int (*startup)(unsigned int cpu),
-                       int (*teardown)(unsigned int cpu),
-                       bool multi_instance)
+ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+                                  const char *name, bool invoke,
+                                  int (*startup)(unsigned int cpu),
+                                  int (*teardown)(unsigned int cpu),
+                                  bool multi_instance)
  {
        int cpu, ret = 0;
        bool dynstate;
  
+       lockdep_assert_cpus_held();
        if (cpuhp_cb_check(state) || !name)
                return -EINVAL;
  
-       get_online_cpus();
        mutex_lock(&cpuhp_state_mutex);
  
        ret = cpuhp_store_callbacks(state, name, startup, teardown,
        }
  out:
        mutex_unlock(&cpuhp_state_mutex);
-       put_online_cpus();
        /*
         * If the requested state is CPUHP_AP_ONLINE_DYN, return the
         * dynamically allocated state in case of success.
                return state;
        return ret;
  }
+ EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
+ int __cpuhp_setup_state(enum cpuhp_state state,
+                       const char *name, bool invoke,
+                       int (*startup)(unsigned int cpu),
+                       int (*teardown)(unsigned int cpu),
+                       bool multi_instance)
+ {
+       int ret;
+       cpus_read_lock();
+       ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
+                                            teardown, multi_instance);
+       cpus_read_unlock();
+       return ret;
+ }
  EXPORT_SYMBOL(__cpuhp_setup_state);
  
  int __cpuhp_state_remove_instance(enum cpuhp_state state,
        if (!sp->multi_instance)
                return -EINVAL;
  
-       get_online_cpus();
+       cpus_read_lock();
        mutex_lock(&cpuhp_state_mutex);
  
        if (!invoke || !cpuhp_get_teardown_cb(state))
  remove:
        hlist_del(node);
        mutex_unlock(&cpuhp_state_mutex);
-       put_online_cpus();
+       cpus_read_unlock();
  
        return 0;
  }
  EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
  
  /**
-  * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
+  * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
   * @state:    The state to remove
   * @invoke:   If true, the teardown function is invoked for cpus where
   *            cpu state >= @state
   *
+  * The caller needs to hold cpus read locked while calling this function.
   * The teardown callback is currently not allowed to fail. Think
   * about module removal!
   */
- void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
  {
        struct cpuhp_step *sp = cpuhp_get_step(state);
        int cpu;
  
        BUG_ON(cpuhp_cb_check(state));
  
-       get_online_cpus();
+       lockdep_assert_cpus_held();
  
        mutex_lock(&cpuhp_state_mutex);
        if (sp->multi_instance) {
  remove:
        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
        mutex_unlock(&cpuhp_state_mutex);
-       put_online_cpus();
+ }
+ EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
+ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+ {
+       cpus_read_lock();
+       __cpuhp_remove_state_cpuslocked(state, invoke);
+       cpus_read_unlock();
  }
  EXPORT_SYMBOL(__cpuhp_remove_state);
  
@@@ -1663,13 -1603,13 +1608,13 @@@ static ssize_t write_cpuhp_target(struc
        ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
        mutex_unlock(&cpuhp_state_mutex);
        if (ret)
 -              return ret;
 +              goto out;
  
        if (st->state < target)
                ret = do_cpu_up(dev->id, target);
        else
                ret = do_cpu_down(dev->id, target);
 -
 +out:
        unlock_device_hotplug();
        return ret ? ret : count;
  }
@@@ -1689,7 -1629,7 +1634,7 @@@ static struct attribute *cpuhp_cpu_attr
        NULL
  };
  
- static struct attribute_group cpuhp_cpu_attr_group = {
+ static const struct attribute_group cpuhp_cpu_attr_group = {
        .attrs = cpuhp_cpu_attrs,
        .name = "hotplug",
        NULL
@@@ -1721,7 -1661,7 +1666,7 @@@ static struct attribute *cpuhp_cpu_root
        NULL
  };
  
- static struct attribute_group cpuhp_cpu_root_attr_group = {
+ static const struct attribute_group cpuhp_cpu_root_attr_group = {
        .attrs = cpuhp_cpu_root_attrs,
        .name = "hotplug",
        NULL
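
The kernel/cpu.c diff above replaces the open-coded lock with a percpu rwsem and adds
lockdep_assert_cpus_held(), so helpers can demand rather than take the lock. Roughly, the resulting
caller/callee contract looks like this sketch (everything except the locking/assert API is
invented); the kprobes changes at the end of this merge follow the same shape:

  /*
   * Illustrative only: the caller takes the hotplug lock once; the
   * helper merely asserts it. This removes the need for recursive
   * locking inside the helper.
   */
  #include <linux/cpu.h>

  static void example_patch_text(void)          /* hypothetical helper */
  {
          lockdep_assert_cpus_held();   /* caller must hold cpus_read_lock() */
          /* ... e.g. text_mutex work, stop_machine_cpuslocked(), ... */
  }

  static void example_caller(void)              /* hypothetical caller */
  {
          cpus_read_lock();
          example_patch_text();
          cpus_read_unlock();
  }
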
diff --combined kernel/events/core.c
index bc63f8db1b0d218f09ae489fbfbeb042e82f5cf6,1f1b8cdaca2d437030c92c75f0df4983dbafcdac..4d2c32f9848245cb0a280afa08066434374b1820
@@@ -389,6 -389,7 +389,7 @@@ static atomic_t nr_switch_events __read
  static LIST_HEAD(pmus);
  static DEFINE_MUTEX(pmus_lock);
  static struct srcu_struct pmus_srcu;
+ static cpumask_var_t perf_online_mask;
  
  /*
   * perf event paranoia level:
@@@ -925,6 -926,11 +926,6 @@@ static inline int is_cgroup_event(struc
        return 0;
  }
  
 -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
 -{
 -      return 0;
 -}
 -
  static inline void update_cgrp_time_from_event(struct perf_event *event)
  {
  }
@@@ -3807,14 -3813,6 +3808,6 @@@ find_get_context(struct pmu *pmu, struc
                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);
  
-               /*
-                * We could be clever and allow to attach a event to an
-                * offline CPU and activate it when the CPU comes up, but
-                * that's for later.
-                */
-               if (!cpu_online(cpu))
-                       return ERR_PTR(-ENODEV);
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
@@@ -5724,6 -5722,9 +5717,6 @@@ static void perf_output_read_one(struc
        __output_copy(handle, values, n * sizeof(u64));
  }
  
 -/*
 - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
 - */
  static void perf_output_read_group(struct perf_output_handle *handle,
                            struct perf_event *event,
                            u64 enabled, u64 running)
  #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
                                 PERF_FORMAT_TOTAL_TIME_RUNNING)
  
 +/*
 + * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
 + *
 + * The problem is that its both hard and excessively expensive to iterate the
 + * child list, not to mention that its impossible to IPI the children running
 + * on another CPU, from interrupt/NMI context.
 + */
  static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
  {
@@@ -7315,21 -7309,6 +7308,21 @@@ int perf_event_account_interrupt(struc
        return __perf_event_account_interrupt(event, 1);
  }
  
 +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
 +{
 +      /*
 +       * Due to interrupt latency (AKA "skid"), we may enter the
 +       * kernel before taking an overflow, even if the PMU is only
 +       * counting user events.
 +       * To avoid leaking information to userspace, we must always
 +       * reject kernel samples when exclude_kernel is set.
 +       */
 +      if (event->attr.exclude_kernel && !user_mode(regs))
 +              return false;
 +
 +      return true;
 +}
 +
  /*
   * Generic event overflow handling, sampling.
   */
@@@ -7350,12 -7329,6 +7343,12 @@@ static int __perf_event_overflow(struc
  
        ret = __perf_event_account_interrupt(event, throttle);
  
 +      /*
 +       * For security, drop the skid kernel samples if necessary.
 +       */
 +      if (!sample_is_allowed(event, regs))
 +              return ret;
 +
        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
@@@ -7723,7 -7696,8 +7716,8 @@@ static int swevent_hlist_get_cpu(int cp
        int err = 0;
  
        mutex_lock(&swhash->hlist_mutex);
-       if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+       if (!swevent_hlist_deref(swhash) &&
+           cpumask_test_cpu(cpu, perf_online_mask)) {
                struct swevent_hlist *hlist;
  
                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@@ -7744,7 -7718,7 +7738,7 @@@ static int swevent_hlist_get(void
  {
        int err, cpu, failed_cpu;
  
-       get_online_cpus();
+       mutex_lock(&pmus_lock);
        for_each_possible_cpu(cpu) {
                err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        goto fail;
                }
        }
-       put_online_cpus();
+       mutex_unlock(&pmus_lock);
        return 0;
  fail:
        for_each_possible_cpu(cpu) {
                        break;
                swevent_hlist_put_cpu(cpu);
        }
-       put_online_cpus();
+       mutex_unlock(&pmus_lock);
        return err;
  }
  
@@@ -8940,7 -8912,7 +8932,7 @@@ perf_event_mux_interval_ms_store(struc
        pmu->hrtimer_interval_ms = timer;
  
        /* update all cpuctx for this PMU */
-       get_online_cpus();
+       cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx;
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                cpu_function_call(cpu,
                        (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
        }
-       put_online_cpus();
+       cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);
  
        return count;
@@@ -9079,6 -9051,7 +9071,7 @@@ skip_type
                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
+               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
  
                __perf_mux_hrtimer_init(cpuctx, cpu);
        }
@@@ -9192,7 -9165,7 +9185,7 @@@ static int perf_try_init_event(struct p
  
  static struct pmu *perf_init_event(struct perf_event *event)
  {
 -      struct pmu *pmu = NULL;
 +      struct pmu *pmu;
        int idx;
        int ret;
  
@@@ -9461,10 -9434,9 +9454,10 @@@ perf_event_alloc(struct perf_event_att
        local64_set(&hwc->period_left, hwc->sample_period);
  
        /*
 -       * we currently do not support PERF_FORMAT_GROUP on inherited events
 +       * We currently do not support PERF_SAMPLE_READ on inherited events.
 +       * See perf_output_read().
         */
 -      if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 +      if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
                goto err_ns;
  
        if (!has_branch_stack(event))
        }
  
        pmu = perf_init_event(event);
 -      if (!pmu)
 -              goto err_ns;
 -      else if (IS_ERR(pmu)) {
 +      if (IS_ERR(pmu)) {
                err = PTR_ERR(pmu);
                goto err_ns;
        }
                event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
                                                   sizeof(unsigned long),
                                                   GFP_KERNEL);
 -              if (!event->addr_filters_offs)
 +              if (!event->addr_filters_offs) {
 +                      err = -ENOMEM;
                        goto err_per_task;
 +              }
  
                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
@@@ -9903,12 -9875,10 +9896,10 @@@ SYSCALL_DEFINE5(perf_event_open
                goto err_task;
        }
  
-       get_online_cpus();
        if (task) {
                err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
                if (err)
-                       goto err_cpus;
+                       goto err_task;
  
                /*
                 * Reuse ptrace permission checks for now.
                goto err_locked;
        }
  
+       if (!task) {
+               /*
+                * Check if the @cpu we're creating an event for is online.
+                *
+                * We use the perf_cpu_context::ctx::mutex to serialize against
+                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+                */
+               struct perf_cpu_context *cpuctx =
+                       container_of(ctx, struct perf_cpu_context, ctx);
+               if (!cpuctx->online) {
+                       err = -ENODEV;
+                       goto err_locked;
+               }
+       }
        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
         * because we need to serialize with concurrent event creation.
                put_task_struct(task);
        }
  
-       put_online_cpus();
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
@@@ -10218,8 -10203,6 +10224,6 @@@ err_alloc
  err_cred:
        if (task)
                mutex_unlock(&task->signal->cred_guard_mutex);
- err_cpus:
-       put_online_cpus();
  err_task:
        if (task)
                put_task_struct(task);
@@@ -10274,6 -10257,21 +10278,21 @@@ perf_event_create_kernel_counter(struc
                goto err_unlock;
        }
  
+       if (!task) {
+               /*
+                * Check if the @cpu we're creating an event for is online.
+                *
+                * We use the perf_cpu_context::ctx::mutex to serialize against
+                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+                */
+               struct perf_cpu_context *cpuctx =
+                       container_of(ctx, struct perf_cpu_context, ctx);
+               if (!cpuctx->online) {
+                       err = -ENODEV;
+                       goto err_unlock;
+               }
+       }
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_unlock;
@@@ -10941,6 -10939,8 +10960,8 @@@ static void __init perf_event_init_all_
        struct swevent_htable *swhash;
        int cpu;
  
+       zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
        }
  }
  
int perf_event_init_cpu(unsigned int cpu)
void perf_swevent_init_cpu(unsigned int cpu)
  {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
                rcu_assign_pointer(swhash->swevent_hlist, hlist);
        }
        mutex_unlock(&swhash->hlist_mutex);
-       return 0;
  }
  
  #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@@ -10987,19 -10986,22 +11007,22 @@@ static void __perf_event_exit_context(v
  
  static void perf_event_exit_cpu_context(int cpu)
  {
+       struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
        struct pmu *pmu;
-       int idx;
  
-       idx = srcu_read_lock(&pmus_srcu);
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+       mutex_lock(&pmus_lock);
+       list_for_each_entry(pmu, &pmus, entry) {
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
  
                mutex_lock(&ctx->mutex);
                smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+               cpuctx->online = 0;
                mutex_unlock(&ctx->mutex);
        }
-       srcu_read_unlock(&pmus_srcu, idx);
+       cpumask_clear_cpu(cpu, perf_online_mask);
+       mutex_unlock(&pmus_lock);
  }
  #else
  
@@@ -11007,6 -11009,29 +11030,29 @@@ static void perf_event_exit_cpu_context
  
  #endif
  
+ int perf_event_init_cpu(unsigned int cpu)
+ {
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       perf_swevent_init_cpu(cpu);
+       mutex_lock(&pmus_lock);
+       cpumask_set_cpu(cpu, perf_online_mask);
+       list_for_each_entry(pmu, &pmus, entry) {
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+               mutex_lock(&ctx->mutex);
+               cpuctx->online = 1;
+               mutex_unlock(&ctx->mutex);
+       }
+       mutex_unlock(&pmus_lock);
+       return 0;
+ }
  int perf_event_exit_cpu(unsigned int cpu)
  {
        perf_event_exit_cpu_context(cpu);
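
For context, perf_event_init_cpu() and perf_event_exit_cpu() are CPU hotplug startup/teardown callbacks. The sketch below shows how such a pair is typically wired into the hotplug state machine; the dynamic state, the state name string, and the init function name are illustrative only and not what the perf core actually registers.

static int __init example_perf_hotplug_init(void)
{
        int ret;

        /*
         * CPUHP_AP_ONLINE_DYN allocates a dynamic state (returned as a
         * positive number); the perf core uses a fixed state instead.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/perf:online",
                                perf_event_init_cpu, perf_event_exit_cpu);
        return ret < 0 ? ret : 0;
}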
diff --combined kernel/kprobes.c
index adfe3b4cfe05a101bcee0da8e72db6fe6a10cf72,9f6056749a28bc7f8ae4a8c803e95f2c26edca9e..6756d750b31b5c8d044d793fe68f43f19f5bb3d1
@@@ -122,7 -122,7 +122,7 @@@ static void *alloc_insn_page(void
        return module_alloc(PAGE_SIZE);
  }
  
 -static void free_insn_page(void *page)
 +void __weak free_insn_page(void *page)
  {
        module_memfree(page);
  }
@@@ -483,11 -483,6 +483,6 @@@ static DECLARE_DELAYED_WORK(optimizing_
   */
  static void do_optimize_kprobes(void)
  {
-       /* Optimization never be done when disarmed */
-       if (kprobes_all_disarmed || !kprobes_allow_optimization ||
-           list_empty(&optimizing_list))
-               return;
        /*
         * The optimization/unoptimization refers online_cpus via
         * stop_machine() and cpu-hotplug modifies online_cpus.
         * This combination can cause a deadlock (cpu-hotplug try to lock
         * text_mutex but stop_machine can not be done because online_cpus
         * has been changed)
-        * To avoid this deadlock, we need to call get_online_cpus()
+        * To avoid this deadlock, caller must have locked cpu hotplug
         * for preventing cpu-hotplug outside of text_mutex locking.
         */
-       get_online_cpus();
+       lockdep_assert_cpus_held();
+       /* Optimization is never done when disarmed */
+       if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+           list_empty(&optimizing_list))
+               return;
        mutex_lock(&text_mutex);
        arch_optimize_kprobes(&optimizing_list);
        mutex_unlock(&text_mutex);
-       put_online_cpus();
  }
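
The hunk above captures the core of the rework: helpers that take text_mutex no longer take the hotplug lock themselves, they assert it with lockdep_assert_cpus_held() and rely on the caller's cpus_read_lock(). A minimal sketch of that caller/callee split follows; the example_* names are hypothetical and the code assumes kernel context with text_mutex available.

static void example_patch_text(void)
{
        lockdep_assert_cpus_held();     /* caller must hold the hotplug read lock */
        mutex_lock(&text_mutex);
        /* ... modify kernel text ... */
        mutex_unlock(&text_mutex);
}

static void example_caller(void)
{
        cpus_read_lock();               /* replaces the old get_online_cpus() */
        example_patch_text();
        cpus_read_unlock();
}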
  
  /*
@@@ -513,12 -513,13 +513,13 @@@ static void do_unoptimize_kprobes(void
  {
        struct optimized_kprobe *op, *tmp;
  
+       /* See comment in do_optimize_kprobes() */
+       lockdep_assert_cpus_held();
        /* Unoptimization must be done anytime */
        if (list_empty(&unoptimizing_list))
                return;
  
-       /* Ditto to do_optimize_kprobes */
-       get_online_cpus();
        mutex_lock(&text_mutex);
        arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
        /* Loop free_list for disarming */
                        list_del_init(&op->list);
        }
        mutex_unlock(&text_mutex);
-       put_online_cpus();
  }
  
  /* Reclaim all kprobes on the free_list */
@@@ -562,6 -562,7 +562,7 @@@ static void kick_kprobe_optimizer(void
  static void kprobe_optimizer(struct work_struct *work)
  {
        mutex_lock(&kprobe_mutex);
+       cpus_read_lock();
        /* Lock modules while optimizing kprobes */
        mutex_lock(&module_mutex);
  
        do_free_cleaned_kprobes();
  
        mutex_unlock(&module_mutex);
+       cpus_read_unlock();
        mutex_unlock(&kprobe_mutex);
  
        /* Step 5: Kick optimizer again if needed */
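
The optimizer thus takes its locks in a fixed order: kprobe_mutex outermost, then the hotplug read lock, then module_mutex, with text_mutex taken further down inside do_{un,}optimize_kprobes(). Condensed into one illustrative function (name hypothetical, bodies elided, assuming the kernel/kprobes.c context where kprobe_mutex lives), the nesting looks like this:

static void example_optimizer_pass(void)
{
        mutex_lock(&kprobe_mutex);      /* outermost: kprobe bookkeeping */
        cpus_read_lock();               /* pin hotplug before module/text locks */
        mutex_lock(&module_mutex);

        /* do_unoptimize_kprobes(); do_optimize_kprobes(); ... */

        mutex_unlock(&module_mutex);
        cpus_read_unlock();
        mutex_unlock(&kprobe_mutex);
}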
@@@ -650,9 -652,8 +652,8 @@@ static void optimize_kprobe(struct kpro
  /* Short cut to direct unoptimizing */
  static void force_unoptimize_kprobe(struct optimized_kprobe *op)
  {
-       get_online_cpus();
+       lockdep_assert_cpus_held();
        arch_unoptimize_kprobe(op);
-       put_online_cpus();
        if (kprobe_disabled(&op->kp))
                arch_disarm_kprobe(&op->kp);
  }
@@@ -791,6 -792,7 +792,7 @@@ static void try_to_optimize_kprobe(stru
                return;
  
        /* For preparing optimization, jump_label_text_reserved() is called */
+       cpus_read_lock();
        jump_label_lock();
        mutex_lock(&text_mutex);
  
  out:
        mutex_unlock(&text_mutex);
        jump_label_unlock();
+       cpus_read_unlock();
  }
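
When jump labels are involved, the hotplug lock moves outside jump_label_lock() as well, so every path ends up with the same cpus_read_lock() -> jump_label_lock() -> text_mutex nesting (register_aggr_kprobe() below follows suit). A sketch of that ordering, with a hypothetical function name and error handling elided:

static void example_prepare_optimization(void)
{
        cpus_read_lock();
        jump_label_lock();              /* needed around jump_label_text_reserved() users */
        mutex_lock(&text_mutex);

        /* ... prepare and queue the optimization ... */

        mutex_unlock(&text_mutex);
        jump_label_unlock();
        cpus_read_unlock();
}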
  
  #ifdef CONFIG_SYSCTL
@@@ -826,6 -829,7 +829,7 @@@ static void optimize_all_kprobes(void
        if (kprobes_allow_optimization)
                goto out;
  
+       cpus_read_lock();
        kprobes_allow_optimization = true;
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                        if (!kprobe_disabled(p))
                                optimize_kprobe(p);
        }
+       cpus_read_unlock();
        printk(KERN_INFO "Kprobes globally optimized\n");
  out:
        mutex_unlock(&kprobe_mutex);
@@@ -851,6 -856,7 +856,7 @@@ static void unoptimize_all_kprobes(void
                return;
        }
  
+       cpus_read_lock();
        kprobes_allow_optimization = false;
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                                unoptimize_kprobe(p, false);
                }
        }
+       cpus_read_unlock();
        mutex_unlock(&kprobe_mutex);
  
        /* Wait for unoptimizing completion */
@@@ -1010,14 -1017,11 +1017,11 @@@ static void arm_kprobe(struct kprobe *k
                arm_kprobe_ftrace(kp);
                return;
        }
-       /*
-        * Here, since __arm_kprobe() doesn't use stop_machine(),
-        * this doesn't cause deadlock on text_mutex. So, we don't
-        * need get_online_cpus().
-        */
+       cpus_read_lock();
        mutex_lock(&text_mutex);
        __arm_kprobe(kp);
        mutex_unlock(&text_mutex);
+       cpus_read_unlock();
  }
  
  /* Disarm a kprobe with text_mutex */
@@@ -1027,10 -1031,12 +1031,12 @@@ static void disarm_kprobe(struct kprob
                disarm_kprobe_ftrace(kp);
                return;
        }
-       /* Ditto */
+       cpus_read_lock();
        mutex_lock(&text_mutex);
        __disarm_kprobe(kp, reopt);
        mutex_unlock(&text_mutex);
+       cpus_read_unlock();
  }
  
  /*
@@@ -1298,13 -1304,10 +1304,10 @@@ static int register_aggr_kprobe(struct 
        int ret = 0;
        struct kprobe *ap = orig_p;
  
+       cpus_read_lock();
        /* For preparing optimization, jump_label_text_reserved() is called */
        jump_label_lock();
-       /*
-        * Get online CPUs to avoid text_mutex deadlock.with stop machine,
-        * which is invoked by unoptimize_kprobe() in add_new_kprobe()
-        */
-       get_online_cpus();
        mutex_lock(&text_mutex);
  
        if (!kprobe_aggrprobe(orig_p)) {
  
  out:
        mutex_unlock(&text_mutex);
-       put_online_cpus();
        jump_label_unlock();
+       cpus_read_unlock();
  
        if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
                ap->flags &= ~KPROBE_FLAG_DISABLED;
@@@ -1555,9 -1558,12 +1558,12 @@@ int register_kprobe(struct kprobe *p
                goto out;
        }
  
-       mutex_lock(&text_mutex);        /* Avoiding text modification */
+       cpus_read_lock();
+       /* Prevent text modification */
+       mutex_lock(&text_mutex);
        ret = prepare_kprobe(p);
        mutex_unlock(&text_mutex);
+       cpus_read_unlock();
        if (ret)
                goto out;
  
  
        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);
  out:
        mutex_unlock(&kprobe_mutex);
  