Merge commit '8700c95adb03' into timers/nohz

author Frederic Weisbecker <[email protected]>

Thu, 2 May 2013 15:37:49 +0000 (17:37 +0200)

committer Frederic Weisbecker <[email protected]>

Thu, 2 May 2013 15:54:19 +0000 (17:54 +0200)
author Frederic Weisbecker <[email protected]>
Thu, 2 May 2013 15:37:49 +0000 (17:37 +0200)
committer Frederic Weisbecker <[email protected]>
Thu, 2 May 2013 15:54:19 +0000 (17:54 +0200)
diff --combined Documentation/RCU/stallwarn.txt

index b336755b71ed0d2926741a93e5a6fa8952e99fa2,e38b8df3d727e77b896e1cd2b1e925b54f083aad..8e9359de1d28b2e845d25425be72451b3f6be4f7
--- 1/Documentation/RCU/stallwarn.txt
--- 2/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@@ -92,14 -92,14 +92,14 @@@ If the CONFIG_RCU_CPU_STALL_INFO kerne
   more information is printed with the stall-warning message, for example:
   
         INFO: rcu_preempt detected stall on CPU
-       0: (63959 ticks this GP) idle=241/3fffffffffffffff/0
+       0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
            (t=65000 jiffies)
   
   In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
   printed:
   
         INFO: rcu_preempt detected stall on CPU
-       0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
+       0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
            (t=65000 jiffies)
   
   The "(64628 ticks this GP)" indicates that this CPU has taken more
@@@ -116,13 -116,28 +116,28 @@@ number between the two "/"s is the valu
   be a small positive number if in the idle loop and a very large positive
   number (as shown above) otherwise.
   
- For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
- not in the process of trying to force itself into dyntick-idle state, the
- "." indicates that the CPU has not given up forcing RCU into dyntick-idle
- mode (it would be "H" otherwise), and the "timer not pending" indicates
- that the CPU has not recently forced RCU into dyntick-idle mode (it
- would otherwise indicate the number of microseconds remaining in this
- forced state).
+ The "softirq=" portion of the message tracks the number of RCU softirq
+ handlers that the stalled CPU has executed.  The number before the "/"
+ is the number that had executed since boot at the time that this CPU
+ last noted the beginning of a grace period, which might be the current
+ (stalled) grace period, or it might be some earlier grace period (for
+ example, if the CPU might have been in dyntick-idle mode for an extended
+ time period).  The number after the "/" is the number that have executed
+ since boot until the current time.  If this latter number stays constant
+ across repeated stall-warning messages, it is possible that RCU's softirq
+ handlers are no longer able to execute on this CPU.  This can happen if
+ the stalled CPU is spinning with interrupts disabled, or, in -rt
+ kernels, if a high-priority process is starving RCU's softirq handler.
+ 
+ For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
+ low-order 16 bits (in hex) of the jiffies counter when this CPU last
+ invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
+ rcu_accelerate_cbs() from rcu_prepare_for_idle().  The "nonlazy_posted:"
+ prints the number of non-lazy callbacks posted since the last call to
+ rcu_needs_cpu().  Finally, an "L" indicates that there are currently
+ no non-lazy callbacks ("." is printed otherwise, as shown above) and
+ "D" indicates that dyntick-idle processing is enabled ("." is printed
+ otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
   
   
   Multiple Warnings From One Stall
@@@ -176,7 -191,7 +191,7 @@@ o  A CPU-bound real-time task in a CONFI
   o     A hardware or software issue shuts off the scheduler-clock
         interrupt on a CPU that is not in dyntick-idle mode.  This
         problem really has happened, and seems to be most likely to
- -      result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels.
+ +      result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
   
   o     A bug in the RCU implementation.
   
diff --combined Documentation/kernel-parameters.txt

index 4865e9bfd08d0e9c294d723e2a6ed757f21573f4,de12397b60a9b1c64d5dec815beedcfedf8d8821..7d55ebb5660cd8c2fabfe2673b6278e229b0bcab
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -44,6 -44,7 +44,7 @@@ parameter is applicable
         AVR32   AVR32 architecture is enabled.
         AX25    Appropriate AX.25 support is enabled.
         BLACKFIN Blackfin architecture is enabled.
+       CLK     Common clock infrastructure is enabled.
         DRM     Direct Rendering Management support is enabled.
         DYNAMIC_DEBUG Build in debug messages and enable them at runtime
         EDD     BIOS Enhanced Disk Drive Services (EDD) is enabled
@@@ -320,6 -321,13 +321,13 @@@ bytes respectively. Such letter suffixe
                         on: enable for both 32- and 64-bit processes
                         off: disable for both 32- and 64-bit processes
   
+       alloc_snapshot  [FTRACE]
+                       Allocate the ftrace snapshot buffer on boot up when the
+                       main buffer is allocated. This is handy if debugging
+                       and you need to use tracing_snapshot() on boot up, and
+                       do not want to use tracing_snapshot_alloc() as it needs
+                       to be done where GFP_KERNEL allocations are allowed.
+ 
         amd_iommu=      [HW,X86-64]
                         Pass parameters to the AMD IOMMU driver in the system.
                         Possible values are:
@@@ -465,6 -473,13 +473,13 @@@
   
         cio_ignore=     [S390]
                         See Documentation/s390/CommonIO for details.
+       clk_ignore_unused
+                       [CLK]
+                       Keep all clocks already enabled by bootloader on,
+                       even if no driver has claimed them. This is useful
+                       for debug and development, but should not be
+                       needed on a platform with proper driver support.
+                       For more information, see Documentation/clk.txt.
   
         clock=          [BUGS=X86-32, HW] gettimeofday clocksource override.
                         [Deprecated]
@@@ -596,9 -611,6 +611,6 @@@
                         is selected automatically. Check
                         Documentation/kdump/kdump.txt for further details.
   
-       crashkernel_low=size[KMG]
-                       [KNL, x86] parts under 4G.
- 
         crashkernel=range1:size1[,range2:size2,...][@offset]
                         [KNL] Same as above, but depends on the memory
                         in the running system. The syntax of range is
@@@ -606,6 -618,26 +618,26 @@@
                         a memory unit (amount[KMG]). See also
                         Documentation/kdump/kdump.txt for an example.
   
+       crashkernel=size[KMG],high
+                       [KNL, x86_64] range could be above 4G. Allow kernel
+                       to allocate physical memory region from top, so it could
+                       be above 4G if the system has more than 4G RAM installed.
+                       Otherwise memory region will be allocated below 4G, if
+                       available.
+                       It will be ignored if crashkernel=X is specified.
+       crashkernel=size[KMG],low
+                       [KNL, x86_64] range under 4G. When crashkernel=X,high
+                       is passed, the kernel may allocate the physical memory
+                       region above 4G, which causes the second kernel to crash
+                       on systems that require some amount of low memory, e.g.
+                       swiotlb requires at least 64M+32K low memory.  The kernel
+                       would try to allocate 72M below 4G automatically.
+                       This option lets the user specify their own low range
+                       under 4G for the second kernel instead.
+                       0: to disable low allocation.
+                       It will be ignored when crashkernel=X,high is not used
+                       or memory reserved is below 4G.
+ 
         cs89x0_dma=     [HW,NET]
                         Format: <dma>
   
@@@ -788,6 -820,12 +820,12 @@@
         edd=            [EDD]
                         Format: {"off" | "on" | "skip[mbr]"}
   
+       efi_no_storage_paranoia [EFI; X86]
+                       Using this parameter you can use more than 50% of
+                       your efi variable storage. Use this parameter only if
+                       you are really sure that your UEFI does sane gc and
+                       fulfills the spec, otherwise your board may brick.
+ 
         eisa_irq_edge=  [PARISC,HW]
                         See header of drivers/parisc/eisa.c.
   
@@@ -1913,14 -1951,6 +1951,14 @@@
                         Valid arguments: on, off
                         Default: on
   
+ +      nohz_full=      [KNL,BOOT]
+ +                      In kernels built with CONFIG_NO_HZ_FULL=y, set
+ +                      the specified list of CPUs whose tick will be stopped
+ +                      whenever possible. The boot CPU will be forced outside
+ +                      the range to maintain the timekeeping.
+ +                      The CPUs in this range must also be included in the
+ +                      rcu_nocbs= set.
+ +
         noiotrap        [SH] Disables trapped I/O port accesses.
   
         noirqdebug      [X86-32] Disables the code which attempts to detect and
@@@ -2469,9 -2499,12 +2507,12 @@@
                         In kernels built with CONFIG_RCU_NOCB_CPU=y, set
                         the specified list of CPUs to be no-callback CPUs.
                         Invocation of these CPUs' RCU callbacks will
-                       be offloaded to "rcuoN" kthreads created for
-                       that purpose.  This reduces OS jitter on the
+                       be offloaded to "rcuox/N" kthreads created for
+                       that purpose, where "x" is "b" for RCU-bh, "p"
+                       for RCU-preempt, and "s" for RCU-sched, and "N"
+                       is the CPU number.  This reduces OS jitter on the
                         offloaded CPUs, which can be useful for HPC and
+ 
                         real-time workloads.  It can also improve energy
                         efficiency for asymmetric multiprocessors.
   
@@@ -2495,6 -2528,17 +2536,17 @@@
                         leaf rcu_node structure.  Useful for very large
                         systems.
   
+       rcutree.jiffies_till_first_fqs= [KNL,BOOT]
+                       Set delay from grace-period initialization to
+                       first attempt to force quiescent states.
+                       Units are jiffies, minimum value is zero,
+                       and maximum value is HZ.
+ 
+       rcutree.jiffies_till_next_fqs= [KNL,BOOT]
+                       Set delay between subsequent attempts to force
+                       quiescent states.  Units are jiffies, minimum
+                       value is one, and maximum value is HZ.
+ 
         rcutree.qhimark=        [KNL,BOOT]
                         Set threshold of queued
                         RCU callbacks over which batch limiting is disabled.
@@@ -2509,16 -2553,15 +2561,15 @@@
         rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
                         Set timeout for RCU CPU stall warning messages.
   
-       rcutree.jiffies_till_first_fqs= [KNL,BOOT]
-                       Set delay from grace-period initialization to
-                       first attempt to force quiescent states.
-                       Units are jiffies, minimum value is zero,
-                       and maximum value is HZ.
+       rcutree.rcu_idle_gp_delay=      [KNL,BOOT]
+                       Set wakeup interval for idle CPUs that have
+                       RCU callbacks (RCU_FAST_NO_HZ=y).
   
-       rcutree.jiffies_till_next_fqs= [KNL,BOOT]
-                       Set delay between subsequent attempts to force
-                       quiescent states.  Units are jiffies, minimum
-                       value is one, and maximum value is HZ.
+       rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
+                       Set wakeup interval for idle CPUs that have
+                       only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
+                       Lazy RCU callbacks are those which RCU can
+                       prove do nothing more than free memory.
   
         rcutorture.fqs_duration= [KNL,BOOT]
                         Set duration of force_quiescent_state bursts.
@@@ -3230,6 -3273,15 +3281,15 @@@
                         or other driver-specific files in the
                         Documentation/watchdog/ directory.
   
+       workqueue.disable_numa
+                       By default, all work items queued to unbound
+                       workqueues are affine to the NUMA nodes they're
+                       issued on, which results in better behavior in
+                       general.  If NUMA affinity needs to be disabled for
+                       whatever reason, this option can be used.  Note
+                       that this also can be controlled per-workqueue for
+                       workqueues visible under /sys/bus/workqueue/.
+ 
         x2apic_phys     [X86-64,APIC] Use x2apic physical mode instead of
                         default x2apic cluster mode on platforms
                         supporting x2apic.
diff --combined include/linux/perf_event.h

index 0140830225e216103742f1dc1f6736d461f86d8a,e0373d26c24454a313fa47bb96d5211458db1c2d..f463a46424e240715f2cc0c3bbf58f38955db1a8
--- 1/include/linux/perf_event.h
--- 2/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@@ -21,7 -21,6 +21,6 @@@
    */
   
   #ifdef CONFIG_PERF_EVENTS
- # include <linux/cgroup.h>
   # include <asm/perf_event.h>
   # include <asm/local64.h>
   #endif
@@@ -128,6 -127,7 +127,7 @@@ struct hw_perf_event 
                         int             event_base_rdpmc;
                         int             idx;
                         int             last_cpu;
+                       int             flags;
   
                         struct hw_perf_event_extra extra_reg;
                         struct hw_perf_event_extra branch_reg;
@@@ -299,22 -299,7 +299,7 @@@ struct swevent_hlist 
   #define PERF_ATTACH_GROUP     0x02
   #define PERF_ATTACH_TASK      0x04
   
- #ifdef CONFIG_CGROUP_PERF
- /*
-  * perf_cgroup_info keeps track of time_enabled for a cgroup.
-  * This is a per-cpu dynamically allocated data structure.
-  */
- struct perf_cgroup_info {
-       u64                             time;
-       u64                             timestamp;
- };
- 
- struct perf_cgroup {
-       struct                          cgroup_subsys_state css;
-       struct                          perf_cgroup_info *info; /* timing info, one per cpu */
- };
- #endif
- 
+ struct perf_cgroup;
   struct ring_buffer;
   
   /**
@@@ -583,11 -568,13 +568,13 @@@ struct perf_sample_data 
                 u32     reserved;
         }                               cpu_entry;
         u64                             period;
+       union  perf_mem_data_src        data_src;
         struct perf_callchain_entry     *callchain;
         struct perf_raw_record          *raw;
         struct perf_branch_stack        *br_stack;
         struct perf_regs_user           regs_user;
         u64                             stack_user_size;
+       u64                             weight;
   };
   
   static inline void perf_sample_data_init(struct perf_sample_data *data,
@@@ -601,6 -588,8 +588,8 @@@
         data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
         data->regs_user.regs = NULL;
         data->stack_user_size = 0;
+       data->weight = 0;
+       data->data_src.val = 0;
   }
   
   extern void perf_output_sample(struct perf_output_handle *handle,
@@@ -799,12 -788,12 +788,18 @@@ static inline int __perf_event_disable(
   static inline void perf_event_task_tick(void)                         { }
   #endif
   
+ +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
+ +extern bool perf_event_can_stop_tick(void);
+ +#else
+ +static inline bool perf_event_can_stop_tick(void)                     { return true; }
+ +#endif
+ +
+ #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+ extern void perf_restore_debug_store(void);
+ #else
+ static inline void perf_restore_debug_store(void)                     { }
+ #endif
+ 
   #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
   
   /*
@@@ -831,6 -820,7 +826,7 @@@ do {                                                                       
   struct perf_pmu_events_attr {
         struct device_attribute attr;
         u64 id;
+       const char *event_str;
   };
   
   #define PMU_EVENT_ATTR(_name, _var, _id, _show)                               \
diff --combined include/linux/rcupdate.h

index 8e0948c872fc283bb29b711c19c550bdb097777d,9ed2c9a4de45d332e69eb37d3c7b51014d203116..4ccd68e49b00dbb9ef88006d31f9cfbadec3272a
--- 1/include/linux/rcupdate.h
--- 2/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@@ -80,6 -80,7 +80,7 @@@ extern void do_trace_rcu_torture_read(c
   #define UINT_CMP_LT(a, b)     (UINT_MAX / 2 < (a) - (b))
   #define ULONG_CMP_GE(a, b)    (ULONG_MAX / 2 >= (a) - (b))
   #define ULONG_CMP_LT(a, b)    (ULONG_MAX / 2 < (a) - (b))
+ #define ulong2long(a)         (*(long *)(&(a)))
   
   /* Exported common interfaces */
   
@@@ -999,11 -1000,4 +1000,11 @@@ static inline notrace void rcu_read_unl
   #define kfree_rcu(ptr, rcu_head)                                      \
         __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
   
+ +#ifdef CONFIG_RCU_NOCB_CPU
+ +extern bool rcu_is_nocb_cpu(int cpu);
+ +#else
+ +static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+ +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+ +
+ +
   #endif /* __LINUX_RCUPDATE_H */
diff --combined include/linux/sched.h

index a74adedcdd10b0bea041ad1aa5bc814eb718458d,981ab688725994fd8a678bf3dcf6a2b41ad84375..ebf7095158a9ebe88ef289b11b153daebe82cb60
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -151,9 -151,10 +151,10 @@@ print_cfs_rq(struct seq_file *m, int cp
   #define TASK_DEAD             64
   #define TASK_WAKEKILL         128
   #define TASK_WAKING           256
- #define TASK_STATE_MAX                512
+ #define TASK_PARKED           512
+ #define TASK_STATE_MAX                1024
   
- #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+ #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
   
   extern char ___assert_task_state[1 - 2*!!(
                 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@@ -230,7 -231,7 +231,7 @@@ extern void init_idle_bootup_task(struc
   
   extern int runqueue_is_locked(int cpu);
   
- -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
+ +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
   extern void nohz_balance_enter_idle(int cpu);
   extern void set_cpu_sd_state_idle(void);
   extern int get_nohz_timer_target(void);
@@@ -308,7 -309,6 +309,6 @@@ extern signed long schedule_timeout_kil
   extern signed long schedule_timeout_uninterruptible(signed long timeout);
   asmlinkage void schedule(void);
   extern void schedule_preempt_disabled(void);
- extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
   
   struct nsproxy;
   struct user_namespace;
@@@ -808,6 -808,8 +808,8 @@@ struct sched_domain 
         unsigned int wake_idx;
         unsigned int forkexec_idx;
         unsigned int smt_gain;
+ 
+       int nohz_idle;                  /* NOHZ IDLE status */
         int flags;                      /* See SD_* */
         int level;
   
@@@ -1105,8 -1107,10 +1107,10 @@@ struct task_struct 
         int exit_code, exit_signal;
         int pdeath_signal;  /*  The signal sent when the parent dies  */
         unsigned int jobctl;    /* JOBCTL_*, siglock protected */
-       /* ??? */
+ 
+       /* Used for emulating ABI behavior of previous Linux versions */
         unsigned int personality;
+ 
         unsigned did_exec:1;
         unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
                                  * execve */
@@@ -1624,7 -1628,7 +1628,7 @@@ extern void thread_group_cputime_adjust
   #define PF_SWAPWRITE  0x00800000      /* Allowed to write to swap */
   #define PF_SPREAD_PAGE        0x01000000      /* Spread page cache over cpuset */
   #define PF_SPREAD_SLAB        0x02000000      /* Spread some slab caches over cpuset */
- #define PF_THREAD_BOUND       0x04000000      /* Thread bound to specific cpu */
+ #define PF_NO_SETAFFINITY 0x04000000  /* Userland is not allowed to meddle with cpus_allowed */
   #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
   #define PF_MEMPOLICY  0x10000000      /* Non-default NUMA mempolicy */
   #define PF_MUTEX_TESTER       0x20000000      /* Thread belongs to the rt mutex tester */
@@@ -1758,13 -1762,13 +1762,13 @@@ static inline int set_cpus_allowed_ptr(
   }
   #endif
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   void calc_load_enter_idle(void);
   void calc_load_exit_idle(void);
   #else
   static inline void calc_load_enter_idle(void) { }
   static inline void calc_load_exit_idle(void) { }
- -#endif /* CONFIG_NO_HZ */
+ +#endif /* CONFIG_NO_HZ_COMMON */
   
   #ifndef CONFIG_CPUMASK_OFFSTACK
   static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
@@@ -1850,16 -1854,10 +1854,16 @@@ extern void idle_task_exit(void)
   static inline void idle_task_exit(void) {}
   #endif
   
- -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- -extern void wake_up_idle_cpu(int cpu);
+ +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+ +extern void wake_up_nohz_cpu(int cpu);
   #else
- -static inline void wake_up_idle_cpu(int cpu) { }
+ +static inline void wake_up_nohz_cpu(int cpu) { }
+ +#endif
+ +
+ +#ifdef CONFIG_NO_HZ_FULL
+ +extern bool sched_can_stop_tick(void);
+ +#else
+ +static inline bool sched_can_stop_tick(void) { return false; }
   #endif
   
   #ifdef CONFIG_SCHED_AUTOGROUP
@@@ -2458,6 -2456,47 +2462,47 @@@ static inline int spin_needbreak(spinlo
   #endif
   }
   
+ /*
+  * Idle thread specific functions to determine the need_resched
+  * polling state. We have two versions, one based on TS_POLLING in
+  * thread_info.status and one based on TIF_POLLING_NRFLAG in
+  * thread_info.flags
+  */
+ #ifdef TS_POLLING
+ static inline int tsk_is_polling(struct task_struct *p)
+ {
+       return task_thread_info(p)->status & TS_POLLING;
+ }
+ static inline void current_set_polling(void)
+ {
+       current_thread_info()->status |= TS_POLLING;
+ }
+ 
+ static inline void current_clr_polling(void)
+ {
+       current_thread_info()->status &= ~TS_POLLING;
+       smp_mb__after_clear_bit();
+ }
+ #elif defined(TIF_POLLING_NRFLAG)
+ static inline int tsk_is_polling(struct task_struct *p)
+ {
+       return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+ }
+ static inline void current_set_polling(void)
+ {
+       set_thread_flag(TIF_POLLING_NRFLAG);
+ }
+ 
+ static inline void current_clr_polling(void)
+ {
+       clear_thread_flag(TIF_POLLING_NRFLAG);
+ }
+ #else
+ static inline int tsk_is_polling(struct task_struct *p) { return 0; }
+ static inline void current_set_polling(void) { }
+ static inline void current_clr_polling(void) { }
+ #endif
+ 
   /*
    * Thread group CPU time accounting.
    */
diff --combined init/Kconfig

index 8f97a7407714736cf6507c1d6f897f3303f12e6f,4367e1379002d8368fbfe9f9a4101af9c7efcfa4..66f67afad4fad4758d802938d5c3a4fefbc4dbf4
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -28,10 -28,6 +28,6 @@@ config BUILDTIME_EXTABLE_SOR
   
   menu "General setup"
   
- config EXPERIMENTAL
-       bool
-       default y
- 
   config BROKEN
         bool
   
@@@ -306,7 -302,7 +302,7 @@@ choic
   # Kind of a stub config for the pure tick based cputime accounting
   config TICK_CPU_ACCOUNTING
         bool "Simple tick based cputime accounting"
- -      depends on !S390
+ +      depends on !S390 && !NO_HZ_FULL
         help
           This is the basic tick based cputime accounting that maintains
           statistics about user, system and idle time spent on per jiffies
@@@ -316,7 -312,7 +312,7 @@@
   
   config VIRT_CPU_ACCOUNTING_NATIVE
         bool "Deterministic task and CPU time accounting"
- -      depends on HAVE_VIRT_CPU_ACCOUNTING
+ +      depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL
         select VIRT_CPU_ACCOUNTING
         help
           Select this option to enable more accurate task and CPU time
@@@ -346,7 -342,7 +342,7 @@@ config VIRT_CPU_ACCOUNTING_GE
   
   config IRQ_TIME_ACCOUNTING
         bool "Fine granularity task level IRQ time accounting"
- -      depends on HAVE_IRQ_TIME_ACCOUNTING
+ +      depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL
         help
           Select this option to enable fine granularity task irq time
           accounting. This is done by reading a timestamp on each
@@@ -580,16 -576,19 +576,19 @@@ config RCU_FANOUT_EXAC
   
   config RCU_FAST_NO_HZ
         bool "Accelerate last non-dyntick-idle CPU's grace periods"
- -      depends on NO_HZ && SMP
+ +      depends on NO_HZ_COMMON && SMP
         default n
         help
-         This option causes RCU to attempt to accelerate grace periods in
-         order to allow CPUs to enter dynticks-idle state more quickly.
-         On the other hand, this option increases the overhead of the
-         dynticks-idle checking, thus degrading scheduling latency.
+         This option permits CPUs to enter dynticks-idle state even if
+         they have RCU callbacks queued, and prevents RCU from waking
+         these CPUs up more than roughly once every four jiffies (by
+         default, you can adjust this using the rcutree.rcu_idle_gp_delay
+         parameter), thus improving energy efficiency.  On the other
+         hand, this option increases the duration of RCU grace periods,
+         for example, slowing down synchronize_rcu().
   
-         Say Y if energy efficiency is critically important, and you don't
-               care about real-time response.
+         Say Y if energy efficiency is critically important, and you
+               don't care about increased grace-period durations.
   
           Say N if you are unsure.
   
@@@ -656,7 -655,7 +655,7 @@@ config RCU_BOOST_DELA
           Accept the default if unsure.
   
   config RCU_NOCB_CPU
-       bool "Offload RCU callback processing from boot-selected CPUs"
+       bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL)"
         depends on TREE_RCU || TREE_PREEMPT_RCU
         default n
         help
@@@ -667,16 -666,56 +666,56 @@@
   
           This option offloads callback invocation from the set of
           CPUs specified at boot time by the rcu_nocbs parameter.
-         For each such CPU, a kthread ("rcuoN") will be created to
-         invoke callbacks, where the "N" is the CPU being offloaded.
-         Nothing prevents this kthread from running on the specified
-         CPUs, but (1) the kthreads may be preempted between each
-         callback, and (2) affinity or cgroups can be used to force
-         the kthreads to run on whatever set of CPUs is desired.
- 
-         Say Y here if you want reduced OS jitter on selected CPUs.
+         For each such CPU, a kthread ("rcuox/N") will be created to
+         invoke callbacks, where the "N" is the CPU being offloaded,
+         and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+         "s" for RCU-sched.  Nothing prevents this kthread from running
+         on the specified CPUs, but (1) the kthreads may be preempted
+         between each callback, and (2) affinity or cgroups can be used
+         to force the kthreads to run on whatever set of CPUs is desired.
+ 
+         Say Y here if you want to help to debug reduced OS jitter.
           Say N here if you are unsure.
   
+ choice
+       prompt "Build-forced no-CBs CPUs"
+       default RCU_NOCB_CPU_NONE
+       help
+         This option allows no-CBs CPUs to be specified at build time.
+         Additional no-CBs CPUs may be specified by the rcu_nocbs=
+         boot parameter.
+ 
+ config RCU_NOCB_CPU_NONE
+       bool "No build_forced no-CBs CPUs"
+       depends on RCU_NOCB_CPU
+       help
+         This option does not force any of the CPUs to be no-CBs CPUs.
+         Only CPUs designated by the rcu_nocbs= boot parameter will be
+         no-CBs CPUs.
+ 
+ config RCU_NOCB_CPU_ZERO
+       bool "CPU 0 is a build_forced no-CBs CPU"
+       depends on RCU_NOCB_CPU
+       help
+         This option forces CPU 0 to be a no-CBs CPU.  Additional CPUs
+         may be designated as no-CBs CPUs using the rcu_nocbs= boot
+         parameter.
+ 
+         Select this if CPU 0 needs to be a no-CBs CPU for real-time
+         or energy-efficiency reasons.
+ 
+ config RCU_NOCB_CPU_ALL
+       bool "All CPUs are build_forced no-CBs CPUs"
+       depends on RCU_NOCB_CPU
+       help
+         This option forces all CPUs to be no-CBs CPUs.  The rcu_nocbs=
+         boot parameter will be ignored.
+ 
+         Select this if all CPUs need to be no-CBs CPUs for real-time
+         or energy-efficiency reasons.
+ 
+ endchoice
+ 
   endmenu # "RCU Subsystem"
   
   config IKCONFIG
diff --combined init/main.c

index 2acb5bbde99b55c77818c2053e6b36eca7d61712,12c366944dbd51bccbe2c03f7af6f91485526d87..1952bf2f6875b1d87dc2bacb73aad6f2d8db3e70
--- 1/init/main.c
--- 2/init/main.c
+++ b/init/main.c
@@@ -9,6 -9,8 +9,8 @@@
    *  Simplified starting of init:  Michael A. Griffith <[email protected]> 
    */
   
+ #define DEBUG         /* Enable initcall_debug */
+ 
   #include <linux/types.h>
   #include <linux/module.h>
   #include <linux/proc_fs.h>
@@@ -174,8 -176,8 +176,8 @@@ static int __init obsolete_checksetup(c
                                 if (line[n] == '\0' || line[n] == '=')
                                         had_early_param = 1;
                         } else if (!p->setup_func) {
-                               printk(KERN_WARNING "Parameter %s is obsolete,"
-                                      " ignored\n", p->str);
+                               pr_warn("Parameter %s is obsolete, ignored\n",
+                                       p->str);
                                 return 1;
                         } else if (p->setup_func(line + n))
                                 return 1;
@@@ -384,7 -386,7 +386,7 @@@ static noinline void __init_refok rest_
         init_idle_bootup_task(current);
         schedule_preempt_disabled();
         /* Call into cpu_idle with preempt disabled */
-       cpu_idle();
+       cpu_startup_entry(CPUHP_ONLINE);
   }
   
   /* Check for early params. */
@@@ -398,8 -400,7 +400,7 @@@ static int __init do_early_param(char *
                      strcmp(p->str, "earlycon") == 0)
                 ) {
                         if (p->setup_func(val) != 0)
-                               printk(KERN_WARNING
-                                      "Malformed early option '%s'\n", param);
+                               pr_warn("Malformed early option '%s'\n", param);
                 }
         }
         /* We accept everything at this stage. */
@@@ -497,7 -498,7 +498,7 @@@ asmlinkage void __init start_kernel(voi
         tick_init();
         boot_cpu_init();
         page_address_init();
-       printk(KERN_NOTICE "%s", linux_banner);
+       pr_notice("%s", linux_banner);
         setup_arch(&command_line);
         mm_init_owner(&init_mm, &init_task);
         mm_init_cpumask(&init_mm);
@@@ -509,7 -510,7 +510,7 @@@
         build_all_zonelists(NULL, NULL);
         page_alloc_init();
   
-       printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+       pr_notice("Kernel command line: %s\n", boot_command_line);
         parse_early_param();
         parse_args("Booting kernel", static_command_line, __start___param,
                    __stop___param - __start___param,
@@@ -539,15 -540,11 +540,12 @@@
          * fragile until we cpu_idle() for the first time.
          */
         preempt_disable();
-       if (!irqs_disabled()) {
-               printk(KERN_WARNING "start_kernel(): bug: interrupts were "
-                               "enabled *very* early, fixing it\n");
+       if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
                 local_irq_disable();
-       }
         idr_init_cache();
         perf_event_init();
         rcu_init();
+ +      tick_nohz_init();
         radix_tree_init();
         /* init some links before init_ISA_irqs() */
         early_irq_init();
@@@ -559,9 -556,7 +557,7 @@@
         time_init();
         profile_init();
         call_function_init();
-       if (!irqs_disabled())
-               printk(KERN_CRIT "start_kernel(): bug: interrupts were "
-                                "enabled early\n");
+       WARN(!irqs_disabled(), "Interrupts were enabled early\n");
         early_boot_irqs_disabled = false;
         local_irq_enable();
   
@@@ -588,8 -583,7 +584,7 @@@
   #ifdef CONFIG_BLK_DEV_INITRD
         if (initrd_start && !initrd_below_start_ok &&
             page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
-               printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
-                   "disabling it.\n",
+               pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
                     page_to_pfn(virt_to_page((void *)initrd_start)),
                     min_low_pfn);
                 initrd_start = 0;
@@@ -668,14 -662,14 +663,14 @@@ static int __init_or_module do_one_init
         unsigned long long duration;
         int ret;
   
-       printk(KERN_DEBUG "calling  %pF @ %i\n", fn, task_pid_nr(current));
+       pr_debug("calling  %pF @ %i\n", fn, task_pid_nr(current));
         calltime = ktime_get();
         ret = fn();
         rettime = ktime_get();
         delta = ktime_sub(rettime, calltime);
         duration = (unsigned long long) ktime_to_ns(delta) >> 10;
-       printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", fn,
-               ret, duration);
+       pr_debug("initcall %pF returned %d after %lld usecs\n",
+                fn, ret, duration);
   
         return ret;
   }
@@@ -703,9 -697,7 +698,7 @@@ int __init_or_module do_one_initcall(in
                 strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
                 local_irq_enable();
         }
-       if (msgbuf[0]) {
-               printk("initcall %pF returned with %s\n", fn, msgbuf);
-       }
+       WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf);
   
         return ret;
   }
@@@ -833,8 -825,7 +826,7 @@@ static int __ref kernel_init(void *unus
         if (ramdisk_execute_command) {
                 if (!run_init_process(ramdisk_execute_command))
                         return 0;
-               printk(KERN_WARNING "Failed to execute %s\n",
-                               ramdisk_execute_command);
+               pr_err("Failed to execute %s\n", ramdisk_execute_command);
         }
   
         /*
@@@ -846,8 -837,8 +838,8 @@@
         if (execute_command) {
                 if (!run_init_process(execute_command))
                         return 0;
-               printk(KERN_WARNING "Failed to execute %s.  Attempting "
-                                       "defaults...\n", execute_command);
+               pr_err("Failed to execute %s.  Attempting defaults...\n",
+                       execute_command);
         }
         if (!run_init_process("/sbin/init") ||
             !run_init_process("/etc/init") ||
@@@ -892,7 -883,7 +884,7 @@@ static noinline void __init kernel_init
   
         /* Open the /dev/console on the rootfs, this should never fail */
         if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
-               printk(KERN_WARNING "Warning: unable to open an initial console.\n");
+               pr_err("Warning: unable to open an initial console.\n");
   
         (void) sys_dup(0);
         (void) sys_dup(0);
diff --combined kernel/events/core.c

index ddb993b52190e9504b2b4505ec263034eb65801c,3820e3cefbaea869541fcdabadb909174f83a42f..6b41c1899a8b00acc0ca48ae30b0e8dfbdd2ad9d
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -18,7 -18,6 +18,7 @@@
   #include <linux/poll.h>
   #include <linux/slab.h>
   #include <linux/hash.h>
+ +#include <linux/tick.h>
   #include <linux/sysfs.h>
   #include <linux/dcache.h>
   #include <linux/percpu.h>
@@@ -38,6 -37,7 +38,7 @@@
   #include <linux/ftrace_event.h>
   #include <linux/hw_breakpoint.h>
   #include <linux/mm_types.h>
+ #include <linux/cgroup.h>
   
   #include "internal.h"
   
@@@ -234,6 -234,20 +235,20 @@@ static void perf_ctx_unlock(struct perf
   
   #ifdef CONFIG_CGROUP_PERF
   
+ /*
+  * perf_cgroup_info keeps track of time_enabled for a cgroup.
+  * This is a per-cpu dynamically allocated data structure.
+  */
+ struct perf_cgroup_info {
+       u64                             time;
+       u64                             timestamp;
+ };
+ 
+ struct perf_cgroup {
+       struct cgroup_subsys_state      css;
+       struct perf_cgroup_info __percpu *info;
+ };
+ 
   /*
    * Must ensure cgroup is pinned (css_get) before calling
    * this function. In other words, we cannot call this function
@@@ -252,7 -266,22 +267,22 @@@ perf_cgroup_match(struct perf_event *ev
         struct perf_event_context *ctx = event->ctx;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
   
-       return !event->cgrp || event->cgrp == cpuctx->cgrp;
+       /* @event doesn't care about cgroup */
+       if (!event->cgrp)
+               return true;
+ 
+       /* wants specific cgroup scope but @cpuctx isn't associated with any */
+       if (!cpuctx->cgrp)
+               return false;
+ 
+       /*
+        * Cgroup scoping is recursive.  An event enabled for a cgroup is
+        * also enabled for all its descendant cgroups.  If @cpuctx's
+        * cgroup is a descendant of @event's (the test covers identity
+        * case), it's a match.
+        */
+       return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+                                   event->cgrp->css.cgroup);
   }
   
   static inline bool perf_tryget_cgroup(struct perf_event *event)
@@@ -656,12 -685,8 +686,12 @@@ static void perf_pmu_rotate_start(struc
   
         WARN_ON(!irqs_disabled());
   
- -      if (list_empty(&cpuctx->rotation_list))
+ +      if (list_empty(&cpuctx->rotation_list)) {
+ +              int was_empty = list_empty(head);
                 list_add(&cpuctx->rotation_list, head);
+ +              if (was_empty)
+ +                      tick_nohz_full_kick();
+ +      }
   }
   
   static void get_ctx(struct perf_event_context *ctx)
@@@ -966,9 -991,15 +996,15 @@@ static void perf_event__header_size(str
         if (sample_type & PERF_SAMPLE_PERIOD)
                 size += sizeof(data->period);
   
+       if (sample_type & PERF_SAMPLE_WEIGHT)
+               size += sizeof(data->weight);
+ 
         if (sample_type & PERF_SAMPLE_READ)
                 size += event->read_size;
   
+       if (sample_type & PERF_SAMPLE_DATA_SRC)
+               size += sizeof(data->data_src.val);
+ 
         event->header_size = size;
   }
   
@@@ -2560,16 -2591,6 +2596,16 @@@ done
                 list_del_init(&cpuctx->rotation_list);
   }
   
+ +#ifdef CONFIG_NO_HZ_FULL
+ +/* The tick may stop only while this CPU's rotation_list is empty. */
+ +bool perf_event_can_stop_tick(void)
+ +{
+ +      bool idle = list_empty(&__get_cpu_var(rotation_list));
+ +
+ +      return idle;
+ +}
+ +#endif
+ +
   void perf_event_task_tick(void)
   {
         struct list_head *head = &__get_cpu_var(rotation_list);
@@@ -4193,6 -4214,12 +4229,12 @@@ void perf_output_sample(struct perf_out
                 perf_output_sample_ustack(handle,
                                           data->stack_user_size,
                                           data->regs_user.regs);
+ 
+       if (sample_type & PERF_SAMPLE_WEIGHT)
+               perf_output_put(handle, data->weight);
+ 
+       if (sample_type & PERF_SAMPLE_DATA_SRC)
+               perf_output_put(handle, data->data_src.val);
   }
   
   void perf_prepare_sample(struct perf_event_header *header,
@@@ -4449,12 -4476,15 +4491,15 @@@ static void perf_event_task_event(struc
                         if (ctxn < 0)
                                 goto next;
                         ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+                       if (ctx)
+                               perf_event_task_ctx(ctx, task_event);
                 }
-               if (ctx)
-                       perf_event_task_ctx(ctx, task_event);
   next:
                 put_cpu_ptr(pmu->pmu_cpu_context);
         }
+       if (task_event->task_ctx)
+               perf_event_task_ctx(task_event->task_ctx, task_event);
+ 
         rcu_read_unlock();
   }
   
@@@ -4608,6 -4638,7 +4653,7 @@@ void perf_event_comm(struct task_struc
         struct perf_event_context *ctx;
         int ctxn;
   
+       rcu_read_lock();
         for_each_task_context_nr(ctxn) {
                 ctx = task->perf_event_ctxp[ctxn];
                 if (!ctx)
@@@ -4615,6 -4646,7 +4661,7 @@@
   
                 perf_event_enable_on_exec(ctx);
         }
+       rcu_read_unlock();
   
         if (!atomic_read(&nr_comm_events))
                 return;
@@@ -4749,7 -4781,8 +4796,8 @@@ static void perf_event_mmap_event(struc
         } else {
                 if (arch_vma_name(mmap_event->vma)) {
                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-                                      sizeof(tmp));
+                                      sizeof(tmp) - 1);
+                       tmp[sizeof(tmp) - 1] = '\0';
                         goto got_name;
                 }
   
@@@ -4776,6 -4809,9 +4824,9 @@@ got_name
         mmap_event->file_name = name;
         mmap_event->file_size = size;
   
+       if (!(vma->vm_flags & VM_EXEC))
+               mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+ 
         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
   
         rcu_read_lock();
@@@ -5342,7 -5378,7 +5393,7 @@@ static void sw_perf_event_destroy(struc
   
   static int perf_swevent_init(struct perf_event *event)
   {
-       int event_id = event->attr.config;
+       u64 event_id = event->attr.config;
   
         if (event->attr.type != PERF_TYPE_SOFTWARE)
                 return -ENOENT;
@@@ -5662,6 -5698,7 +5713,7 @@@ static void perf_swevent_init_hrtimer(s
                 event->attr.sample_period = NSEC_PER_SEC / freq;
                 hwc->sample_period = event->attr.sample_period;
                 local64_set(&hwc->period_left, hwc->sample_period);
+               hwc->last_period = hwc->sample_period;
                 event->attr.freq = 0;
         }
   }
@@@ -5997,6 -6034,7 +6049,7 @@@ skip_type
         if (pmu->pmu_cpu_context)
                 goto got_cpu_context;
   
+       ret = -ENOMEM;
         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
         if (!pmu->pmu_cpu_context)
                 goto free_dev;
@@@ -7524,12 -7562,5 +7577,5 @@@ struct cgroup_subsys perf_subsys = 
         .css_free       = perf_cgroup_css_free,
         .exit           = perf_cgroup_exit,
         .attach         = perf_cgroup_attach,
- 
-       /*
-        * perf_event cgroup doesn't handle nesting correctly.
-        * ctx->nr_cgroups adjustments should be propagated through the
-        * cgroup hierarchy.  Fix it and remove the following.
-        */
-       .broken_hierarchy = true,
   };
   #endif /* CONFIG_CGROUP_PERF */
diff --combined kernel/hrtimer.c

index ec60482d8b03d8b8352d92b50d2cb4f6c30b29df,14be27feda491da1c3dc9990a5ae80ce649570aa..abfd89d687ac9d119dffed6a070c113e8c0ab5c7
--- 1/kernel/hrtimer.c
--- 2/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@@ -63,6 -63,7 +63,7 @@@
   DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
   {
   
+       .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
         .clock_base =
         {
                 {
@@@ -160,7 -161,7 +161,7 @@@ struct hrtimer_clock_base *lock_hrtimer
    */
   static int hrtimer_get_target(int this_cpu, int pinned)
   {
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
         if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
                 return get_nohz_timer_target();
   #endif
@@@ -1106,7 -1107,7 +1107,7 @@@ ktime_t hrtimer_get_remaining(const str
   }
   EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   /**
    * hrtimer_get_next_event - get the time until next expiry event
    *
@@@ -1642,8 -1643,6 +1643,6 @@@ static void __cpuinit init_hrtimers_cpu
         struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
         int i;
   
-       raw_spin_lock_init(&cpu_base->lock);
- 
         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                 cpu_base->clock_base[i].cpu_base = cpu_base;
                 timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --combined kernel/rcutree.c

index 1d4ceff793a4dd9bc4da097630cba4d2c3581d99,d8534308fd052f9a9446929f170a4ceb9d6e30b1..16ea67925015f19e7f693ed7a4557d898942beea
--- 1/kernel/rcutree.c
--- 2/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@@ -64,7 -64,7 +64,7 @@@
   static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
   static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
   
- #define RCU_STATE_INITIALIZER(sname, cr) { \
+ #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
         .level = { &sname##_state.node[0] }, \
         .call = cr, \
         .fqs_state = RCU_GP_IDLE, \
@@@ -76,13 -76,14 +76,14 @@@
         .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
         .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
         .name = #sname, \
+       .abbr = sabbr, \
   }
   
   struct rcu_state rcu_sched_state =
-       RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
+       RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
   DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
   
- struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
+ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
   DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
   
   static struct rcu_state *rcu_state;
@@@ -223,6 -224,8 +224,8 @@@ static ulong jiffies_till_next_fqs = RC
   module_param(jiffies_till_first_fqs, ulong, 0644);
   module_param(jiffies_till_next_fqs, ulong, 0644);
   
+ static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+                                 struct rcu_data *rdp);
   static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
   static void force_quiescent_state(struct rcu_state *rsp);
   static int rcu_pending(int cpu);
@@@ -310,6 -313,8 +313,8 @@@ cpu_needs_another_gp(struct rcu_state *
   
         if (rcu_gp_in_progress(rsp))
                 return 0;  /* No, a grace period is already in progress. */
+       if (rcu_nocb_needs_gp(rsp))
+               return 1;  /* Yes, a no-CBs CPU needs one. */
         if (!rdp->nxttail[RCU_NEXT_TAIL])
                 return 0;  /* No, this is a no-CBs (or offline) CPU. */
         if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@@ -794,16 -799,6 +799,16 @@@ static int rcu_implicit_dynticks_qs(str
                 rdp->offline_fqs++;
                 return 1;
         }
+ +
+ +      /*
+ +       * There is a possibility that a CPU in adaptive-ticks state
+ +       * might run in the kernel with the scheduling-clock tick disabled
+ +       * for an extended time period.  Invoke rcu_kick_nohz_cpu() to
+ +       * force the CPU to restart the scheduling-clock tick if this
+ +       * CPU is in this state.
+ +       */
+ +      rcu_kick_nohz_cpu(rdp->cpu);
+ +
         return 0;
   }
   
@@@ -1045,10 -1040,11 +1050,11 @@@ static void init_callback_list(struct r
   {
         int i;
   
+       if (init_nocb_callback_list(rdp))
+               return;
         rdp->nxtlist = NULL;
         for (i = 0; i < RCU_NEXT_SIZE; i++)
                 rdp->nxttail[i] = &rdp->nxtlist;
-       init_nocb_callback_list(rdp);
   }
   
   /*
@@@ -1080,6 -1076,120 +1086,120 @@@ static unsigned long rcu_cbs_completed(
         return rnp->completed + 2;
   }
   
+ /*
+  * Trace-event helper for rcu_start_future_gp() and rcu_nocb_wait_gp():
+  * logs rnp's ->gpnum/->completed/->level/->grplo/->grphi, c, and tag s.
+  */
+ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+                               unsigned long c, char *s)
+ {
+       trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+                                     rnp->completed, c, rnp->level,
+                                     rnp->grplo, rnp->grphi, s);
+ }
+ 
+ /*
+  * Start some future grace period, as needed to handle newly arrived
+  * callbacks.  The required future grace periods are recorded in each
+  * rcu_node structure's ->need_future_gp field.  Returns the number of
+  * the grace period that the newly arrived callbacks must wait for.
+  * The caller must hold the specified rcu_node structure's ->lock.
+  */
+ static unsigned long __maybe_unused
+ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+ {
+       unsigned long c;
+       int i;
+       struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ 
+       /*
+        * Pick up grace-period number for new callbacks.  If this
+        * grace period is already marked as needed, return to the caller.
+        */
+       c = rcu_cbs_completed(rdp->rsp, rnp);
+       trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
+       if (rnp->need_future_gp[c & 0x1]) {
+               trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
+               return c;
+       }
+ 
+       /*
+        * If either this rcu_node structure or the root rcu_node structure
+        * believes that a grace period is in progress, then we must wait
+        * for the one following, which is in "c".  Our request will be
+        * noticed at the end of the current grace period, so there is no
+        * need to start one.  The root is read before its ->lock is taken.
+        */
+       if (rnp->gpnum != rnp->completed ||
+           ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+               rnp->need_future_gp[c & 0x1]++;
+               trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
+               return c;
+       }
+ 
+       /*
+        * There might be no grace period in progress.  If we don't already
+        * hold it, acquire the root rcu_node structure's lock in order to
+        * start one (if needed).
+        */
+       if (rnp != rnp_root)
+               raw_spin_lock(&rnp_root->lock);
+ 
+       /*
+        * Get a new grace-period number.  If there really is no grace
+        * period in progress, it will be smaller than the one we obtained
+        * earlier.  Adjust callbacks as needed.  Note that even no-CBs
+        * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+        */
+       c = rcu_cbs_completed(rdp->rsp, rnp_root);
+       for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+               if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+                       rdp->nxtcompleted[i] = c;
+ 
+       /*
+        * If the need for the required grace period is already
+        * recorded, trace and leave.
+        */
+       if (rnp_root->need_future_gp[c & 0x1]) {
+               trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
+               goto unlock_out;
+       }
+ 
+       /* Record the need for the future grace period. */
+       rnp_root->need_future_gp[c & 0x1]++;
+ 
+       /* If a grace period is not already in progress, start one. */
+       if (rnp_root->gpnum != rnp_root->completed) {
+               trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
+       } else {
+               trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
+               rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+       }
+ unlock_out:
+       if (rnp != rnp_root)
+               raw_spin_unlock(&rnp_root->lock);
+       return c;
+ }
+ 
+ /*
+  * Clean up any old requests for the just-ended grace period.  Also return
+  * whether any additional grace periods have been requested.  Also invoke
+  * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
+  * waiting for this grace period to complete.  "c" is kept full-width.
+  */
+ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+ {
+       unsigned long c = rnp->completed;
+       int needmore;
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ 
+       rcu_nocb_gp_cleanup(rsp, rnp);
+       rnp->need_future_gp[c & 0x1] = 0;
+       needmore = rnp->need_future_gp[(c + 1) & 0x1];
+       trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
+       return needmore;
+ }
+ 
   /*
    * If there is room, assign a ->completed number to any callbacks on
    * this CPU that have not already been assigned.  Also accelerate any
@@@ -1139,6 -1249,8 +1259,8 @@@ static void rcu_accelerate_cbs(struct r
                 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
                 rdp->nxtcompleted[i] = c;
         }
+       /* Record any needed additional grace periods. */
+       rcu_start_future_gp(rnp, rdp);
   
         /* Trace depending on how much we were able to accelerate. */
         if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@@ -1318,9 -1430,9 +1440,9 @@@ static int rcu_gp_init(struct rcu_stat
                 rdp = this_cpu_ptr(rsp->rda);
                 rcu_preempt_check_blocked_tasks(rnp);
                 rnp->qsmask = rnp->qsmaskinit;
-               rnp->gpnum = rsp->gpnum;
+               ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
                 WARN_ON_ONCE(rnp->completed != rsp->completed);
-               rnp->completed = rsp->completed;
+               ACCESS_ONCE(rnp->completed) = rsp->completed;
                 if (rnp == rdp->mynode)
                         rcu_start_gp_per_cpu(rsp, rnp, rdp);
                 rcu_preempt_boost_start_gp(rnp);
@@@ -1329,7 -1441,8 +1451,8 @@@
                                             rnp->grphi, rnp->qsmask);
                 raw_spin_unlock_irq(&rnp->lock);
   #ifdef CONFIG_PROVE_RCU_DELAY
-               if ((random32() % (rcu_num_nodes * 8)) == 0)
+               if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
+                   system_state == SYSTEM_RUNNING)
                         schedule_timeout_uninterruptible(2);
   #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
                 cond_resched();
@@@ -1371,6 -1484,7 +1494,7 @@@ int rcu_gp_fqs(struct rcu_state *rsp, i
   static void rcu_gp_cleanup(struct rcu_state *rsp)
   {
         unsigned long gp_duration;
+       int nocb = 0;
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root(rsp);
   
@@@ -1400,17 -1514,23 +1524,23 @@@
          */
         rcu_for_each_node_breadth_first(rsp, rnp) {
                 raw_spin_lock_irq(&rnp->lock);
-               rnp->completed = rsp->gpnum;
+               ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+               rdp = this_cpu_ptr(rsp->rda);
+               if (rnp == rdp->mynode)
+                       __rcu_process_gp_end(rsp, rnp, rdp);
+               nocb += rcu_future_gp_cleanup(rsp, rnp);
                 raw_spin_unlock_irq(&rnp->lock);
                 cond_resched();
         }
         rnp = rcu_get_root(rsp);
         raw_spin_lock_irq(&rnp->lock);
+       rcu_nocb_gp_set(rnp, nocb);
   
         rsp->completed = rsp->gpnum; /* Declare grace period done. */
         trace_rcu_grace_period(rsp->name, rsp->completed, "end");
         rsp->fqs_state = RCU_GP_IDLE;
         rdp = this_cpu_ptr(rsp->rda);
+       rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */
         if (cpu_needs_another_gp(rsp, rdp))
                 rsp->gp_flags = 1;
         raw_spin_unlock_irq(&rnp->lock);
@@@ -1486,57 -1606,62 +1616,62 @@@ static int __noreturn rcu_gp_kthread(vo
   /*
    * Start a new RCU grace period if warranted, re-initializing the hierarchy
    * in preparation for detecting the next grace period.  The caller must hold
-  * the root node's ->lock, which is released before return.  Hard irqs must
-  * be disabled.
+  * the root node's ->lock and hard irqs must be disabled.
    *
    * Note that it is legal for a dying CPU (which is marked as offline) to
    * invoke this function.  This can happen when the dying CPU reports its
    * quiescent state.
    */
   static void
- rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
-       __releases(rcu_get_root(rsp)->lock)
+ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+                     struct rcu_data *rdp)
   {
-       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-       struct rcu_node *rnp = rcu_get_root(rsp);
- 
-       if (!rsp->gp_kthread ||
-           !cpu_needs_another_gp(rsp, rdp)) {
+       if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
                 /*
                  * Either we have not yet spawned the grace-period
                  * task, this CPU does not need another grace period,
                  * or a grace period is already in progress.
                  * Either way, don't start a new grace period.
                  */
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
         }
- 
-       /*
-        * Because there is no grace period in progress right now,
-        * any callbacks we have up to this point will be satisfied
-        * by the next grace period.  So this is a good place to
-        * assign a grace period number to recently posted callbacks.
-        */
-       rcu_accelerate_cbs(rsp, rnp, rdp);
- 
         rsp->gp_flags = RCU_GP_FLAG_INIT;
-       raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
- 
-       /* Ensure that CPU is aware of completion of last grace period. */
-       rcu_process_gp_end(rsp, rdp);
-       local_irq_restore(flags);
   
         /* Wake up rcu_gp_kthread() to start the grace period. */
         wake_up(&rsp->gp_wq);
   }
   
+ /*
+  * Like rcu_start_gp_advanced(), but first advance the calling CPU's
+  * callbacks.  rcu_start_gp_advanced() cannot do this itself because it
+  * is invoked indirectly from rcu_advance_cbs(), which would result in
+  * endless recursion -- or would do so if it wasn't for the self-deadlock
+  * that is encountered beforehand.  Call with the root ->lock held.
+  */
+ static void
+ rcu_start_gp(struct rcu_state *rsp)
+ {
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+       struct rcu_node *rnp = rcu_get_root(rsp);
+ 
+       /*
+        * If there is no grace period in progress right now, any
+        * callbacks we have up to this point will be satisfied by the
+        * next grace period.  Also, advancing the callbacks reduces the
+        * probability of false positives from cpu_needs_another_gp()
+        * resulting in pointless grace periods.  So, advance callbacks
+        * then start the grace period!
+        */
+       rcu_advance_cbs(rsp, rnp, rdp);
+       rcu_start_gp_advanced(rsp, rnp, rdp);
+ }
+ 
   /*
    * Report a full set of quiescent states to the specified rcu_state
    * data structure.  This involves cleaning up after the prior grace
    * period and letting rcu_start_gp() start up the next grace period
-  * if one is needed.  Note that the caller must hold rnp->lock, as
-  * required by rcu_start_gp(), which will release it.
+  * if one is needed.  Note that the caller must hold rnp->lock, which
+  * is released before return.
    */
   static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
         __releases(rcu_get_root(rsp)->lock)
@@@ -1695,7 -1820,7 +1830,7 @@@ rcu_send_cbs_to_orphanage(int cpu, stru
                           struct rcu_node *rnp, struct rcu_data *rdp)
   {
         /* No-CBs CPUs do not have orphanable callbacks. */
- -      if (is_nocb_cpu(rdp->cpu))
+ +      if (rcu_is_nocb_cpu(rdp->cpu))
                 return;
   
         /*
@@@ -2134,7 -2259,8 +2269,8 @@@ __rcu_process_callbacks(struct rcu_stat
         local_irq_save(flags);
         if (cpu_needs_another_gp(rsp, rdp)) {
                 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
-               rcu_start_gp(rsp, flags);  /* releases above lock */
+               rcu_start_gp(rsp);
+               raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
         } else {
                 local_irq_restore(flags);
         }
@@@ -2179,7 -2305,8 +2315,8 @@@ static void invoke_rcu_callbacks(struc
   
   static void invoke_rcu_core(void)
   {
-       raise_softirq(RCU_SOFTIRQ);
+       if (cpu_online(smp_processor_id()))
+               raise_softirq(RCU_SOFTIRQ);
   }
   
   /*
@@@ -2214,11 -2341,11 +2351,11 @@@ static void __call_rcu_core(struct rcu_
   
                 /* Start a new grace period if one not already started. */
                 if (!rcu_gp_in_progress(rsp)) {
-                       unsigned long nestflag;
                         struct rcu_node *rnp_root = rcu_get_root(rsp);
   
-                       raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-                       rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
+                       raw_spin_lock(&rnp_root->lock);
+                       rcu_start_gp(rsp);
+                       raw_spin_unlock(&rnp_root->lock);
                 } else {
                         /* Give the grace period a kick. */
                         rdp->blimit = LONG_MAX;
@@@ -2638,19 -2765,27 +2775,27 @@@ static int rcu_pending(int cpu
   }
   
   /*
-  * Check to see if any future RCU-related work will need to be done
-  * by the current CPU, even if none need be done immediately, returning
-  * 1 if so.
+  * Return true if the specified CPU has any callback.  If all_lazy is
+  * non-NULL, store an indication of whether all callbacks are lazy.
+  * (If there are no callbacks, all of them are deemed to be lazy.)
    */
- static int rcu_cpu_has_callbacks(int cpu)
+ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
   {
+       bool al = true;
+       bool hc = false;
+       struct rcu_data *rdp;
         struct rcu_state *rsp;
   
-       /* RCU callbacks either ready or pending? */
-       for_each_rcu_flavor(rsp)
-               if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
-                       return 1;
-       return 0;
+       for_each_rcu_flavor(rsp) {
+               rdp = per_cpu_ptr(rsp->rda, cpu);
+               if (rdp->qlen != rdp->qlen_lazy)
+                       al = false;
+               if (rdp->nxtlist)
+                       hc = true;
+       }
+       if (all_lazy)
+               *all_lazy = al;
+       return hc;
   }
   
   /*
@@@ -2757,10 -2892,10 +2902,10 @@@ static void _rcu_barrier(struct rcu_sta
          * corresponding CPU's preceding callbacks have been invoked.
          */
         for_each_possible_cpu(cpu) {
- -              if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
+ +              if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
                         continue;
                 rdp = per_cpu_ptr(rsp->rda, cpu);
- -              if (is_nocb_cpu(cpu)) {
+ +              if (rcu_is_nocb_cpu(cpu)) {
                         _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
                                            rsp->n_barrier_done);
                         atomic_inc(&rsp->barrier_cpu_count);
@@@ -2869,7 -3004,6 +3014,6 @@@ rcu_init_percpu_data(int cpu, struct rc
         rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
         atomic_set(&rdp->dynticks->dynticks,
                    (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
-       rcu_prepare_for_idle_init(cpu);
         raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
   
         /* Add CPU to rcu_node bitmasks. */
@@@ -2919,7 -3053,6 +3063,6 @@@ static int __cpuinit rcu_cpu_notify(str
         struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
         struct rcu_node *rnp = rdp->mynode;
         struct rcu_state *rsp;
-       int ret = NOTIFY_OK;
   
         trace_rcu_utilization("Start CPU hotplug");
         switch (action) {
@@@ -2933,21 -3066,12 +3076,12 @@@
                 rcu_boost_kthread_setaffinity(rnp, -1);
                 break;
         case CPU_DOWN_PREPARE:
-               if (nocb_cpu_expendable(cpu))
-                       rcu_boost_kthread_setaffinity(rnp, cpu);
-               else
-                       ret = NOTIFY_BAD;
+               rcu_boost_kthread_setaffinity(rnp, cpu);
                 break;
         case CPU_DYING:
         case CPU_DYING_FROZEN:
-               /*
-                * The whole machine is "stopped" except this CPU, so we can
-                * touch any data without introducing corruption. We send the
-                * dying CPU's callbacks to an arbitrarily chosen online CPU.
-                */
                 for_each_rcu_flavor(rsp)
                         rcu_cleanup_dying_cpu(rsp);
-               rcu_cleanup_after_idle(cpu);
                 break;
         case CPU_DEAD:
         case CPU_DEAD_FROZEN:
@@@ -2960,7 -3084,7 +3094,7 @@@
                 break;
         }
         trace_rcu_utilization("End CPU hotplug");
-       return ret;
+       return NOTIFY_OK;
   }
   
   /*
@@@ -3095,6 -3219,7 +3229,7 @@@ static void __init rcu_init_one(struct 
                         }
                         rnp->level = i;
                         INIT_LIST_HEAD(&rnp->blkd_tasks);
+                       rcu_init_one_nocb(rnp);
                 }
         }
   
@@@ -3180,8 -3305,7 +3315,7 @@@ void __init rcu_init(void
         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
         __rcu_init_preempt();
-       rcu_init_nocb();
-        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
   
         /*
          * We don't need protection against CPU-hotplug here because
diff --combined kernel/rcutree.h

index 38acc49da2c6cf7e0303afdfe731a11460c560a1,14ee40795d6fe09257818c11a1a456a96214cf4c..da77a8f57ff95f80c7546684eba2ac293fdde08c
--- 1/kernel/rcutree.h
--- 2/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@@ -88,18 -88,13 +88,13 @@@ struct rcu_dynticks 
         int dynticks_nmi_nesting;   /* Track NMI nesting level. */
         atomic_t dynticks;          /* Even value for idle, else odd. */
   #ifdef CONFIG_RCU_FAST_NO_HZ
-       int dyntick_drain;          /* Prepare-for-idle state variable. */
-       unsigned long dyntick_holdoff;
-                                   /* No retries for the jiffy of failure. */
-       struct timer_list idle_gp_timer;
-                                   /* Wake up CPU sleeping with callbacks. */
-       unsigned long idle_gp_timer_expires;
-                                   /* When to wake up CPU (for repost). */
-       bool idle_first_pass;       /* First pass of attempt to go idle? */
+       bool all_lazy;              /* Are all CPU's CBs lazy? */
         unsigned long nonlazy_posted;
                                     /* # times non-lazy CBs posted to CPU. */
         unsigned long nonlazy_posted_snap;
                                     /* idle-period nonlazy_posted snapshot. */
+       unsigned long last_accelerate;
+                                   /* Last jiffy CBs were accelerated. */
         int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
   #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
   };
@@@ -134,9 -129,6 +129,6 @@@ struct rcu_node 
                                 /*  elements that need to drain to allow the */
                                 /*  current expedited grace period to */
                                 /*  complete (only for TREE_PREEMPT_RCU). */
-       atomic_t wakemask;      /* CPUs whose kthread needs to be awakened. */
-                               /*  Since this has meaning only for leaf */
-                               /*  rcu_node structures, 32 bits suffices. */
         unsigned long qsmaskinit;
                                 /* Per-GP initial value for qsmask & expmask. */
         unsigned long grpmask;  /* Mask to apply to parent qsmask. */
@@@ -196,6 -188,12 +188,12 @@@
                                 /* Refused to boost: not sure why, though. */
                                 /*  This can happen due to race conditions. */
   #endif /* #ifdef CONFIG_RCU_BOOST */
+ #ifdef CONFIG_RCU_NOCB_CPU
+       wait_queue_head_t nocb_gp_wq[2];
+                               /* Place for rcu_nocb_kthread() to wait GP. */
+ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+       int need_future_gp[2];
+                               /* Counts of upcoming no-CB GP requests. */
         raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
   } ____cacheline_internodealigned_in_smp;
   
@@@ -328,6 -326,11 +326,11 @@@ struct rcu_data 
         struct task_struct *nocb_kthread;
   #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
   
+       /* 8) RCU CPU stall data. */
+ #ifdef CONFIG_RCU_CPU_STALL_INFO
+       unsigned int softirq_snap;      /* Snapshot of softirq activity. */
+ #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ 
         int cpu;
         struct rcu_state *rsp;
   };
@@@ -375,12 -378,6 +378,6 @@@ struct rcu_state 
         struct rcu_data __percpu *rda;          /* pointer of percu rcu_data. */
         void (*call)(struct rcu_head *head,     /* call_rcu() flavor. */
                      void (*func)(struct rcu_head *head));
- #ifdef CONFIG_RCU_NOCB_CPU
-       void (*call_remote)(struct rcu_head *head,
-                    void (*func)(struct rcu_head *head));
-                                               /* call_rcu() flavor, but for */
-                                               /*  placing on remote CPU. */
- #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
   
         /* The following fields are guarded by the root rcu_node's lock. */
   
@@@ -443,6 -440,7 +440,7 @@@
         unsigned long gp_max;                   /* Maximum GP duration in */
                                                 /*  jiffies. */
         char *name;                             /* Name of structure. */
+       char abbr;                              /* Abbreviated name. */
         struct list_head flavors;               /* List of RCU flavors. */
   };
   
@@@ -520,7 -518,6 +518,6 @@@ static int __cpuinit rcu_spawn_one_boos
                                                  struct rcu_node *rnp);
   #endif /* #ifdef CONFIG_RCU_BOOST */
   static void __cpuinit rcu_prepare_kthreads(int cpu);
- static void rcu_prepare_for_idle_init(int cpu);
   static void rcu_cleanup_after_idle(int cpu);
   static void rcu_prepare_for_idle(int cpu);
   static void rcu_idle_count_callbacks_posted(void);
@@@ -529,16 -526,18 +526,18 @@@ static void print_cpu_stall_info(struc
   static void print_cpu_stall_info_end(void);
   static void zero_cpu_stall_ticks(struct rcu_data *rdp);
   static void increment_cpu_stall_ticks(void);
- -static bool is_nocb_cpu(int cpu);
+ static int rcu_nocb_needs_gp(struct rcu_state *rsp);
+ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
+ static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+ static void rcu_init_one_nocb(struct rcu_node *rnp);
   static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
                             bool lazy);
   static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
                                       struct rcu_data *rdp);
- static bool nocb_cpu_expendable(int cpu);
   static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
   static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
- static void init_nocb_callback_list(struct rcu_data *rdp);
- static void __init rcu_init_nocb(void);
+ +static void rcu_kick_nohz_cpu(int cpu);
+ static bool init_nocb_callback_list(struct rcu_data *rdp);
   
   #endif /* #ifndef RCU_TREE_NONCORE */
   
diff --combined kernel/rcutree_plugin.h

index 0cd91cc18db410c866e1405c1a207d803e16b6a1,d084ae3f281c2cad8a44075be8d3ebacadd7929c..71bd7337d0ccf2e49ff15ae3cdd687a2519352f8
--- 1/kernel/rcutree_plugin.h
--- 2/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@@ -28,7 -28,6 +28,7 @@@
   #include <linux/gfp.h>
   #include <linux/oom.h>
   #include <linux/smpboot.h>
+ +#include <linux/tick.h>
   
   #define RCU_KTHREAD_PRIO 1
   
@@@ -86,11 -85,21 +86,21 @@@ static void __init rcu_bootup_announce_
         if (nr_cpu_ids != NR_CPUS)
                 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
   #ifdef CONFIG_RCU_NOCB_CPU
+ #ifndef CONFIG_RCU_NOCB_CPU_NONE
+       if (!have_rcu_nocb_mask) {
+               alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+               have_rcu_nocb_mask = true;
+       }
+ #ifdef CONFIG_RCU_NOCB_CPU_ZERO
+       pr_info("\tExperimental no-CBs CPU 0\n");
+       cpumask_set_cpu(0, rcu_nocb_mask);
+ #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+ #ifdef CONFIG_RCU_NOCB_CPU_ALL
+       pr_info("\tExperimental no-CBs for all CPUs\n");
+       cpumask_setall(rcu_nocb_mask);
+ #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
         if (have_rcu_nocb_mask) {
-               if (cpumask_test_cpu(0, rcu_nocb_mask)) {
-                       cpumask_clear_cpu(0, rcu_nocb_mask);
-                       pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
-               }
                 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
                 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
                 if (rcu_nocb_poll)
@@@ -102,7 -111,7 +112,7 @@@
   #ifdef CONFIG_TREE_PREEMPT_RCU
   
   struct rcu_state rcu_preempt_state =
-       RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
+       RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
   DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
   static struct rcu_state *rcu_state = &rcu_preempt_state;
   
@@@ -1534,14 -1543,7 +1544,7 @@@ static void __cpuinit rcu_prepare_kthre
   int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
   {
         *delta_jiffies = ULONG_MAX;
-       return rcu_cpu_has_callbacks(cpu);
- }
- 
- /*
-  * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
-  */
- static void rcu_prepare_for_idle_init(int cpu)
- {
+       return rcu_cpu_has_callbacks(cpu, NULL);
   }
   
   /*
@@@ -1578,16 -1580,6 +1581,6 @@@ static void rcu_idle_count_callbacks_po
    *
    * The following three proprocessor symbols control this state machine:
    *
-  * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
-  *    to satisfy RCU.  Beyond this point, it is better to incur a periodic
-  *    scheduling-clock interrupt than to loop through the state machine
-  *    at full power.
-  * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
-  *    optional if RCU does not need anything immediately from this
-  *    CPU, even if this CPU still has RCU callbacks queued.  The first
-  *    times through the state machine are mandatory: we need to give
-  *    the state machine a chance to communicate a quiescent state
-  *    to the RCU core.
    * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
    *    to sleep in dyntick-idle mode with RCU callbacks pending.  This
    *    is sized to be roughly one RCU grace period.  Those energy-efficiency
@@@ -1603,186 -1595,108 +1596,108 @@@
    * adjustment, they can be converted into kernel config parameters, though
    * making the state machine smarter might be a better option.
    */
- #define RCU_IDLE_FLUSHES 5            /* Number of dyntick-idle tries. */
- #define RCU_IDLE_OPT_FLUSHES 3                /* Optional dyntick-idle tries. */
   #define RCU_IDLE_GP_DELAY 4           /* Roughly one grace period. */
   #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)       /* Roughly six seconds. */
   
- extern int tick_nohz_enabled;
- 
- /*
-  * Does the specified flavor of RCU have non-lazy callbacks pending on
-  * the specified CPU?  Both RCU flavor and CPU are specified by the
-  * rcu_data structure.
-  */
- static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
- {
-       return rdp->qlen != rdp->qlen_lazy;
- }
+ static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
+ module_param(rcu_idle_gp_delay, int, 0644);
+ static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
+ module_param(rcu_idle_lazy_gp_delay, int, 0644);
   
- #ifdef CONFIG_TREE_PREEMPT_RCU
+ extern int tick_nohz_enabled;
   
   /*
-  * Are there non-lazy RCU-preempt callbacks?  (There cannot be if there
-  * is no RCU-preempt in the kernel.)
+  * Try to advance callbacks for all flavors of RCU on the current CPU.
+  * Afterwards, if there are any callbacks ready for immediate invocation,
+  * return true.
    */
- static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
+ static bool rcu_try_advance_all_cbs(void)
   {
-       struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
- 
-       return __rcu_cpu_has_nonlazy_callbacks(rdp);
- }
- 
- #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+       bool cbs_ready = false;
+       struct rcu_data *rdp;
+       struct rcu_node *rnp;
+       struct rcu_state *rsp;
   
- static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
- {
-       return 0;
- }
+       for_each_rcu_flavor(rsp) {
+               rdp = this_cpu_ptr(rsp->rda);
+               rnp = rdp->mynode;
   
- #endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
+               /*
+                * Don't bother checking unless a grace period has
+                * completed since we last checked and there are
+                * callbacks not yet ready to invoke.
+                */
+               if (rdp->completed != rnp->completed &&
+                   rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+                       rcu_process_gp_end(rsp, rdp);
   
- /*
-  * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
-  */
- static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
- {
-       return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
-              __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
-              rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
+               if (cpu_has_callbacks_ready_to_invoke(rdp))
+                       cbs_ready = true;
+       }
+       return cbs_ready;
   }
   
   /*
-  * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
-  * callbacks on this CPU, (2) this CPU has not yet attempted to enter
-  * dyntick-idle mode, or (3) this CPU is in the process of attempting to
-  * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
-  * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
-  * it is better to incur scheduling-clock interrupts than to spin
-  * continuously for the same time duration!
+  * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
+  * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
+  * caller to set the timeout based on whether or not there are non-lazy
+  * callbacks.
    *
-  * The delta_jiffies argument is used to store the time when RCU is
-  * going to need the CPU again if it still has callbacks.  The reason
-  * for this is that rcu_prepare_for_idle() might need to post a timer,
-  * but if so, it will do so after tick_nohz_stop_sched_tick() has set
-  * the wakeup time for this CPU.  This means that RCU's timer can be
-  * delayed until the wakeup time, which defeats the purpose of posting
-  * a timer.
+  * The caller must have disabled interrupts.
    */
- int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+ int rcu_needs_cpu(int cpu, unsigned long *dj)
   {
         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
   
-       /* Flag a new idle sojourn to the idle-entry state machine. */
-       rdtp->idle_first_pass = 1;
+       /* Snapshot to detect later posting of non-lazy callback. */
+       rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+ 
         /* If no callbacks, RCU doesn't need the CPU. */
-       if (!rcu_cpu_has_callbacks(cpu)) {
-               *delta_jiffies = ULONG_MAX;
+       if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+               *dj = ULONG_MAX;
                 return 0;
         }
-       if (rdtp->dyntick_holdoff == jiffies) {
-               /* RCU recently tried and failed, so don't try again. */
-               *delta_jiffies = 1;
+ 
+       /* Attempt to advance callbacks. */
+       if (rcu_try_advance_all_cbs()) {
+               /* Some ready to invoke, so initiate later invocation. */
+               invoke_rcu_core();
                 return 1;
         }
-       /* Set up for the possibility that RCU will post a timer. */
-       if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-               *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
-                                         RCU_IDLE_GP_DELAY) - jiffies;
+       rdtp->last_accelerate = jiffies;
+ 
+       /* Non-lazy CBs get the short delay, all-lazy the long one; round. */
+       if (!rdtp->all_lazy) {
+               *dj = round_up(rcu_idle_gp_delay + jiffies,
+                              rcu_idle_gp_delay) - jiffies;
         } else {
-               *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
-               *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
+               *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
         }
         return 0;
   }
   
   /*
-  * Handler for smp_call_function_single().  The only point of this
-  * handler is to wake the CPU up, so the handler does only tracing.
-  */
- void rcu_idle_demigrate(void *unused)
- {
-       trace_rcu_prep_idle("Demigrate");
- }
- 
- /*
-  * Timer handler used to force CPU to start pushing its remaining RCU
-  * callbacks in the case where it entered dyntick-idle mode with callbacks
-  * pending.  The hander doesn't really need to do anything because the
-  * real work is done upon re-entry to idle, or by the next scheduling-clock
-  * interrupt should idle not be re-entered.
-  *
-  * One special case: the timer gets migrated without awakening the CPU
-  * on which the timer was scheduled on.  In this case, we must wake up
-  * that CPU.  We do so with smp_call_function_single().
-  */
- static void rcu_idle_gp_timer_func(unsigned long cpu_in)
- {
-       int cpu = (int)cpu_in;
- 
-       trace_rcu_prep_idle("Timer");
-       if (cpu != smp_processor_id())
-               smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
-       else
-               WARN_ON_ONCE(1); /* Getting here can hang the system... */
- }
- 
- /*
-  * Initialize the timer used to pull CPUs out of dyntick-idle mode.
-  */
- static void rcu_prepare_for_idle_init(int cpu)
- {
-       struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- 
-       rdtp->dyntick_holdoff = jiffies - 1;
-       setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
-       rdtp->idle_gp_timer_expires = jiffies - 1;
-       rdtp->idle_first_pass = 1;
- }
- 
- /*
-  * Clean up for exit from idle.  Because we are exiting from idle, there
-  * is no longer any point to ->idle_gp_timer, so cancel it.  This will
-  * do nothing if this timer is not active, so just cancel it unconditionally.
-  */
- static void rcu_cleanup_after_idle(int cpu)
- {
-       struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- 
-       del_timer(&rdtp->idle_gp_timer);
-       trace_rcu_prep_idle("Cleanup after idle");
-       rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
- }
- 
- /*
-  * Check to see if any RCU-related work can be done by the current CPU,
-  * and if so, schedule a softirq to get it done.  This function is part
-  * of the RCU implementation; it is -not- an exported member of the RCU API.
-  *
-  * The idea is for the current CPU to clear out all work required by the
-  * RCU core for the current grace period, so that this CPU can be permitted
-  * to enter dyntick-idle mode.  In some cases, it will need to be awakened
-  * at the end of the grace period by whatever CPU ends the grace period.
-  * This allows CPUs to go dyntick-idle more quickly, and to reduce the
-  * number of wakeups by a modest integer factor.
-  *
-  * Because it is not legal to invoke rcu_process_callbacks() with irqs
-  * disabled, we do one pass of force_quiescent_state(), then do a
-  * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
-  * later.  The ->dyntick_drain field controls the sequencing.
+  * Prepare a CPU for idle from an RCU perspective.  The first major task
+  * is to sense whether nohz mode has been enabled or disabled via sysfs.
+  * The second major task is to check to see if a non-lazy callback has
+  * arrived at a CPU that previously had only lazy callbacks.  The third
+  * major task is to accelerate (that is, assign grace-period numbers to)
+  * any recently arrived callbacks.
    *
    * The caller must have disabled interrupts.
    */
   static void rcu_prepare_for_idle(int cpu)
   {
-       struct timer_list *tp;
+       struct rcu_data *rdp;
         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+       struct rcu_node *rnp;
+       struct rcu_state *rsp;
         int tne;
   
         /* Handle nohz enablement switches conservatively. */
         tne = ACCESS_ONCE(tick_nohz_enabled);
         if (tne != rdtp->tick_nohz_enabled_snap) {
-               if (rcu_cpu_has_callbacks(cpu))
+               if (rcu_cpu_has_callbacks(cpu, NULL))
                         invoke_rcu_core(); /* force nohz to see update. */
                 rdtp->tick_nohz_enabled_snap = tne;
                 return;
@@@ -1790,125 -1704,56 +1705,56 @@@
         if (!tne)
                 return;
   
-       /* Adaptive-tick mode, where usermode execution is idle to RCU. */
-       if (!is_idle_task(current)) {
-               rdtp->dyntick_holdoff = jiffies - 1;
-               if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-                       trace_rcu_prep_idle("User dyntick with callbacks");
-                       rdtp->idle_gp_timer_expires =
-                               round_up(jiffies + RCU_IDLE_GP_DELAY,
-                                        RCU_IDLE_GP_DELAY);
-               } else if (rcu_cpu_has_callbacks(cpu)) {
-                       rdtp->idle_gp_timer_expires =
-                               round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
-                       trace_rcu_prep_idle("User dyntick with lazy callbacks");
-               } else {
-                       return;
-               }
-               tp = &rdtp->idle_gp_timer;
-               mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
+       /* If this is a no-CBs CPU, no callbacks, just return. */
+       if (rcu_is_nocb_cpu(cpu))
                 return;
-       }
   
         /*
-        * If this is an idle re-entry, for example, due to use of
-        * RCU_NONIDLE() or the new idle-loop tracing API within the idle
-        * loop, then don't take any state-machine actions, unless the
-        * momentary exit from idle queued additional non-lazy callbacks.
-        * Instead, repost the ->idle_gp_timer if this CPU has callbacks
-        * pending.
+        * If a non-lazy callback arrived at a CPU having only lazy
+        * callbacks, invoke RCU core for the side-effect of recalculating
+        * idle duration on re-entry to idle.
          */
-       if (!rdtp->idle_first_pass &&
-           (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
-               if (rcu_cpu_has_callbacks(cpu)) {
-                       tp = &rdtp->idle_gp_timer;
-                       mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
-               }
+       if (rdtp->all_lazy &&
+           rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+               invoke_rcu_core();
                 return;
         }
-       rdtp->idle_first_pass = 0;
-       rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
   
         /*
-        * If there are no callbacks on this CPU, enter dyntick-idle mode.
-        * Also reset state to avoid prejudicing later attempts.
+        * If we have not yet accelerated this jiffy, accelerate all
+        * callbacks on this CPU.
          */
-       if (!rcu_cpu_has_callbacks(cpu)) {
-               rdtp->dyntick_holdoff = jiffies - 1;
-               rdtp->dyntick_drain = 0;
-               trace_rcu_prep_idle("No callbacks");
+       if (rdtp->last_accelerate == jiffies)
                 return;
+       rdtp->last_accelerate = jiffies;
+       for_each_rcu_flavor(rsp) {
+               rdp = per_cpu_ptr(rsp->rda, cpu);
+               if (!*rdp->nxttail[RCU_DONE_TAIL])
+                       continue;
+               rnp = rdp->mynode;
+               raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+               rcu_accelerate_cbs(rsp, rnp, rdp);
+               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
         }
+ }
   
-       /*
-        * If in holdoff mode, just return.  We will presumably have
-        * refrained from disabling the scheduling-clock tick.
-        */
-       if (rdtp->dyntick_holdoff == jiffies) {
-               trace_rcu_prep_idle("In holdoff");
-               return;
-       }
+ /*
+  * Clean up for exit from idle.  Attempt to advance callbacks based on
+  * any grace periods that elapsed while the CPU was idle, and if any
+  * callbacks are now ready to invoke, initiate invocation.
+  */
+ static void rcu_cleanup_after_idle(int cpu)
+ {
+       struct rcu_data *rdp;
+       struct rcu_state *rsp;
   
-       /* Check and update the ->dyntick_drain sequencing. */
-       if (rdtp->dyntick_drain <= 0) {
-               /* First time through, initialize the counter. */
-               rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
-       } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
-                  !rcu_pending(cpu) &&
-                  !local_softirq_pending()) {
-               /* Can we go dyntick-idle despite still having callbacks? */
-               rdtp->dyntick_drain = 0;
-               rdtp->dyntick_holdoff = jiffies;
-               if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-                       trace_rcu_prep_idle("Dyntick with callbacks");
-                       rdtp->idle_gp_timer_expires =
-                               round_up(jiffies + RCU_IDLE_GP_DELAY,
-                                        RCU_IDLE_GP_DELAY);
-               } else {
-                       rdtp->idle_gp_timer_expires =
-                               round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
-                       trace_rcu_prep_idle("Dyntick with lazy callbacks");
-               }
-               tp = &rdtp->idle_gp_timer;
-               mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
-               rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
-               return; /* Nothing more to do immediately. */
-       } else if (--(rdtp->dyntick_drain) <= 0) {
-               /* We have hit the limit, so time to give up. */
-               rdtp->dyntick_holdoff = jiffies;
-               trace_rcu_prep_idle("Begin holdoff");
-               invoke_rcu_core();  /* Force the CPU out of dyntick-idle. */
+       if (rcu_is_nocb_cpu(cpu))
                 return;
-       }
- 
-       /*
-        * Do one step of pushing the remaining RCU callbacks through
-        * the RCU core state machine.
-        */
- #ifdef CONFIG_TREE_PREEMPT_RCU
-       if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
-               rcu_preempt_qs(cpu);
-               force_quiescent_state(&rcu_preempt_state);
-       }
- #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-       if (per_cpu(rcu_sched_data, cpu).nxtlist) {
-               rcu_sched_qs(cpu);
-               force_quiescent_state(&rcu_sched_state);
-       }
-       if (per_cpu(rcu_bh_data, cpu).nxtlist) {
-               rcu_bh_qs(cpu);
-               force_quiescent_state(&rcu_bh_state);
-       }
- 
-       /*
-        * If RCU callbacks are still pending, RCU still needs this CPU.
-        * So try forcing the callbacks through the grace period.
-        */
-       if (rcu_cpu_has_callbacks(cpu)) {
-               trace_rcu_prep_idle("More callbacks");
-               invoke_rcu_core();
-       } else {
-               trace_rcu_prep_idle("Callbacks drained");
+       rcu_try_advance_all_cbs();
+       for_each_rcu_flavor(rsp) {
+               rdp = per_cpu_ptr(rsp->rda, cpu);
+               if (cpu_has_callbacks_ready_to_invoke(rdp))
+                       invoke_rcu_core();
         }
   }
   
@@@ -2016,16 -1861,13 +1862,13 @@@ early_initcall(rcu_register_oom_notifie
   static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
   {
         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-       struct timer_list *tltp = &rdtp->idle_gp_timer;
-       char c;
+       unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
   
-       c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
-       if (timer_pending(tltp))
-               sprintf(cp, "drain=%d %c timer=%lu",
-                       rdtp->dyntick_drain, c, tltp->expires - jiffies);
-       else
-               sprintf(cp, "drain=%d %c timer not pending",
-                       rdtp->dyntick_drain, c);
+       sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+               rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
+               ulong2long(nlpd),
+               rdtp->all_lazy ? 'L' : '.',
+               rdtp->tick_nohz_enabled_snap ? '.' : 'D');
   }
   
   #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@@ -2071,10 -1913,11 +1914,11 @@@ static void print_cpu_stall_info(struc
                 ticks_value = rsp->gpnum - rdp->gpnum;
         }
         print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
-       printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
+       printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
                cpu, ticks_value, ticks_title,
                atomic_read(&rdtp->dynticks) & 0xfff,
                rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
+              rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
                fast_no_hz);
   }
   
@@@ -2088,6 -1931,7 +1932,7 @@@ static void print_cpu_stall_info_end(vo
   static void zero_cpu_stall_ticks(struct rcu_data *rdp)
   {
         rdp->ticks_this_gp = 0;
+       rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
   }
   
   /* Increment ->ticks_this_gp for all flavors of RCU. */
@@@ -2166,8 -2010,49 +2011,49 @@@ static int __init parse_rcu_nocb_poll(c
   }
   early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
   
+ /*
+  * Do any no-CBs CPUs need another grace period?
+  *
+  * Interrupts must be disabled.  If the caller does not hold the root
+  * rcu_node structure's ->lock, the results are advisory only.
+  */
+ static int rcu_nocb_needs_gp(struct rcu_state *rsp)
+ {
+       struct rcu_node *rnp = rcu_get_root(rsp);
+ 
+       return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
+ }
+ 
+ /*
+  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+  * grace period.
+  */
+ static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+ {
+       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+ }
+ 
+ /*
+  * Set the root rcu_node structure's ->need_future_gp field
+  * based on the sum of those of all rcu_node structures.  This does
+  * double-count the root rcu_node structure's requests, but this
+  * is necessary to handle the possibility of a rcu_nocb_kthread()
+  * having awakened during the time that the rcu_node structures
+  * were being updated for the end of the previous grace period.
+  */
+ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+ {
+       rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
+ }
+ 
+ static void rcu_init_one_nocb(struct rcu_node *rnp)
+ {
+       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
+       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+ }
+ 
   /* Is the specified CPU a no-CPUs CPU? */
- -static bool is_nocb_cpu(int cpu)
+ +bool rcu_is_nocb_cpu(int cpu)
   {
         if (have_rcu_nocb_mask)
                 return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@@ -2225,9 -2110,16 +2111,16 @@@ static bool __call_rcu_nocb(struct rcu_
                             bool lazy)
   {
   
- -      if (!is_nocb_cpu(rdp->cpu))
+ +      if (!rcu_is_nocb_cpu(rdp->cpu))
                 return 0;
         __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+       if (__is_kfree_rcu_offset((unsigned long)rhp->func))
+               trace_rcu_kfree_callback(rdp->rsp->name, rhp,
+                                        (unsigned long)rhp->func,
+                                        rdp->qlen_lazy, rdp->qlen);
+       else
+               trace_rcu_callback(rdp->rsp->name, rhp,
+                                  rdp->qlen_lazy, rdp->qlen);
         return 1;
   }
   
@@@ -2242,7 -2134,7 +2135,7 @@@ static bool __maybe_unused rcu_nocb_ado
         long qll = rsp->qlen_lazy;
   
         /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
- -      if (!is_nocb_cpu(smp_processor_id()))
+ +      if (!rcu_is_nocb_cpu(smp_processor_id()))
                 return 0;
         rsp->qlen = 0;
         rsp->qlen_lazy = 0;
@@@ -2266,95 -2158,36 +2159,36 @@@
   }
   
   /*
-  * There must be at least one non-no-CBs CPU in operation at any given
-  * time, because no-CBs CPUs are not capable of initiating grace periods
-  * independently.  This function therefore complains if the specified
-  * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
-  * avoid offlining the last such CPU.  (Recursion is a wonderful thing,
-  * but you have to have a base case!)
+  * If necessary, kick off a new grace period, and either way wait
+  * for a subsequent grace period to complete.
    */
- static bool nocb_cpu_expendable(int cpu)
+ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
   {
-       cpumask_var_t non_nocb_cpus;
-       int ret;
+       unsigned long c;
+       bool d;
+       unsigned long flags;
+       struct rcu_node *rnp = rdp->mynode;
+ 
+       raw_spin_lock_irqsave(&rnp->lock, flags);
+       c = rcu_start_future_gp(rnp, rdp);
+       raw_spin_unlock_irqrestore(&rnp->lock, flags);
   
         /*
-        * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
-        * then offlining this CPU is harmless.  Let it happen.
+        * Wait for the grace period.  Do so interruptibly to avoid messing
+        * up the load average.
          */
-       if (!have_rcu_nocb_mask || rcu_is_nocb_cpu(cpu))
-               return 1;
- 
-       /* If no memory, play it safe and keep the CPU around. */
-       if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
-               return 0;
-       cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
-       cpumask_clear_cpu(cpu, non_nocb_cpus);
-       ret = !cpumask_empty(non_nocb_cpus);
-       free_cpumask_var(non_nocb_cpus);
-       return ret;
- }
- 
- /*
-  * Helper structure for remote registry of RCU callbacks.
-  * This is needed for when a no-CBs CPU needs to start a grace period.
-  * If it just invokes call_rcu(), the resulting callback will be queued,
-  * which can result in deadlock.
-  */
- struct rcu_head_remote {
-       struct rcu_head *rhp;
-       call_rcu_func_t *crf;
-       void (*func)(struct rcu_head *rhp);
- };
- 
- /*
-  * Register a callback as specified by the rcu_head_remote struct.
-  * This function is intended to be invoked via smp_call_function_single().
-  */
- static void call_rcu_local(void *arg)
- {
-       struct rcu_head_remote *rhrp =
-               container_of(arg, struct rcu_head_remote, rhp);
- 
-       rhrp->crf(rhrp->rhp, rhrp->func);
- }
- 
- /*
-  * Set up an rcu_head_remote structure and the invoke call_rcu_local()
-  * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
-  * smp_call_function_single().
-  */
- static void invoke_crf_remote(struct rcu_head *rhp,
-                             void (*func)(struct rcu_head *rhp),
-                             call_rcu_func_t crf)
- {
-       struct rcu_head_remote rhr;
- 
-       rhr.rhp = rhp;
-       rhr.crf = crf;
-       rhr.func = func;
-       smp_call_function_single(0, call_rcu_local, &rhr, 1);
- }
- 
- /*
-  * Helper functions to be passed to wait_rcu_gp(), each of which
-  * invokes invoke_crf_remote() to register a callback appropriately.
-  */
- static void __maybe_unused
- call_rcu_preempt_remote(struct rcu_head *rhp,
-                       void (*func)(struct rcu_head *rhp))
- {
-       invoke_crf_remote(rhp, func, call_rcu);
- }
- static void call_rcu_bh_remote(struct rcu_head *rhp,
-                              void (*func)(struct rcu_head *rhp))
- {
-       invoke_crf_remote(rhp, func, call_rcu_bh);
- }
- static void call_rcu_sched_remote(struct rcu_head *rhp,
-                                 void (*func)(struct rcu_head *rhp))
- {
-       invoke_crf_remote(rhp, func, call_rcu_sched);
+       trace_rcu_future_gp(rnp, rdp, c, "StartWait");
+       for (;;) {
+               wait_event_interruptible(
+                       rnp->nocb_gp_wq[c & 0x1],
+                       (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+               if (likely(d))
+                       break;
+               flush_signals(current);
+               trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
+       }
+       trace_rcu_future_gp(rnp, rdp, c, "EndWait");
+       smp_mb(); /* Ensure that CB invocation happens after GP end. */
   }
   
   /*
@@@ -2391,7 -2224,7 +2225,7 @@@ static int rcu_nocb_kthread(void *arg
                 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
                 ACCESS_ONCE(rdp->nocb_p_count) += c;
                 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
-               wait_rcu_gp(rdp->rsp->call_remote);
+               rcu_nocb_wait_gp(rdp);
   
                 /* Each pass through the following loop invokes a callback. */
                 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@@ -2437,33 -2270,47 +2271,42 @@@ static void __init rcu_spawn_nocb_kthre
                 return;
         for_each_cpu(cpu, rcu_nocb_mask) {
                 rdp = per_cpu_ptr(rsp->rda, cpu);
-               t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
+               t = kthread_run(rcu_nocb_kthread, rdp,
+                               "rcuo%c/%d", rsp->abbr, cpu);
                 BUG_ON(IS_ERR(t));
                 ACCESS_ONCE(rdp->nocb_kthread) = t;
         }
   }
   
   /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
- static void init_nocb_callback_list(struct rcu_data *rdp)
+ static bool init_nocb_callback_list(struct rcu_data *rdp)
   {
         if (rcu_nocb_mask == NULL ||
             !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
-               return;
+               return false;
         rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+       return true;
+ }
+ 
+ #else /* #ifdef CONFIG_RCU_NOCB_CPU */
+ 
+ static int rcu_nocb_needs_gp(struct rcu_state *rsp)
+ {
+       return 0;
   }
   
- /* Initialize the ->call_remote fields in the rcu_state structures. */
- static void __init rcu_init_nocb(void)
+ static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
   {
- #ifdef CONFIG_PREEMPT_RCU
-       rcu_preempt_state.call_remote = call_rcu_preempt_remote;
- #endif /* #ifdef CONFIG_PREEMPT_RCU */
-       rcu_bh_state.call_remote = call_rcu_bh_remote;
-       rcu_sched_state.call_remote = call_rcu_sched_remote;
   }
   
- #else /* #ifdef CONFIG_RCU_NOCB_CPU */
+ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+ {
+ }
+ 
+ static void rcu_init_one_nocb(struct rcu_node *rnp)
+ {
+ }
   
- -static bool is_nocb_cpu(int cpu)
- -{
- -      return false;
- -}
- -
   static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
                             bool lazy)
   {
@@@ -2476,11 -2323,6 +2319,6 @@@ static bool __maybe_unused rcu_nocb_ado
         return 0;
   }
   
- static bool nocb_cpu_expendable(int cpu)
- {
-       return 1;
- }
- 
   static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
   {
   }
@@@ -2489,29 -2331,9 +2327,26 @@@ static void __init rcu_spawn_nocb_kthre
   {
   }
   
- static void init_nocb_callback_list(struct rcu_data *rdp)
- {
- }
- 
- static void __init rcu_init_nocb(void)
+ static bool init_nocb_callback_list(struct rcu_data *rdp)
   {
+       return false;
   }
   
   #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+ +
+ +/*
+ + * An adaptive-ticks CPU can potentially execute in kernel mode for an
+ + * arbitrarily long period of time with the scheduling-clock tick turned
+ + * off.  RCU will be paying attention to this CPU because it is in the
+ + * kernel, but the CPU cannot be guaranteed to be executing the RCU state
+ + * machine because the scheduling-clock tick has been disabled.  Therefore,
+ + * if an adaptive-ticks CPU is failing to respond to the current grace
+ + * period and has not been idle from an RCU perspective, kick it.
+ + */
+ +static void rcu_kick_nohz_cpu(int cpu)
+ +{
+ +#ifdef CONFIG_NO_HZ_FULL
+ +      if (tick_nohz_full_cpu(cpu))
+ +              smp_send_reschedule(cpu);
+ +#endif /* #ifdef CONFIG_NO_HZ_FULL */
+ +}
diff --combined kernel/sched/core.c

index dd09def88567bf9f418825c0f684b105b0667be7,c70a8814a767904c07912aaed2281c26bc355451..e94842d4400c1a53b4a5331f7f35c711e41c2c79
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -512,11 -512,6 +512,6 @@@ static inline void init_hrtick(void
    * the target CPU.
    */
   #ifdef CONFIG_SMP
- 
- #ifndef tsk_is_polling
- #define tsk_is_polling(t) 0
- #endif
- 
   void resched_task(struct task_struct *p)
   {
         int cpu;
@@@ -549,7 -544,7 +544,7 @@@ void resched_cpu(int cpu
         raw_spin_unlock_irqrestore(&rq->lock, flags);
   }
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   /*
    * In the semi idle case, use the nearest busy cpu for migrating timers
    * from an idle cpu.  This is good for power-savings.
@@@ -587,7 -582,7 +582,7 @@@ unlock
    * account when the CPU goes back to idle and evaluates the timer
    * wheel for the next timer event.
    */
- -void wake_up_idle_cpu(int cpu)
+ +static void wake_up_idle_cpu(int cpu)
   {
         struct rq *rq = cpu_rq(cpu);
   
@@@ -617,56 -612,20 +612,56 @@@
                 smp_send_reschedule(cpu);
   }
   
+ +static bool wake_up_full_nohz_cpu(int cpu)
+ +{
+ +      if (tick_nohz_full_cpu(cpu)) {
+ +              if (cpu != smp_processor_id() ||
+ +                  tick_nohz_tick_stopped())
+ +                      smp_send_reschedule(cpu);
+ +              return true;
+ +      }
+ +
+ +      return false;
+ +}
+ +
+ +void wake_up_nohz_cpu(int cpu)
+ +{
+ +      if (!wake_up_full_nohz_cpu(cpu))
+ +              wake_up_idle_cpu(cpu);
+ +}
+ +
   static inline bool got_nohz_idle_kick(void)
   {
         int cpu = smp_processor_id();
         return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
   }
   
- -#else /* CONFIG_NO_HZ */
+ +#else /* CONFIG_NO_HZ_COMMON */
   
   static inline bool got_nohz_idle_kick(void)
   {
         return false;
   }
   
- -#endif /* CONFIG_NO_HZ */
+ +#endif /* CONFIG_NO_HZ_COMMON */
+ +
+ +#ifdef CONFIG_NO_HZ_FULL
+ +bool sched_can_stop_tick(void)
+ +{
+ +       struct rq *rq;
+ +
+ +       rq = this_rq();
+ +
+ +       /* Make sure rq->nr_running update is visible after the IPI */
+ +       smp_rmb();
+ +
+ +       /* More than one running task need preemption */
+ +       if (rq->nr_running > 1)
+ +               return false;
+ +
+ +       return true;
+ +}
+ +#endif /* CONFIG_NO_HZ_FULL */
   
   void sched_avg_update(struct rq *rq)
   {
@@@ -1398,8 -1357,7 +1393,8 @@@ static void sched_ttwu_pending(void
   
   void scheduler_ipi(void)
   {
- -      if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ +      if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
+ +          && !tick_nohz_full_cpu(smp_processor_id()))
                 return;
   
         /*
@@@ -1416,7 -1374,6 +1411,7 @@@
          * somewhat pessimize the simple resched case.
          */
         irq_enter();
+ +      tick_nohz_full_check();
         sched_ttwu_pending();
   
         /*
@@@ -1536,8 -1493,10 +1531,10 @@@ static void try_to_wake_up_local(struc
   {
         struct rq *rq = task_rq(p);
   
-       BUG_ON(rq != this_rq());
-       BUG_ON(p == current);
+       if (WARN_ON_ONCE(rq != this_rq()) ||
+           WARN_ON_ONCE(p == current))
+               return;
+ 
         lockdep_assert_held(&rq->lock);
   
         if (!raw_spin_trylock(&p->pi_lock)) {
@@@ -1896,8 -1855,6 +1893,8 @@@ static void finish_task_switch(struct r
                 kprobe_flush_task(prev);
                 put_task_struct(prev);
         }
+ +
+ +      tick_nohz_task_switch(current);
   }
   
   #ifdef CONFIG_SMP
@@@ -2161,7 -2118,7 +2158,7 @@@ calc_load(unsigned long load, unsigned 
         return load >> FSHIFT;
   }
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   /*
    * Handle NO_HZ for the global load-average.
    *
@@@ -2387,12 -2344,12 +2384,12 @@@ static void calc_global_nohz(void
         smp_wmb();
         calc_load_idx++;
   }
- -#else /* !CONFIG_NO_HZ */
+ +#else /* !CONFIG_NO_HZ_COMMON */
   
   static inline long calc_load_fold_idle(void) { return 0; }
   static inline void calc_global_nohz(void) { }
   
- -#endif /* CONFIG_NO_HZ */
+ +#endif /* CONFIG_NO_HZ_COMMON */
   
   /*
    * calc_load - update the avenrun load estimates 10 ticks after the
@@@ -2552,7 -2509,7 +2549,7 @@@ static void __update_cpu_load(struct r
         sched_avg_update(this_rq);
   }
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   /*
    * There is no sane way to deal with nohz on smp when using jiffies because the
    * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@@ -2612,7 -2569,7 +2609,7 @@@ void update_cpu_load_nohz(void
         }
         raw_spin_unlock(&this_rq->lock);
   }
- -#endif /* CONFIG_NO_HZ */
+ +#endif /* CONFIG_NO_HZ_COMMON */
   
   /*
    * Called from scheduler_tick()
@@@ -3037,51 -2994,6 +3034,6 @@@ void __sched schedule_preempt_disabled(
         preempt_disable();
   }
   
- #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- 
- static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
- {
-       if (lock->owner != owner)
-               return false;
- 
-       /*
-        * Ensure we emit the owner->on_cpu, dereference _after_ checking
-        * lock->owner still matches owner, if that fails, owner might
-        * point to free()d memory, if it still matches, the rcu_read_lock()
-        * ensures the memory stays valid.
-        */
-       barrier();
- 
-       return owner->on_cpu;
- }
- 
- /*
-  * Look out! "owner" is an entirely speculative pointer
-  * access and not reliable.
-  */
- int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
- {
-       if (!sched_feat(OWNER_SPIN))
-               return 0;
- 
-       rcu_read_lock();
-       while (owner_running(lock, owner)) {
-               if (need_resched())
-                       break;
- 
-               arch_mutex_cpu_relax();
-       }
-       rcu_read_unlock();
- 
-       /*
-        * We break out the loop above on need_resched() and when the
-        * owner changed, which is a sign for heavy contention. Return
-        * success only when lock->owner is NULL.
-        */
-       return lock->owner == NULL;
- }
- #endif
- 
   #ifdef CONFIG_PREEMPT
   /*
    * this is the entry point to schedule() from in-kernel preemption
@@@ -4170,6 -4082,10 +4122,10 @@@ long sched_setaffinity(pid_t pid, cons
         get_task_struct(p);
         rcu_read_unlock();
   
+       if (p->flags & PF_NO_SETAFFINITY) {
+               retval = -EINVAL;
+               goto out_put_task;
+       }
         if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
                 retval = -ENOMEM;
                 goto out_put_task;
@@@ -4817,11 -4733,6 +4773,6 @@@ int set_cpus_allowed_ptr(struct task_st
                 goto out;
         }
   
-       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
-               ret = -EINVAL;
-               goto out;
-       }
- 
         do_set_cpus_allowed(p, new_mask);
   
         /* Can the task run on the task's current CPU? If so, we're done */
@@@ -5043,7 -4954,7 +4994,7 @@@ static void sd_free_ctl_entry(struct ct
   }
   
   static int min_load_idx = 0;
- static int max_load_idx = CPU_LOAD_IDX_MAX;
+ static int max_load_idx = CPU_LOAD_IDX_MAX-1;
   
   static void
   set_table_entry(struct ctl_table *entry,
@@@ -6292,7 -6203,7 +6243,7 @@@ static void sched_init_numa(void
          * 'level' contains the number of unique distances, excluding the
          * identity distance node_distance(i,i).
          *
-        * The sched_domains_nume_distance[] array includes the actual distance
+        * The sched_domains_numa_distance[] array includes the actual distance
          * numbers.
          */
   
@@@ -6913,7 -6824,7 +6864,7 @@@ struct task_group root_task_group
   LIST_HEAD(task_groups);
   #endif
   
- DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+ DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
   
   void __init sched_init(void)
   {
@@@ -6950,7 -6861,7 +6901,7 @@@
   #endif /* CONFIG_RT_GROUP_SCHED */
   #ifdef CONFIG_CPUMASK_OFFSTACK
                 for_each_possible_cpu(i) {
-                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                       per_cpu(load_balance_mask, i) = (void *)ptr;
                         ptr += cpumask_size();
                 }
   #endif /* CONFIG_CPUMASK_OFFSTACK */
@@@ -6976,12 -6887,6 +6927,6 @@@
   
   #endif /* CONFIG_CGROUP_SCHED */
   
- #ifdef CONFIG_CGROUP_CPUACCT
-       root_cpuacct.cpustat = &kernel_cpustat;
-       root_cpuacct.cpuusage = alloc_percpu(u64);
-       /* Too early, not expected to fail */
-       BUG_ON(!root_cpuacct.cpuusage);
- #endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
   
@@@ -7045,7 -6950,7 +6990,7 @@@
                 INIT_LIST_HEAD(&rq->cfs_tasks);
   
                 rq_attach_root(rq, &def_root_domain);
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
                 rq->nohz_flags = 0;
   #endif
   #endif
@@@ -8083,226 -7988,6 +8028,6 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   
   #endif        /* CONFIG_CGROUP_SCHED */
   
- #ifdef CONFIG_CGROUP_CPUACCT
- 
- /*
-  * CPU accounting code for task groups.
-  *
-  * Based on the work by Paul Menage ([email protected]) and Balbir Singh
-  * ([email protected]).
-  */
- 
- struct cpuacct root_cpuacct;
- 
- /* create a new cpu accounting group */
- static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
- {
-       struct cpuacct *ca;
- 
-       if (!cgrp->parent)
-               return &root_cpuacct.css;
- 
-       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-       if (!ca)
-               goto out;
- 
-       ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage)
-               goto out_free_ca;
- 
-       ca->cpustat = alloc_percpu(struct kernel_cpustat);
-       if (!ca->cpustat)
-               goto out_free_cpuusage;
- 
-       return &ca->css;
- 
- out_free_cpuusage:
-       free_percpu(ca->cpuusage);
- out_free_ca:
-       kfree(ca);
- out:
-       return ERR_PTR(-ENOMEM);
- }
- 
- /* destroy an existing cpu accounting group */
- static void cpuacct_css_free(struct cgroup *cgrp)
- {
-       struct cpuacct *ca = cgroup_ca(cgrp);
- 
-       free_percpu(ca->cpustat);
-       free_percpu(ca->cpuusage);
-       kfree(ca);
- }
- 
- static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
- {
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-       u64 data;
- 
- #ifndef CONFIG_64BIT
-       /*
-        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
-        */
-       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       data = *cpuusage;
-       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
- #else
-       data = *cpuusage;
- #endif
- 
-       return data;
- }
- 
- static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
- {
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- 
- #ifndef CONFIG_64BIT
-       /*
-        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
-        */
-       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       *cpuusage = val;
-       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
- #else
-       *cpuusage = val;
- #endif
- }
- 
- /* return total cpu usage (in nanoseconds) of a group */
- static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
- {
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       u64 totalcpuusage = 0;
-       int i;
- 
-       for_each_present_cpu(i)
-               totalcpuusage += cpuacct_cpuusage_read(ca, i);
- 
-       return totalcpuusage;
- }
- 
- static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-                                                               u64 reset)
- {
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int err = 0;
-       int i;
- 
-       if (reset) {
-               err = -EINVAL;
-               goto out;
-       }
- 
-       for_each_present_cpu(i)
-               cpuacct_cpuusage_write(ca, i, 0);
- 
- out:
-       return err;
- }
- 
- static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
-                                  struct seq_file *m)
- {
-       struct cpuacct *ca = cgroup_ca(cgroup);
-       u64 percpu;
-       int i;
- 
-       for_each_present_cpu(i) {
-               percpu = cpuacct_cpuusage_read(ca, i);
-               seq_printf(m, "%llu ", (unsigned long long) percpu);
-       }
-       seq_printf(m, "\n");
-       return 0;
- }
- 
- static const char *cpuacct_stat_desc[] = {
-       [CPUACCT_STAT_USER] = "user",
-       [CPUACCT_STAT_SYSTEM] = "system",
- };
- 
- static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-                             struct cgroup_map_cb *cb)
- {
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int cpu;
-       s64 val = 0;
- 
-       for_each_online_cpu(cpu) {
-               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-               val += kcpustat->cpustat[CPUTIME_USER];
-               val += kcpustat->cpustat[CPUTIME_NICE];
-       }
-       val = cputime64_to_clock_t(val);
-       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
- 
-       val = 0;
-       for_each_online_cpu(cpu) {
-               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-               val += kcpustat->cpustat[CPUTIME_SYSTEM];
-               val += kcpustat->cpustat[CPUTIME_IRQ];
-               val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
-       }
- 
-       val = cputime64_to_clock_t(val);
-       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
- 
-       return 0;
- }
- 
- static struct cftype files[] = {
-       {
-               .name = "usage",
-               .read_u64 = cpuusage_read,
-               .write_u64 = cpuusage_write,
-       },
-       {
-               .name = "usage_percpu",
-               .read_seq_string = cpuacct_percpu_seq_read,
-       },
-       {
-               .name = "stat",
-               .read_map = cpuacct_stats_show,
-       },
-       { }     /* terminate */
- };
- 
- /*
-  * charge this task's execution time to its accounting group.
-  *
-  * called with rq->lock held.
-  */
- void cpuacct_charge(struct task_struct *tsk, u64 cputime)
- {
-       struct cpuacct *ca;
-       int cpu;
- 
-       if (unlikely(!cpuacct_subsys.active))
-               return;
- 
-       cpu = task_cpu(tsk);
- 
-       rcu_read_lock();
- 
-       ca = task_ca(tsk);
- 
-       for (; ca; ca = parent_ca(ca)) {
-               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-               *cpuusage += cputime;
-       }
- 
-       rcu_read_unlock();
- }
- 
- struct cgroup_subsys cpuacct_subsys = {
-       .name = "cpuacct",
-       .css_alloc = cpuacct_css_alloc,
-       .css_free = cpuacct_css_free,
-       .subsys_id = cpuacct_subsys_id,
-       .base_cftypes = files,
- };
- #endif        /* CONFIG_CGROUP_CPUACCT */
- 
   void dump_cpu_task(int cpu)
   {
         pr_info("Task dump for CPU %d:\n", cpu);
diff --combined kernel/sched/fair.c

index 5c97fca091a779b7638f4f212b54d43576950e55,8bf7081b1ec5201af66b08eaf3602177beef6368..c61a614465c8ebf13b5f71aabf23a78c9e8533a6
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -1563,6 -1563,27 +1563,27 @@@ static inline void dequeue_entity_load_
                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
   }
+ 
+ /*
+  * Update the rq's load with the elapsed running time before entering
+  * idle. If the last scheduled task is not a CFS task, idle_enter will
+  * be the only way to update the runnable statistic.
+  */
+ void idle_enter_fair(struct rq *this_rq)
+ {
+       update_rq_runnable_avg(this_rq, 1);
+ }
+ 
+ /*
+  * Update the rq's load with the elapsed idle time before a task is
+  * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
+  * be the only way to update the runnable statistic.
+  */
+ void idle_exit_fair(struct rq *this_rq)
+ {
+       update_rq_runnable_avg(this_rq, 0);
+ }
+ 
   #else
   static inline void update_entity_load_avg(struct sched_entity *se,
                                           int update_cfs_rq) {}
@@@ -3875,12 -3896,16 +3896,16 @@@ int can_migrate_task(struct task_struc
         int tsk_cache_hot = 0;
         /*
          * We do not migrate tasks that are:
-        * 1) running (obviously), or
+        * 1) throttled_lb_pair, or
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
-        * 3) are cache-hot on their current CPU.
+        * 3) running (obviously), or
+        * 4) are cache-hot on their current CPU.
          */
+       if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+               return 0;
+ 
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
-               int new_dst_cpu;
+               int cpu;
   
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
   
@@@ -3895,12 -3920,15 +3920,15 @@@
                 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
                         return 0;
   
-               new_dst_cpu = cpumask_first_and(env->dst_grpmask,
-                                               tsk_cpus_allowed(p));
-               if (new_dst_cpu < nr_cpu_ids) {
-                       env->flags |= LBF_SOME_PINNED;
-                       env->new_dst_cpu = new_dst_cpu;
+               /* Prevent re-selecting dst_cpu via env's cpus */
+               for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+                       if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+                               env->flags |= LBF_SOME_PINNED;
+                               env->new_dst_cpu = cpu;
+                               break;
+                       }
                 }
+ 
                 return 0;
         }
   
@@@ -3921,20 -3949,17 +3949,17 @@@
         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- #ifdef CONFIG_SCHEDSTATS
+ 
                 if (tsk_cache_hot) {
                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
                         schedstat_inc(p, se.statistics.nr_forced_migrations);
                 }
- #endif
+ 
                 return 1;
         }
   
-       if (tsk_cache_hot) {
-               schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
-               return 0;
-       }
-       return 1;
+       schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+       return 0;
   }
   
   /*
@@@ -3949,9 -3974,6 +3974,6 @@@ static int move_one_task(struct lb_env 
         struct task_struct *p, *n;
   
         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
-               if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
-                       continue;
- 
                 if (!can_migrate_task(p, env))
                         continue;
   
@@@ -4003,7 -4025,7 +4025,7 @@@ static int move_tasks(struct lb_env *en
                         break;
                 }
   
-               if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+               if (!can_migrate_task(p, env))
                         goto next;
   
                 load = task_h_load(p);
@@@ -4014,9 -4036,6 +4036,6 @@@
                 if ((load / 2) > env->imbalance)
                         goto next;
   
-               if (!can_migrate_task(p, env))
-                       goto next;
- 
                 move_task(p, env);
                 pulled++;
                 env->imbalance -= load;
@@@ -4961,7 -4980,7 +4980,7 @@@ static struct rq *find_busiest_queue(st
   #define MAX_PINNED_INTERVAL   512
   
   /* Working cpumask for load_balance and load_balance_newidle. */
- DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
   
   static int need_active_balance(struct lb_env *env)
   {
@@@ -4992,11 -5011,10 +5011,10 @@@ static int load_balance(int this_cpu, s
                         int *balance)
   {
         int ld_moved, cur_ld_moved, active_balance = 0;
-       int lb_iterations, max_lb_iterations;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
-       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+       struct cpumask *cpus = __get_cpu_var(load_balance_mask);
   
         struct lb_env env = {
                 .sd             = sd,
@@@ -5008,8 -5026,14 +5026,14 @@@
                 .cpus           = cpus,
         };
   
+       /*
+        * For NEWLY_IDLE load_balancing, we don't need to consider
+        * other cpus in our group
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               env.dst_grpmask = NULL;
+ 
         cpumask_copy(cpus, cpu_active_mask);
-       max_lb_iterations = cpumask_weight(env.dst_grpmask);
   
         schedstat_inc(sd, lb_count[idle]);
   
@@@ -5035,7 -5059,6 +5059,6 @@@ redo
         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
   
         ld_moved = 0;
-       lb_iterations = 1;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found
@@@ -5062,17 -5085,17 +5085,17 @@@ more_balance
                 double_rq_unlock(env.dst_rq, busiest);
                 local_irq_restore(flags);
   
-               if (env.flags & LBF_NEED_BREAK) {
-                       env.flags &= ~LBF_NEED_BREAK;
-                       goto more_balance;
-               }
- 
                 /*
                  * some other cpu did the load balance for us.
                  */
                 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
                         resched_cpu(env.dst_cpu);
   
+               if (env.flags & LBF_NEED_BREAK) {
+                       env.flags &= ~LBF_NEED_BREAK;
+                       goto more_balance;
+               }
+ 
                 /*
                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
                  * us and move them to an alternate dst_cpu in our sched_group
@@@ -5092,14 -5115,17 +5115,17 @@@
                  * moreover subsequent load balance cycles should correct the
                  * excess load moved.
                  */
-               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
-                               lb_iterations++ < max_lb_iterations) {
+               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
   
                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
                         env.dst_cpu      = env.new_dst_cpu;
                         env.flags       &= ~LBF_SOME_PINNED;
                         env.loop         = 0;
                         env.loop_break   = sched_nr_migrate_break;
+ 
+                       /* Prevent to re-select dst_cpu via env's cpus */
+                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ 
                         /*
                          * Go back to "more_balance" rather than "redo" since we
                          * need to continue with same src_cpu.
@@@ -5220,8 -5246,6 +5246,6 @@@ void idle_balance(int this_cpu, struct 
         if (this_rq->avg_idle < sysctl_sched_migration_cost)
                 return;
   
-       update_rq_runnable_avg(this_rq, 1);
- 
         /*
          * Drop the rq->lock, but keep IRQ/preempt disabled.
          */
@@@ -5331,7 -5355,7 +5355,7 @@@ out_unlock
         return 0;
   }
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   /*
    * idle load balancing details
    * - When one of the busy CPUs notice that there may be an idle rebalancing
@@@ -5396,13 -5420,16 +5420,16 @@@ static inline void set_cpu_sd_state_bus
         struct sched_domain *sd;
         int cpu = smp_processor_id();
   
-       if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-               return;
-       clear_bit(NOHZ_IDLE, nohz_flags(cpu));
- 
         rcu_read_lock();
-       for_each_domain(cpu, sd)
+       sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ 
+       if (!sd || !sd->nohz_idle)
+               goto unlock;
+       sd->nohz_idle = 0;
+ 
+       for (; sd; sd = sd->parent)
                 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ unlock:
         rcu_read_unlock();
   }
   
@@@ -5411,13 -5438,16 +5438,16 @@@ void set_cpu_sd_state_idle(void
         struct sched_domain *sd;
         int cpu = smp_processor_id();
   
-       if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-               return;
-       set_bit(NOHZ_IDLE, nohz_flags(cpu));
- 
         rcu_read_lock();
-       for_each_domain(cpu, sd)
+       sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+ 
+       if (!sd || sd->nohz_idle)
+               goto unlock;
+       sd->nohz_idle = 1;
+ 
+       for (; sd; sd = sd->parent)
                 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ unlock:
         rcu_read_unlock();
   }
   
@@@ -5469,7 -5499,7 +5499,7 @@@ void update_max_interval(void
    * It checks each scheduling domain to see if it is due to be balanced,
    * and initiates a balancing operation if so.
    *
-  * Balancing parameters are set up in arch_init_sched_domains.
+  * Balancing parameters are set up in init_sched_domains.
    */
   static void rebalance_domains(int cpu, enum cpu_idle_type idle)
   {
@@@ -5507,10 -5537,11 +5537,11 @@@
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
                         if (load_balance(cpu, rq, sd, idle, &balance)) {
                                 /*
-                                * We've pulled tasks over so either we're no
-                                * longer idle.
+                                * The LBF_SOME_PINNED logic could have changed
+                                * env->dst_cpu, so we can't know our idle
+                                * state even if we migrated tasks. Update it.
                                  */
-                               idle = CPU_NOT_IDLE;
+                               idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
                         }
                         sd->last_balance = jiffies;
                 }
@@@ -5541,9 -5572,9 +5572,9 @@@ out
                 rq->next_balance = next_balance;
   }
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   /*
- - * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
    * rebalancing for all the cpus for whom scheduler ticks are stopped.
    */
   static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@@ -5686,7 -5717,7 +5717,7 @@@ void trigger_load_balance(struct rq *rq
         if (time_after_eq(jiffies, rq->next_balance) &&
             likely(!on_null_domain(cpu)))
                 raise_softirq(SCHED_SOFTIRQ);
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
         if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
                 nohz_balancer_kick(cpu);
   #endif
@@@ -6156,7 -6187,7 +6187,7 @@@ __init void init_sched_fair_class(void
   #ifdef CONFIG_SMP
         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
         nohz.next_balance = jiffies;
         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
         cpu_notifier(sched_ilb_notifier, 0);
diff --combined kernel/sched/sched.h

index eb363aa5d83cf02a6f2717a79a56e815de4b1c43,4c225c4c7111d7baaa3dfa3d4ec0b3f9a966c593..24dc2989774937be3f1161541d8fb8860aeec4d0
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -5,9 -5,9 +5,10 @@@
   #include <linux/mutex.h>
   #include <linux/spinlock.h>
   #include <linux/stop_machine.h>
+ +#include <linux/tick.h>
   
   #include "cpupri.h"
+ #include "cpuacct.h"
   
   extern __read_mostly int scheduler_running;
   
@@@ -405,7 -405,7 +406,7 @@@ struct rq 
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
         unsigned long last_load_update_tick;
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
         u64 nohz_stamp;
         unsigned long nohz_flags;
   #endif
@@@ -951,14 -951,6 +952,6 @@@ static const u32 prio_to_wmult[40] = 
    /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
   };
   
- /* Time spent by the tasks of the cpu accounting group executing in ... */
- enum cpuacct_stat_index {
-       CPUACCT_STAT_USER,      /* ... user mode */
-       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
- 
-       CPUACCT_STAT_NSTATS,
- };
- 
   #define ENQUEUE_WAKEUP                1
   #define ENQUEUE_HEAD          2
   #ifdef CONFIG_SMP
@@@ -1032,6 -1024,18 +1025,18 @@@ extern void update_group_power(struct s
   extern void trigger_load_balance(struct rq *rq, int cpu);
   extern void idle_balance(int this_cpu, struct rq *this_rq);
   
+ /*
+  * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+  * becomes useful in lb
+  */
+ #if defined(CONFIG_FAIR_GROUP_SCHED)
+ extern void idle_enter_fair(struct rq *this_rq);
+ extern void idle_exit_fair(struct rq *this_rq);
+ #else
+ static inline void idle_enter_fair(struct rq *this_rq) {}
+ static inline void idle_exit_fair(struct rq *this_rq) {}
+ #endif
+ 
   #else /* CONFIG_SMP */
   
   static inline void idle_balance(int cpu, struct rq *rq)
@@@ -1055,45 -1059,6 +1060,6 @@@ extern void init_rt_bandwidth(struct rt
   
   extern void update_idle_cpu_load(struct rq *this_rq);
   
- #ifdef CONFIG_CGROUP_CPUACCT
- #include <linux/cgroup.h>
- /* track cpu usage of a group of tasks and its child groups */
- struct cpuacct {
-       struct cgroup_subsys_state css;
-       /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 __percpu *cpuusage;
-       struct kernel_cpustat __percpu *cpustat;
- };
- 
- extern struct cgroup_subsys cpuacct_subsys;
- extern struct cpuacct root_cpuacct;
- 
- /* return cpu accounting group corresponding to this container */
- static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
- {
-       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-                           struct cpuacct, css);
- }
- 
- /* return cpu accounting group to which this task belongs */
- static inline struct cpuacct *task_ca(struct task_struct *tsk)
- {
-       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-                           struct cpuacct, css);
- }
- 
- static inline struct cpuacct *parent_ca(struct cpuacct *ca)
- {
-       if (!ca || !ca->css.cgroup->parent)
-               return NULL;
-       return cgroup_ca(ca->css.cgroup->parent);
- }
- 
- extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
- #else
- static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
- #endif
- 
   #ifdef CONFIG_PARAVIRT
   static inline u64 steal_ticks(u64 steal)
   {
@@@ -1107,16 -1072,6 +1073,16 @@@
   static inline void inc_nr_running(struct rq *rq)
   {
         rq->nr_running++;
+ +
+ +#ifdef CONFIG_NO_HZ_FULL
+ +      if (rq->nr_running == 2) {
+ +              if (tick_nohz_full_cpu(rq->cpu)) {
+ +                      /* Order rq->nr_running write against the IPI */
+ +                      smp_wmb();
+ +                      smp_send_reschedule(rq->cpu);
+ +              }
+ +       }
+ +#endif
   }
   
   static inline void dec_nr_running(struct rq *rq)
@@@ -1344,11 -1299,10 +1310,10 @@@ extern void init_rt_rq(struct rt_rq *rt
   
   extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
   
- -#ifdef CONFIG_NO_HZ
+ +#ifdef CONFIG_NO_HZ_COMMON
   enum rq_nohz_flag_bits {
         NOHZ_TICK_STOPPED,
         NOHZ_BALANCE_KICK,
-       NOHZ_IDLE,
   };
   
   #define nohz_flags(cpu)       (&cpu_rq(cpu)->nohz_flags)
diff --combined kernel/softirq.c

index 8b1446d4a4dbbe3f1e090bebdeb53f9c9c49f192,14d7758074aadf4d1c43947ecef675e8bb6c044e..51a09d56e78b875cc2c329afb59876f52afdaf8b
--- 1/kernel/softirq.c
--- 2/kernel/softirq.c
+++ b/kernel/softirq.c
@@@ -323,47 -323,35 +323,44 @@@ void irq_enter(void
   
   static inline void invoke_softirq(void)
   {
-       if (!force_irqthreads) {
- #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+       if (!force_irqthreads)
                 __do_softirq();
- #else
-               do_softirq();
- #endif
-       } else {
-               __local_bh_disable((unsigned long)__builtin_return_address(0),
-                               SOFTIRQ_OFFSET);
+       else
                 wakeup_softirqd();
-               __local_bh_enable(SOFTIRQ_OFFSET);
-       }
   }
   
+ +static inline void tick_irq_exit(void)
+ +{
+ +#ifdef CONFIG_NO_HZ_COMMON
+ +      int cpu = smp_processor_id();
+ +
+ +      /* Make sure that timer wheel updates are propagated */
+ +      if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ +              if (!in_interrupt())
+ +                      tick_nohz_irq_exit();
+ +      }
+ +#endif
+ +}
+ +
   /*
    * Exit an interrupt context. Process softirqs if needed and possible:
    */
   void irq_exit(void)
   {
+ #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
+       local_irq_disable();
+ #else
+       WARN_ON_ONCE(!irqs_disabled());
+ #endif
+ 
         account_irq_exit_time(current);
         trace_hardirq_exit();
-       sub_preempt_count(IRQ_EXIT_OFFSET);
+       sub_preempt_count(HARDIRQ_OFFSET);
         if (!in_interrupt() && local_softirq_pending())
                 invoke_softirq();
   
- -#ifdef CONFIG_NO_HZ
- -      /* Make sure that timer wheel updates are propagated */
- -      if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
- -              tick_nohz_irq_exit();
- -#endif
+ +      tick_irq_exit();
         rcu_irq_exit();
-       sched_preempt_enable_no_resched();
   }
   
   /*
diff --combined kernel/time/tick-broadcast.c

index a3a3123f6272b74486ea14503b914ac4e6f17160,7f32fe0e52cd46489c8d90e4b85f9d74204aab16..40c10502c9e9e51871f1034a035f40c44a653961
--- 1/kernel/time/tick-broadcast.c
--- 2/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@@ -67,7 -67,8 +67,8 @@@ static void tick_broadcast_start_period
    */
   int tick_check_broadcast_device(struct clock_event_device *dev)
   {
-       if ((tick_broadcast_device.evtdev &&
+       if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+           (tick_broadcast_device.evtdev &&
              tick_broadcast_device.evtdev->rating >= dev->rating) ||
              (dev->features & CLOCK_EVT_FEAT_C3STOP))
                 return 0;
@@@ -573,8 -574,7 +574,8 @@@ void tick_broadcast_setup_oneshot(struc
                 bc->event_handler = tick_handle_oneshot_broadcast;
   
                 /* Take the do_timer update */
- -              tick_do_timer_cpu = cpu;
+ +              if (!tick_nohz_full_cpu(cpu))
+ +                      tick_do_timer_cpu = cpu;
   
                 /*
                  * We must be careful here. There might be other CPUs
author	Frederic Weisbecker <[email protected]>
	Thu, 2 May 2013 15:37:49 +0000 (17:37 +0200)
committer	Frederic Weisbecker <[email protected]>
	Thu, 2 May 2013 15:54:19 +0000 (17:54 +0200)
		1	2
Documentation/RCU/stallwarn.txt	patch \|	diff1 \|	diff2 \|	blob \| history
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/perf_event.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/rcupdate.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
init/main.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/hrtimer.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcutree.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcutree.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcutree_plugin.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/softirq.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/time/tick-broadcast.c	patch \|	diff1 \|	diff2 \|	blob \| history