Merge branch 'linus' into sched/core

author Ingo Molnar <[email protected]>
Fri, 2 Apr 2010 18:02:55 +0000 (20:02 +0200)
committer Ingo Molnar <[email protected]>
Fri, 2 Apr 2010 18:03:08 +0000 (20:03 +0200)
diff --combined include/linux/sched.h

index 8604884cee87c01e5363d2ecf3fff06a005173c3,dad7f668ebf70041f3897102a0ff13a1a456edad..43c9451527321ecdbb945a275b45c39d500d1deb
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -258,6 -258,10 +258,10 @@@ extern spinlock_t mmlist_lock
   
   struct task_struct;
   
+ #ifdef CONFIG_PROVE_RCU
+ extern int lockdep_tasklist_lock_is_held(void);
+ #endif /* #ifdef CONFIG_PROVE_RCU */
+ 
   extern void sched_init(void);
   extern void sched_init_smp(void);
   extern asmlinkage void schedule_tail(struct task_struct *prev);
@@@ -271,17 -275,11 +275,17 @@@ extern cpumask_var_t nohz_cpu_mask
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
   extern int select_nohz_load_balancer(int cpu);
   extern int get_nohz_load_balancer(void);
+ +extern int nohz_ratelimit(int cpu);
   #else
   static inline int select_nohz_load_balancer(int cpu)
   {
         return 0;
   }
+ +
+ +static inline int nohz_ratelimit(int cpu)
+ +{
+ +      return 0;
+ +}
   #endif
   
   /*
@@@ -402,60 -400,6 +406,6 @@@ extern void arch_unmap_area_topdown(str
   static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
   #endif
   
- #if USE_SPLIT_PTLOCKS
- /*
-  * The mm counters are not protected by its page_table_lock,
-  * so must be incremented atomically.
-  */
- #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
- #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
- #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
- #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
- #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
- 
- #else  /* !USE_SPLIT_PTLOCKS */
- /*
-  * The mm counters are protected by its page_table_lock,
-  * so can be incremented directly.
-  */
- #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
- #define get_mm_counter(mm, member) ((mm)->_##member)
- #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
- #define inc_mm_counter(mm, member) (mm)->_##member++
- #define dec_mm_counter(mm, member) (mm)->_##member--
- 
- #endif /* !USE_SPLIT_PTLOCKS */
- 
- #define get_mm_rss(mm)                                        \
-       (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
- #define update_hiwater_rss(mm)        do {                    \
-       unsigned long _rss = get_mm_rss(mm);            \
-       if ((mm)->hiwater_rss < _rss)                   \
-               (mm)->hiwater_rss = _rss;               \
- } while (0)
- #define update_hiwater_vm(mm) do {                    \
-       if ((mm)->hiwater_vm < (mm)->total_vm)          \
-               (mm)->hiwater_vm = (mm)->total_vm;      \
- } while (0)
- 
- static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
- {
-       return max(mm->hiwater_rss, get_mm_rss(mm));
- }
- 
- static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
-                                        struct mm_struct *mm)
- {
-       unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
- 
-       if (*maxrss < hiwater_rss)
-               *maxrss = hiwater_rss;
- }
- 
- static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
- {
-       return max(mm->hiwater_vm, mm->total_vm);
- }
   
   extern void set_dumpable(struct mm_struct *mm, int value);
   extern int get_dumpable(struct mm_struct *mm);
@@@ -1133,8 -1077,36 +1083,8 @@@ struct load_weight 
         unsigned long weight, inv_weight;
   };
   
- -/*
- - * CFS stats for a schedulable entity (task, task-group etc)
- - *
- - * Current field usage histogram:
- - *
- - *     4 se->block_start
- - *     4 se->run_node
- - *     4 se->sleep_start
- - *     6 se->load.weight
- - */
- -struct sched_entity {
- -      struct load_weight      load;           /* for load-balancing */
- -      struct rb_node          run_node;
- -      struct list_head        group_node;
- -      unsigned int            on_rq;
- -
- -      u64                     exec_start;
- -      u64                     sum_exec_runtime;
- -      u64                     vruntime;
- -      u64                     prev_sum_exec_runtime;
- -
- -      u64                     last_wakeup;
- -      u64                     avg_overlap;
- -
- -      u64                     nr_migrations;
- -
- -      u64                     start_runtime;
- -      u64                     avg_wakeup;
- -
   #ifdef CONFIG_SCHEDSTATS
+ +struct sched_statistics {
         u64                     wait_start;
         u64                     wait_max;
         u64                     wait_count;
@@@ -1166,24 -1138,6 +1116,24 @@@
         u64                     nr_wakeups_affine_attempts;
         u64                     nr_wakeups_passive;
         u64                     nr_wakeups_idle;
+ +};
+ +#endif
+ +
+ +struct sched_entity {
+ +      struct load_weight      load;           /* for load-balancing */
+ +      struct rb_node          run_node;
+ +      struct list_head        group_node;
+ +      unsigned int            on_rq;
+ +
+ +      u64                     exec_start;
+ +      u64                     sum_exec_runtime;
+ +      u64                     vruntime;
+ +      u64                     prev_sum_exec_runtime;
+ +
+ +      u64                     nr_migrations;
+ +
+ +#ifdef CONFIG_SCHEDSTATS
+ +      struct sched_statistics statistics;
   #endif
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1270,7 -1224,9 +1220,9 @@@ struct task_struct 
         struct plist_node pushable_tasks;
   
         struct mm_struct *mm, *active_mm;
- 
+ #if defined(SPLIT_RSS_COUNTING)
+       struct task_rss_stat    rss_stat;
+ #endif
   /* task state */
         int exit_state;
         int exit_code, exit_signal;
@@@ -1521,7 -1477,7 +1473,7 @@@
   
         struct list_head        *scm_work_list;
   #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-       /* Index of current stored adress in ret_stack */
+       /* Index of current stored address in ret_stack */
         int curr_ret_stack;
         /* Stack of return addresses for return function tracing */
         struct ftrace_ret_stack *ret_stack;
@@@ -2439,9 -2395,7 +2391,7 @@@ void thread_group_cputimer(struct task_
   
   static inline void thread_group_cputime_init(struct signal_struct *sig)
   {
-       sig->cputimer.cputime = INIT_CPUTIME;
         spin_lock_init(&sig->cputimer.lock);
-       sig->cputimer.running = 0;
   }
   
   static inline void thread_group_cputime_free(struct signal_struct *sig)
diff --combined kernel/sched.c

index cc6dc8caa3809f6e73e8780a39b8f6a3c29e715e,49d2fa7b687a6cd9956e5d7179ab26822bcff6f1..52b7efd274167faf094605665ba37ca73f714804
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -492,11 -492,8 +492,11 @@@ struct rq 
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
   #ifdef CONFIG_NO_HZ
+ +      u64 nohz_stamp;
         unsigned char in_nohz_recently;
   #endif
+ +      unsigned int skip_clock_update;
+ +
         /* capture load from *all* tasks on this cpu: */
         struct load_weight load;
         unsigned long nr_load_updates;
@@@ -594,13 -591,6 +594,13 @@@ static inlin
   void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
   {
         rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ +
+ +      /*
+ +       * A queue event has occurred, and we're going to schedule.  In
+ +       * this case, we can save a useless back to back clock update.
+ +       */
+ +      if (test_tsk_need_resched(p))
+ +              rq->skip_clock_update = 1;
   }
   
   static inline int cpu_of(struct rq *rq)
@@@ -635,8 -625,7 +635,8 @@@
   
   inline void update_rq_clock(struct rq *rq)
   {
- -      rq->clock = sched_clock_cpu(cpu_of(rq));
+ +      if (!rq->skip_clock_update)
+ +              rq->clock = sched_clock_cpu(cpu_of(rq));
   }
   
   /*
@@@ -1239,17 -1228,6 +1239,17 @@@ void wake_up_idle_cpu(int cpu
         if (!tsk_is_polling(rq->idle))
                 smp_send_reschedule(cpu);
   }
+ +
+ +int nohz_ratelimit(int cpu)
+ +{
+ +      struct rq *rq = cpu_rq(cpu);
+ +      u64 diff = rq->clock - rq->nohz_stamp;
+ +
+ +      rq->nohz_stamp = rq->clock;
+ +
+ +      return diff < (NSEC_PER_SEC / HZ) >> 1;
+ +}
+ +
   #endif /* CONFIG_NO_HZ */
   
   static u64 sched_avg_period(void)
@@@ -1543,7 -1521,7 +1543,7 @@@ static unsigned long cpu_avg_load_per_t
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   
- static __read_mostly unsigned long *update_shares_data;
+ static __read_mostly unsigned long __percpu *update_shares_data;
   
   static void __set_se_shares(struct sched_entity *se, unsigned long shares);
   
@@@ -1792,6 -1770,8 +1792,6 @@@ static void double_rq_lock(struct rq *r
                         raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
                 }
         }
- -      update_rq_clock(rq1);
- -      update_rq_clock(rq2);
   }
   
   /*
@@@ -1888,7 -1868,9 +1888,7 @@@ static void update_avg(u64 *avg, u64 sa
   static void
   enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
   {
- -      if (wakeup)
- -              p->se.start_runtime = p->se.sum_exec_runtime;
- -
+ +      update_rq_clock(rq);
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, wakeup, head);
         p->se.on_rq = 1;
@@@ -1896,7 -1878,17 +1896,7 @@@
   
   static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
   {
- -      if (sleep) {
- -              if (p->se.last_wakeup) {
- -                      update_avg(&p->se.avg_overlap,
- -                              p->se.sum_exec_runtime - p->se.last_wakeup);
- -                      p->se.last_wakeup = 0;
- -              } else {
- -                      update_avg(&p->se.avg_wakeup,
- -                              sysctl_sched_wakeup_granularity);
- -              }
- -      }
- -
+ +      update_rq_clock(rq);
         sched_info_dequeued(p);
         p->sched_class->dequeue_task(rq, p, sleep);
         p->se.on_rq = 0;
@@@ -2369,10 -2361,14 +2369,10 @@@ static int try_to_wake_up(struct task_s
         unsigned long flags;
         struct rq *rq;
   
- -      if (!sched_feat(SYNC_WAKEUPS))
- -              wake_flags &= ~WF_SYNC;
- -
         this_cpu = get_cpu();
   
         smp_wmb();
         rq = task_rq_lock(p, &flags);
- -      update_rq_clock(rq);
         if (!(p->state & state))
                 goto out;
   
@@@ -2413,6 -2409,7 +2413,6 @@@
   
         rq = cpu_rq(cpu);
         raw_spin_lock(&rq->lock);
- -      update_rq_clock(rq);
   
         /*
          * We migrated the task without holding either rq->lock, however
@@@ -2440,18 -2437,34 +2440,18 @@@
   
   out_activate:
   #endif /* CONFIG_SMP */
- -      schedstat_inc(p, se.nr_wakeups);
+ +      schedstat_inc(p, se.statistics.nr_wakeups);
         if (wake_flags & WF_SYNC)
- -              schedstat_inc(p, se.nr_wakeups_sync);
+ +              schedstat_inc(p, se.statistics.nr_wakeups_sync);
         if (orig_cpu != cpu)
- -              schedstat_inc(p, se.nr_wakeups_migrate);
+ +              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
         if (cpu == this_cpu)
- -              schedstat_inc(p, se.nr_wakeups_local);
+ +              schedstat_inc(p, se.statistics.nr_wakeups_local);
         else
- -              schedstat_inc(p, se.nr_wakeups_remote);
+ +              schedstat_inc(p, se.statistics.nr_wakeups_remote);
         activate_task(rq, p, 1);
         success = 1;
   
- -      /*
- -       * Only attribute actual wakeups done by this task.
- -       */
- -      if (!in_interrupt()) {
- -              struct sched_entity *se = &current->se;
- -              u64 sample = se->sum_exec_runtime;
- -
- -              if (se->last_wakeup)
- -                      sample -= se->last_wakeup;
- -              else
- -                      sample -= se->start_runtime;
- -              update_avg(&se->avg_wakeup, sample);
- -
- -              se->last_wakeup = se->sum_exec_runtime;
- -      }
- -
   out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, wake_flags);
@@@ -2513,9 -2526,42 +2513,9 @@@ static void __sched_fork(struct task_st
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
- -      p->se.last_wakeup               = 0;
- -      p->se.avg_overlap               = 0;
- -      p->se.start_runtime             = 0;
- -      p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
   
   #ifdef CONFIG_SCHEDSTATS
- -      p->se.wait_start                        = 0;
- -      p->se.wait_max                          = 0;
- -      p->se.wait_count                        = 0;
- -      p->se.wait_sum                          = 0;
- -
- -      p->se.sleep_start                       = 0;
- -      p->se.sleep_max                         = 0;
- -      p->se.sum_sleep_runtime                 = 0;
- -
- -      p->se.block_start                       = 0;
- -      p->se.block_max                         = 0;
- -      p->se.exec_max                          = 0;
- -      p->se.slice_max                         = 0;
- -
- -      p->se.nr_migrations_cold                = 0;
- -      p->se.nr_failed_migrations_affine       = 0;
- -      p->se.nr_failed_migrations_running      = 0;
- -      p->se.nr_failed_migrations_hot          = 0;
- -      p->se.nr_forced_migrations              = 0;
- -
- -      p->se.nr_wakeups                        = 0;
- -      p->se.nr_wakeups_sync                   = 0;
- -      p->se.nr_wakeups_migrate                = 0;
- -      p->se.nr_wakeups_local                  = 0;
- -      p->se.nr_wakeups_remote                 = 0;
- -      p->se.nr_wakeups_affine                 = 0;
- -      p->se.nr_wakeups_affine_attempts        = 0;
- -      p->se.nr_wakeups_passive                = 0;
- -      p->se.nr_wakeups_idle                   = 0;
- -
+ +      memset(&p->se.statistics, 0, sizeof(p->se.statistics));
   #endif
   
         INIT_LIST_HEAD(&p->rt.run_list);
@@@ -2604,7 -2650,7 +2604,7 @@@ void wake_up_new_task(struct task_struc
   {
         unsigned long flags;
         struct rq *rq;
-       int cpu = get_cpu();
+       int cpu __maybe_unused = get_cpu();
   
   #ifdef CONFIG_SMP
         /*
@@@ -2629,6 -2675,7 +2629,6 @@@
   
         BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
- -      update_rq_clock(rq);
         activate_task(rq, p, 0);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
@@@ -3582,9 -3629,23 +3582,9 @@@ static inline void schedule_debug(struc
   
   static void put_prev_task(struct rq *rq, struct task_struct *prev)
   {
- -      if (prev->state == TASK_RUNNING) {
- -              u64 runtime = prev->se.sum_exec_runtime;
- -
- -              runtime -= prev->se.prev_sum_exec_runtime;
- -              runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
- -
- -              /*
- -               * In order to avoid avg_overlap growing stale when we are
- -               * indeed overlapping and hence not getting put to sleep, grow
- -               * the avg_overlap on preemption.
- -               *
- -               * We use the average preemption runtime because that
- -               * correlates to the amount of cache footprint a task can
- -               * build up.
- -               */
- -              update_avg(&prev->se.avg_overlap, runtime);
- -      }
+ +      if (prev->se.on_rq)
+ +              update_rq_clock(rq);
+ +      rq->skip_clock_update = 0;
         prev->sched_class->put_prev_task(rq, prev);
   }
   
@@@ -3647,6 -3708,7 +3647,6 @@@ need_resched_nonpreemptible
                 hrtick_clear(rq);
   
         raw_spin_lock_irq(&rq->lock);
- -      update_rq_clock(rq);
         clear_tsk_need_resched(prev);
   
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@@ -4203,6 -4265,7 +4203,6 @@@ void rt_mutex_setprio(struct task_struc
         BUG_ON(prio < 0 || prio > MAX_PRIO);
   
         rq = task_rq_lock(p, &flags);
- -      update_rq_clock(rq);
   
         oldprio = p->prio;
         prev_class = p->sched_class;
@@@ -4245,6 -4308,7 +4245,6 @@@ void set_user_nice(struct task_struct *
          * the task might be in the middle of scheduling on another CPU.
          */
         rq = task_rq_lock(p, &flags);
- -      update_rq_clock(rq);
         /*
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
@@@ -4289,7 -4353,7 +4289,7 @@@ int can_nice(const struct task_struct *
         /* convert nice value [19,-20] to rlimit style value [1,40] */
         int nice_rlim = 20 - nice;
   
-       return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+       return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                 capable(CAP_SYS_NICE));
   }
   
@@@ -4466,7 -4530,7 +4466,7 @@@ recheck
   
                         if (!lock_task_sighand(p, &flags))
                                 return -ESRCH;
-                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+                       rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
                         unlock_task_sighand(p, &flags);
   
                         /* can't set/change the rt policy */
@@@ -4527,6 -4591,7 +4527,6 @@@
                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                 goto recheck;
         }
- -      update_rq_clock(rq);
         on_rq = p->se.on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@@ -4837,7 -4902,9 +4837,9 @@@ SYSCALL_DEFINE3(sched_getaffinity, pid_
         int ret;
         cpumask_var_t mask;
   
-       if (len < cpumask_size())
+       if (len < nr_cpu_ids)
+               return -EINVAL;
+       if (len & (sizeof(unsigned long)-1))
                 return -EINVAL;
   
         if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@@ -4845,10 -4912,12 +4847,12 @@@
   
         ret = sched_getaffinity(pid, mask);
         if (ret == 0) {
-               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+               size_t retlen = min_t(size_t, len, cpumask_size());
+ 
+               if (copy_to_user(user_mask_ptr, mask, retlen))
                         ret = -EFAULT;
                 else
-                       ret = cpumask_size();
+                       ret = retlen;
         }
         free_cpumask_var(mask);
   
@@@ -5533,6 -5602,7 +5537,6 @@@ void sched_idle_next(void
   
         __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
   
- -      update_rq_clock(rq);
         activate_task(rq, p, 0);
   
         raw_spin_unlock_irqrestore(&rq->lock, flags);
@@@ -5587,6 -5657,7 +5591,6 @@@ static void migrate_dead_tasks(unsigne
         for ( ; ; ) {
                 if (!rq->nr_running)
                         break;
- -              update_rq_clock(rq);
                 next = pick_next_task(rq);
                 if (!next)
                         break;
@@@ -5870,6 -5941,7 +5874,6 @@@ migration_call(struct notifier_block *n
                 rq->migration_thread = NULL;
                 /* Idle task back to normal (off runqueue, low prio) */
                 raw_spin_lock_irq(&rq->lock);
- -              update_rq_clock(rq);
                 deactivate_task(rq, rq->idle, 0);
                 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                 rq->idle->sched_class = &idle_sched_class;
@@@ -7338,11 -7410,13 +7342,13 @@@ static ssize_t sched_power_savings_stor
   
   #ifdef CONFIG_SCHED_MC
   static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+                                          struct sysdev_class_attribute *attr,
                                            char *page)
   {
         return sprintf(page, "%u\n", sched_mc_power_savings);
   }
   static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+                                           struct sysdev_class_attribute *attr,
                                             const char *buf, size_t count)
   {
         return sched_power_savings_store(buf, count, 0);
@@@ -7354,11 -7428,13 +7360,13 @@@ static SYSDEV_CLASS_ATTR(sched_mc_power
   
   #ifdef CONFIG_SCHED_SMT
   static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+                                           struct sysdev_class_attribute *attr,
                                             char *page)
   {
         return sprintf(page, "%u\n", sched_smt_power_savings);
   }
   static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+                                            struct sysdev_class_attribute *attr,
                                              const char *buf, size_t count)
   {
         return sched_power_savings_store(buf, count, 1);
@@@ -7815,6 -7891,7 +7823,6 @@@ static void normalize_task(struct rq *r
   {
         int on_rq;
   
- -      update_rq_clock(rq);
         on_rq = p->se.on_rq;
         if (on_rq)
                 deactivate_task(rq, p, 0);
@@@ -7841,9 -7918,9 +7849,9 @@@ void normalize_rt_tasks(void
   
                 p->se.exec_start                = 0;
   #ifdef CONFIG_SCHEDSTATS
- -              p->se.wait_start                = 0;
- -              p->se.sleep_start               = 0;
- -              p->se.block_start               = 0;
+ +              p->se.statistics.wait_start     = 0;
+ +              p->se.statistics.sleep_start    = 0;
+ +              p->se.statistics.block_start    = 0;
   #endif
   
                 if (!rt_task(p)) {
@@@ -8176,6 -8253,8 +8184,6 @@@ void sched_move_task(struct task_struc
   
         rq = task_rq_lock(tsk, &flags);
   
- -      update_rq_clock(rq);
- -
         running = task_current(rq, tsk);
         on_rq = tsk->se.on_rq;
   
@@@ -8742,7 -8821,7 +8750,7 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 *cpuusage;
+       u64 __percpu *cpuusage;
         struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
   };
diff --combined kernel/sched_fair.c

index 35a5c649638b8cdcbd9901c9a9dd659465ac823f,5a5ea2cd924fa8494abfa21f8203f919f40ff1ca..49ad99378f82b064258d67829f1443d569716067
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -35,8 -35,8 +35,8 @@@
    * (to see the precise effective timeslice length of your workload,
    *  run vmstat and monitor the context-switches (cs) field)
    */
- -unsigned int sysctl_sched_latency = 5000000ULL;
- -unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+ +unsigned int sysctl_sched_latency = 6000000ULL;
+ +unsigned int normalized_sysctl_sched_latency = 6000000ULL;
   
   /*
    * The initial- and re-scaling of tunables is configurable
@@@ -52,15 -52,15 +52,15 @@@ enum sched_tunable_scaling sysctl_sched
   
   /*
    * Minimal preemption granularity for CPU-bound tasks:
- - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
    */
- -unsigned int sysctl_sched_min_granularity = 1000000ULL;
- -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+ +unsigned int sysctl_sched_min_granularity = 2000000ULL;
+ +unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
   
   /*
    * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
    */
- -static unsigned int sched_nr_latency = 5;
+ +static unsigned int sched_nr_latency = 3;
   
   /*
    * After fork, child runs first. If set to 0 (default) then
@@@ -505,8 -505,7 +505,8 @@@ __update_curr(struct cfs_rq *cfs_rq, st
   {
         unsigned long delta_exec_weighted;
   
- -      schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+ +      schedstat_set(curr->statistics.exec_max,
+ +                    max((u64)delta_exec, curr->statistics.exec_max));
   
         curr->sum_exec_runtime += delta_exec;
         schedstat_add(cfs_rq, exec_clock, delta_exec);
@@@ -549,7 -548,7 +549,7 @@@ static void update_curr(struct cfs_rq *
   static inline void
   update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
- -      schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+ +      schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
   }
   
   /*
@@@ -568,18 -567,18 +568,18 @@@ static void update_stats_enqueue(struc
   static void
   update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
- -      schedstat_set(se->wait_max, max(se->wait_max,
- -                      rq_of(cfs_rq)->clock - se->wait_start));
- -      schedstat_set(se->wait_count, se->wait_count + 1);
- -      schedstat_set(se->wait_sum, se->wait_sum +
- -                      rq_of(cfs_rq)->clock - se->wait_start);
+ +      schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ +                      rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ +      schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ +      schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ +                      rq_of(cfs_rq)->clock - se->statistics.wait_start);
   #ifdef CONFIG_SCHEDSTATS
         if (entity_is_task(se)) {
                 trace_sched_stat_wait(task_of(se),
- -                      rq_of(cfs_rq)->clock - se->wait_start);
+ +                      rq_of(cfs_rq)->clock - se->statistics.wait_start);
         }
   #endif
- -      schedstat_set(se->wait_start, 0);
+ +      schedstat_set(se->statistics.wait_start, 0);
   }
   
   static inline void
@@@ -658,39 -657,39 +658,39 @@@ static void enqueue_sleeper(struct cfs_
         if (entity_is_task(se))
                 tsk = task_of(se);
   
- -      if (se->sleep_start) {
- -              u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ +      if (se->statistics.sleep_start) {
+ +              u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
   
                 if ((s64)delta < 0)
                         delta = 0;
   
- -              if (unlikely(delta > se->sleep_max))
- -                      se->sleep_max = delta;
+ +              if (unlikely(delta > se->statistics.sleep_max))
+ +                      se->statistics.sleep_max = delta;
   
- -              se->sleep_start = 0;
- -              se->sum_sleep_runtime += delta;
+ +              se->statistics.sleep_start = 0;
+ +              se->statistics.sum_sleep_runtime += delta;
   
                 if (tsk) {
                         account_scheduler_latency(tsk, delta >> 10, 1);
                         trace_sched_stat_sleep(tsk, delta);
                 }
         }
- -      if (se->block_start) {
- -              u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ +      if (se->statistics.block_start) {
+ +              u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
   
                 if ((s64)delta < 0)
                         delta = 0;
   
- -              if (unlikely(delta > se->block_max))
- -                      se->block_max = delta;
+ +              if (unlikely(delta > se->statistics.block_max))
+ +                      se->statistics.block_max = delta;
   
- -              se->block_start = 0;
- -              se->sum_sleep_runtime += delta;
+ +              se->statistics.block_start = 0;
+ +              se->statistics.sum_sleep_runtime += delta;
   
                 if (tsk) {
                         if (tsk->in_iowait) {
- -                              se->iowait_sum += delta;
- -                              se->iowait_count++;
+ +                              se->statistics.iowait_sum += delta;
+ +                              se->statistics.iowait_count++;
                                 trace_sched_stat_iowait(tsk, delta);
                         }
   
@@@ -738,9 -737,19 +738,9 @@@ place_entity(struct cfs_rq *cfs_rq, str
                 vruntime += sched_vslice(cfs_rq, se);
   
         /* sleeps up to a single latency don't count. */
- -      if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ +      if (!initial) {
                 unsigned long thresh = sysctl_sched_latency;
   
- -              /*
- -               * Convert the sleeper threshold into virtual time.
- -               * SCHED_IDLE is a special sub-class.  We care about
- -               * fairness only relative to other SCHED_IDLE tasks,
- -               * all of which have the same weight.
- -               */
- -              if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
- -                               task_of(se)->policy != SCHED_IDLE))
- -                      thresh = calc_delta_fair(thresh, se);
- -
                 /*
                  * Halve their sleep time's effect, to allow
                  * for a gentler effect of sleepers:
@@@ -817,9 -826,9 +817,9 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
                         struct task_struct *tsk = task_of(se);
   
                         if (tsk->state & TASK_INTERRUPTIBLE)
- -                              se->sleep_start = rq_of(cfs_rq)->clock;
+ +                              se->statistics.sleep_start = rq_of(cfs_rq)->clock;
                         if (tsk->state & TASK_UNINTERRUPTIBLE)
- -                              se->block_start = rq_of(cfs_rq)->clock;
+ +                              se->statistics.block_start = rq_of(cfs_rq)->clock;
                 }
   #endif
         }
@@@ -903,7 -912,7 +903,7 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
          * when there are only lesser-weight tasks around):
          */
         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- -              se->slice_max = max(se->slice_max,
+ +              se->statistics.slice_max = max(se->statistics.slice_max,
                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
         }
   #endif
@@@ -1231,6 -1240,7 +1231,6 @@@ static inline unsigned long effective_l
   
   static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
   {
- -      struct task_struct *curr = current;
         unsigned long this_load, load;
         int idx, this_cpu, prev_cpu;
         unsigned long tl_per_task;
@@@ -1245,6 -1255,18 +1245,6 @@@
         load      = source_load(prev_cpu, idx);
         this_load = target_load(this_cpu, idx);
   
- -      if (sync) {
- -             if (sched_feat(SYNC_LESS) &&
- -                 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- -                  p->se.avg_overlap > sysctl_sched_migration_cost))
- -                     sync = 0;
- -      } else {
- -              if (sched_feat(SYNC_MORE) &&
- -                  (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- -                   p->se.avg_overlap < sysctl_sched_migration_cost))
- -                      sync = 1;
- -      }
- -
         /*
          * If sync wakeup then subtract the (maximum possible)
          * effect of the currently running task from the load
@@@ -1284,7 -1306,7 +1284,7 @@@
         if (sync && balanced)
                 return 1;
   
- -      schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ +      schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
         tl_per_task = cpu_avg_load_per_task(this_cpu);
   
         if (balanced ||
@@@ -1296,7 -1318,7 +1296,7 @@@
                  * there is no bad imbalance.
                  */
                 schedstat_inc(sd, ttwu_move_affine);
- -              schedstat_inc(p, se.nr_wakeups_affine);
+ +              schedstat_inc(p, se.statistics.nr_wakeups_affine);
   
                 return 1;
         }
@@@ -1429,12 -1451,13 +1429,12 @@@ static int select_task_rq_fair(struct t
         int cpu = smp_processor_id();
         int prev_cpu = task_cpu(p);
         int new_cpu = cpu;
- -      int want_affine = 0;
+ +      int want_affine = 0, cpu_idle = !current->pid;
         int want_sd = 1;
         int sync = wake_flags & WF_SYNC;
   
         if (sd_flag & SD_BALANCE_WAKE) {
- -              if (sched_feat(AFFINE_WAKEUPS) &&
- -                  cpumask_test_cpu(cpu, &p->cpus_allowed))
+ +              if (cpumask_test_cpu(cpu, &p->cpus_allowed))
                         want_affine = 1;
                 new_cpu = prev_cpu;
         }
@@@ -1486,15 -1509,13 +1486,15 @@@
                          * If there's an idle sibling in this domain, make that
                          * the wake_affine target instead of the current cpu.
                          */
- -                      if (tmp->flags & SD_SHARE_PKG_RESOURCES)
+ +                      if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
                                 target = select_idle_sibling(p, tmp, target);
   
                         if (target >= 0) {
                                 if (tmp->flags & SD_WAKE_AFFINE) {
                                         affine_sd = tmp;
                                         want_affine = 0;
+ +                                      if (target != cpu)
+ +                                              cpu_idle = 1;
                                 }
                                 cpu = target;
                         }
@@@ -1510,7 -1531,6 +1510,7 @@@
                         sd = tmp;
         }
   
+ +#ifdef CONFIG_FAIR_GROUP_SCHED
         if (sched_feat(LB_SHARES_UPDATE)) {
                 /*
                  * Pick the largest domain to update shares over
@@@ -1524,12 -1544,9 +1524,12 @@@
                 if (tmp)
                         update_shares(tmp);
         }
+ +#endif
   
- -      if (affine_sd && wake_affine(affine_sd, p, sync))
- -              return cpu;
+ +      if (affine_sd) {
+ +              if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ +                      return cpu;
+ +      }
   
         while (sd) {
                 int load_idx = sd->forkexec_idx;
@@@ -1574,26 -1591,63 +1574,26 @@@
   }
   #endif /* CONFIG_SMP */
   
- -/*
- - * Adaptive granularity
- - *
- - * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- - * with the limit of wakeup_gran -- when it never does a wakeup.
- - *
- - * So the smaller avg_wakeup is the faster we want this task to preempt,
- - * but we don't want to treat the preemptee unfairly and therefore allow it
- - * to run for at least the amount of time we'd like to run.
- - *
- - * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- - *
- - * NOTE: we use *nr_running to scale with load, this nicely matches the
- - *       degrading latency on load.
- - */
- -static unsigned long
- -adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
- -{
- -      u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- -      u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
- -      u64 gran = 0;
- -
- -      if (this_run < expected_wakeup)
- -              gran = expected_wakeup - this_run;
- -
- -      return min_t(s64, gran, sysctl_sched_wakeup_granularity);
- -}
- -
   static unsigned long
   wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
   {
         unsigned long gran = sysctl_sched_wakeup_granularity;
   
- -      if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
- -              gran = adaptive_gran(curr, se);
- -
         /*
          * Since its curr running now, convert the gran from real-time
          * to virtual-time in his units.
+ +       *
+ +       * By using 'se' instead of 'curr' we penalize light tasks, so
+ +       * they get preempted easier. That is, if 'se' < 'curr' then
+ +       * the resulting gran will be larger, therefore penalizing the
+ +       * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ +       * be smaller, again penalizing the lighter task.
+ +       *
+ +       * This is especially important for buddies when the leftmost
+ +       * task is higher priority than the buddy.
          */
- -      if (sched_feat(ASYM_GRAN)) {
- -              /*
- -               * By using 'se' instead of 'curr' we penalize light tasks, so
- -               * they get preempted easier. That is, if 'se' < 'curr' then
- -               * the resulting gran will be larger, therefore penalizing the
- -               * lighter, if otoh 'se' > 'curr' then the resulting gran will
- -               * be smaller, again penalizing the lighter task.
- -               *
- -               * This is especially important for buddies when the leftmost
- -               * task is higher priority than the buddy.
- -               */
- -              if (unlikely(se->load.weight != NICE_0_LOAD))
- -                      gran = calc_delta_fair(gran, se);
- -      } else {
- -              if (unlikely(curr->load.weight != NICE_0_LOAD))
- -                      gran = calc_delta_fair(gran, curr);
- -      }
+ +      if (unlikely(se->load.weight != NICE_0_LOAD))
+ +              gran = calc_delta_fair(gran, se);
   
         return gran;
   }
@@@ -1651,6 -1705,7 +1651,6 @@@ static void check_preempt_wakeup(struc
         struct task_struct *curr = rq->curr;
         struct sched_entity *se = &curr->se, *pse = &p->se;
         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- -      int sync = wake_flags & WF_SYNC;
         int scale = cfs_rq->nr_running >= sched_nr_latency;
   
         if (unlikely(rt_prio(p->prio)))
@@@ -1683,6 -1738,14 +1683,6 @@@
         if (unlikely(curr->policy == SCHED_IDLE))
                 goto preempt;
   
- -      if (sched_feat(WAKEUP_SYNC) && sync)
- -              goto preempt;
- -
- -      if (sched_feat(WAKEUP_OVERLAP) &&
- -                      se->avg_overlap < sysctl_sched_migration_cost &&
- -                      pse->avg_overlap < sysctl_sched_migration_cost)
- -              goto preempt;
- -
         if (!sched_feat(WAKEUP_PREEMPT))
                 return;
   
@@@ -1781,13 -1844,13 +1781,13 @@@ int can_migrate_task(struct task_struc
          * 3) are cache-hot on their current CPU.
          */
         if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- -              schedstat_inc(p, se.nr_failed_migrations_affine);
+ +              schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
                 return 0;
         }
         *all_pinned = 0;
   
         if (task_running(rq, p)) {
- -              schedstat_inc(p, se.nr_failed_migrations_running);
+ +              schedstat_inc(p, se.statistics.nr_failed_migrations_running);
                 return 0;
         }
   
@@@ -1803,14 -1866,14 +1803,14 @@@
   #ifdef CONFIG_SCHEDSTATS
                 if (tsk_cache_hot) {
                         schedstat_inc(sd, lb_hot_gained[idle]);
- -                      schedstat_inc(p, se.nr_forced_migrations);
+ +                      schedstat_inc(p, se.statistics.nr_forced_migrations);
                 }
   #endif
                 return 1;
         }
   
         if (tsk_cache_hot) {
- -              schedstat_inc(p, se.nr_failed_migrations_hot);
+ +              schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
                 return 0;
         }
         return 1;
@@@ -3049,6 -3112,8 +3049,6 @@@ static void active_load_balance(struct 
   
         /* move a task from busiest_rq to target_rq */
         double_lock_balance(busiest_rq, target_rq);
- -      update_rq_clock(busiest_rq);
- -      update_rq_clock(target_rq);
   
         /* Search for an sd spanning us and the target CPU. */
         for_each_domain(target_cpu, sd) {
@@@ -3411,7 -3476,7 +3411,7 @@@ static void run_rebalance_domains(struc
   
   static inline int on_null_domain(int cpu)
   {
-       return !rcu_dereference(cpu_rq(cpu)->sd);
+       return !rcu_dereference_sched(cpu_rq(cpu)->sd);
   }
   
   /*
diff --combined kernel/sched_rt.c

index 0335e87f5204f096378dd6ef40a7027fff4d13a2,b5b920ae2ea7fe83ca17d2c94d0a7b638574144c..012d69bb67c7a0e41a18141fe7d338026a9491be
--- 1/kernel/sched_rt.c
--- 2/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@ -613,7 -613,7 +613,7 @@@ static void update_curr_rt(struct rq *r
         if (unlikely((s64)delta_exec < 0))
                 delta_exec = 0;
   
- -      schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+ +      schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
   
         curr->se.sum_exec_runtime += delta_exec;
         account_group_exec_runtime(curr, delta_exec);
@@@ -1667,8 -1667,9 +1667,9 @@@ static void watchdog(struct rq *rq, str
         if (!p->signal)
                 return;
   
-       soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
-       hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
+       /* max may change after cur was read, this will be fixed next tick */
+       soft = task_rlimit(p, RLIMIT_RTTIME);
+       hard = task_rlimit_max(p, RLIMIT_RTTIME);
   
         if (soft != RLIM_INFINITY) {
                 unsigned long next;
author	Ingo Molnar <[email protected]>
	Fri, 2 Apr 2010 18:02:55 +0000 (20:02 +0200)
committer	Ingo Molnar <[email protected]>
	Fri, 2 Apr 2010 18:03:08 +0000 (20:03 +0200)
		1	2
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_rt.c	patch \|	diff1 \|	diff2 \|	blob \| history