Merge tag 'v3.15-rc6' into sched/core, to pick up the latest fixes

author Ingo Molnar <[email protected]>
Thu, 22 May 2014 08:28:56 +0000 (10:28 +0200)

committer Ingo Molnar <[email protected]>
Thu, 22 May 2014 08:28:56 +0000 (10:28 +0200)
diff --combined kernel/sched/core.c

index 4b82622b62529a87ddedff7fbb862a4ef2b14948,d9d8ece46a15885410a1bb138f4c6c7155573803..092e511605ecb8a7661a0d5be97cf7384114f7cc
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -505,39 -505,6 +505,39 @@@ static inline void init_hrtick(void
   }
   #endif        /* CONFIG_SCHED_HRTICK */
   
+ +/*
+ + * cmpxchg based fetch_or, macro so it works for different integer types
+ + */
+ +#define fetch_or(ptr, val)                                            \
+ +({    typeof(*(ptr)) __old, __val = *(ptr);                           \
+ +      for (;;) {                                                      \
+ +              __old = cmpxchg((ptr), __val, __val | (val));           \
+ +              if (__old == __val)                                     \
+ +                      break;                                          \
+ +              __val = __old;                                          \
+ +      }                                                               \
+ +      __old;                                                          \
+ +})
+ +
+ +#ifdef TIF_POLLING_NRFLAG
+ +/*
+ + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ + * this avoids any races wrt polling state changes and thereby avoids
+ + * spurious IPIs.
+ + */
+ +static bool set_nr_and_not_polling(struct task_struct *p)
+ +{
+ +      struct thread_info *ti = task_thread_info(p);
+ +      return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ +}
+ +#else
+ +static bool set_nr_and_not_polling(struct task_struct *p)
+ +{
+ +      set_tsk_need_resched(p);
+ +      return true;
+ +}
+ +#endif
+ +
   /*
    * resched_task - mark a task 'to be rescheduled now'.
    *
@@@ -554,15 -521,17 +554,15 @@@ void resched_task(struct task_struct *p
         if (test_tsk_need_resched(p))
                 return;
   
- -      set_tsk_need_resched(p);
- -
         cpu = task_cpu(p);
+ +
         if (cpu == smp_processor_id()) {
+ +              set_tsk_need_resched(p);
                 set_preempt_need_resched();
                 return;
         }
   
- -      /* NEED_RESCHED must be visible before we test polling */
- -      smp_mb();
- -      if (!tsk_is_polling(p))
+ +      if (set_nr_and_not_polling(p))
                 smp_send_reschedule(cpu);
   }
   
@@@ -2223,7 -2192,7 +2223,7 @@@ static inline void post_schedule(struc
    * schedule_tail - first thing a freshly forked thread must call.
    * @prev: the thread we just switched away from.
    */
- asmlinkage void schedule_tail(struct task_struct *prev)
+ asmlinkage __visible void schedule_tail(struct task_struct *prev)
         __releases(rq->lock)
   {
         struct rq *rq = this_rq();
@@@ -2623,14 -2592,8 +2623,14 @@@ pick_next_task(struct rq *rq, struct ta
         if (likely(prev->sched_class == class &&
                    rq->nr_running == rq->cfs.h_nr_running)) {
                 p = fair_sched_class.pick_next_task(rq, prev);
- -              if (likely(p && p != RETRY_TASK))
- -                      return p;
+ +              if (unlikely(p == RETRY_TASK))
+ +                      goto again;
+ +
+ +              /* assumes fair_sched_class->next == idle_sched_class */
+ +              if (unlikely(!p))
+ +                      p = idle_sched_class.pick_next_task(rq, prev);
+ +
+ +              return p;
         }
   
   again:
@@@ -2778,7 -2741,7 +2778,7 @@@ static inline void sched_submit_work(st
                 blk_schedule_flush_plug(tsk);
   }
   
- asmlinkage void __sched schedule(void)
+ asmlinkage __visible void __sched schedule(void)
   {
         struct task_struct *tsk = current;
   
@@@ -2788,7 -2751,7 +2788,7 @@@
   EXPORT_SYMBOL(schedule);
   
   #ifdef CONFIG_CONTEXT_TRACKING
- asmlinkage void __sched schedule_user(void)
+ asmlinkage __visible void __sched schedule_user(void)
   {
         /*
          * If we come here after a random call to set_need_resched(),
@@@ -2820,7 -2783,7 +2820,7 @@@ void __sched schedule_preempt_disabled(
    * off of preempt_enable. Kernel preemptions off return from interrupt
    * occur there and call schedule directly.
    */
- asmlinkage void __sched notrace preempt_schedule(void)
+ asmlinkage __visible void __sched notrace preempt_schedule(void)
   {
         /*
          * If there is a non-zero preempt_count or interrupts are disabled,
@@@ -2850,7 -2813,7 +2850,7 @@@ EXPORT_SYMBOL(preempt_schedule)
    * Note, that this is called and return with irqs disabled. This will
    * protect us against recursive calling from irq.
    */
- asmlinkage void __sched preempt_schedule_irq(void)
+ asmlinkage __visible void __sched preempt_schedule_irq(void)
   {
         enum ctx_state prev_state;
   
@@@ -3161,7 -3124,6 +3161,7 @@@ __setparam_dl(struct task_struct *p, co
         dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
         dl_se->dl_throttled = 0;
         dl_se->dl_new = 1;
+ +      dl_se->dl_yielded = 0;
   }
   
   static void __setscheduler_params(struct task_struct *p,
@@@ -3677,7 -3639,6 +3677,7 @@@ SYSCALL_DEFINE2(sched_setparam, pid_t, 
    * sys_sched_setattr - same as above, but with extended sched_attr
    * @pid: the pid in question.
    * @uattr: structure containing the extended parameters.
+ + * @flags: for future extension.
    */
   SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
                                unsigned int, flags)
@@@ -3822,7 -3783,6 +3822,7 @@@ err_size
    * @pid: the pid in question.
    * @uattr: structure containing the extended parameters.
    * @size: sizeof(attr) for fwd/bwd comp.
+ + * @flags: for future extension.
    */
   SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
                 unsigned int, size, unsigned int, flags)
@@@ -5292,8 -5252,7 +5292,8 @@@ static int sd_degenerate(struct sched_d
                          SD_BALANCE_FORK |
                          SD_BALANCE_EXEC |
                          SD_SHARE_CPUPOWER |
- -                       SD_SHARE_PKG_RESOURCES)) {
+ +                       SD_SHARE_PKG_RESOURCES |
+ +                       SD_SHARE_POWERDOMAIN)) {
                 if (sd->groups != sd->groups->next)
                         return 0;
         }
@@@ -5324,8 -5283,7 +5324,8 @@@ sd_parent_degenerate(struct sched_domai
                                 SD_BALANCE_EXEC |
                                 SD_SHARE_CPUPOWER |
                                 SD_SHARE_PKG_RESOURCES |
- -                              SD_PREFER_SIBLING);
+ +                              SD_PREFER_SIBLING |
+ +                              SD_SHARE_POWERDOMAIN);
                 if (nr_node_ids == 1)
                         pflags &= ~SD_SERIALIZE;
         }
@@@ -5599,6 -5557,17 +5599,6 @@@ static int __init isolated_cpu_setup(ch
   
   __setup("isolcpus=", isolated_cpu_setup);
   
- -static const struct cpumask *cpu_cpu_mask(int cpu)
- -{
- -      return cpumask_of_node(cpu_to_node(cpu));
- -}
- -
- -struct sd_data {
- -      struct sched_domain **__percpu sd;
- -      struct sched_group **__percpu sg;
- -      struct sched_group_power **__percpu sgp;
- -};
- -
   struct s_data {
         struct sched_domain ** __percpu sd;
         struct root_domain      *rd;
@@@ -5611,6 -5580,21 +5611,6 @@@ enum s_alloc 
         sa_none,
   };
   
- -struct sched_domain_topology_level;
- -
- -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
- -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
- -
- -#define SDTL_OVERLAP  0x01
- -
- -struct sched_domain_topology_level {
- -      sched_domain_init_f init;
- -      sched_domain_mask_f mask;
- -      int                 flags;
- -      int                 numa_level;
- -      struct sd_data      data;
- -};
- -
   /*
    * Build an iteration mask that can exclude certain CPUs from the upwards
    * domain traversal.
@@@ -5829,11 -5813,44 +5829,11 @@@ static void init_sched_groups_power(in
         atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
   }
   
- -int __weak arch_sd_sibling_asym_packing(void)
- -{
- -       return 0*SD_ASYM_PACKING;
- -}
- -
   /*
    * Initializers for schedule domains
    * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
    */
   
- -#ifdef CONFIG_SCHED_DEBUG
- -# define SD_INIT_NAME(sd, type)               sd->name = #type
- -#else
- -# define SD_INIT_NAME(sd, type)               do { } while (0)
- -#endif
- -
- -#define SD_INIT_FUNC(type)                                            \
- -static noinline struct sched_domain *                                 \
- -sd_init_##type(struct sched_domain_topology_level *tl, int cpu)       \
- -{                                                                     \
- -      struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
- -      *sd = SD_##type##_INIT;                                         \
- -      SD_INIT_NAME(sd, type);                                         \
- -      sd->private = &tl->data;                                        \
- -      return sd;                                                      \
- -}
- -
- -SD_INIT_FUNC(CPU)
- -#ifdef CONFIG_SCHED_SMT
- - SD_INIT_FUNC(SIBLING)
- -#endif
- -#ifdef CONFIG_SCHED_MC
- - SD_INIT_FUNC(MC)
- -#endif
- -#ifdef CONFIG_SCHED_BOOK
- - SD_INIT_FUNC(BOOK)
- -#endif
- -
   static int default_relax_domain_level = -1;
   int sched_domain_level_max;
   
@@@ -5921,154 -5938,97 +5921,154 @@@ static void claim_allocations(int cpu, 
                 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
   }
   
- -#ifdef CONFIG_SCHED_SMT
- -static const struct cpumask *cpu_smt_mask(int cpu)
- -{
- -      return topology_thread_cpumask(cpu);
- -}
- -#endif
- -
- -/*
- - * Topology list, bottom-up.
- - */
- -static struct sched_domain_topology_level default_topology[] = {
- -#ifdef CONFIG_SCHED_SMT
- -      { sd_init_SIBLING, cpu_smt_mask, },
- -#endif
- -#ifdef CONFIG_SCHED_MC
- -      { sd_init_MC, cpu_coregroup_mask, },
- -#endif
- -#ifdef CONFIG_SCHED_BOOK
- -      { sd_init_BOOK, cpu_book_mask, },
- -#endif
- -      { sd_init_CPU, cpu_cpu_mask, },
- -      { NULL, },
- -};
- -
- -static struct sched_domain_topology_level *sched_domain_topology = default_topology;
- -
- -#define for_each_sd_topology(tl)                      \
- -      for (tl = sched_domain_topology; tl->init; tl++)
- -
   #ifdef CONFIG_NUMA
- -
   static int sched_domains_numa_levels;
   static int *sched_domains_numa_distance;
   static struct cpumask ***sched_domains_numa_masks;
   static int sched_domains_curr_level;
+ +#endif
   
- -static inline int sd_local_flags(int level)
- -{
- -      if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
- -              return 0;
- -
- -      return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
- -}
+ +/*
+ + * SD_flags allowed in topology descriptions.
+ + *
+ + * SD_SHARE_CPUPOWER      - describes SMT topologies
+ + * SD_SHARE_PKG_RESOURCES - describes shared caches
+ + * SD_NUMA                - describes NUMA topologies
+ + * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ + *
+ + * Odd one out:
+ + * SD_ASYM_PACKING        - describes SMT quirks
+ + */
+ +#define TOPOLOGY_SD_FLAGS             \
+ +      (SD_SHARE_CPUPOWER |            \
+ +       SD_SHARE_PKG_RESOURCES |       \
+ +       SD_NUMA |                      \
+ +       SD_ASYM_PACKING |              \
+ +       SD_SHARE_POWERDOMAIN)
   
   static struct sched_domain *
- -sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+ +sd_init(struct sched_domain_topology_level *tl, int cpu)
   {
         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- -      int level = tl->numa_level;
- -      int sd_weight = cpumask_weight(
- -                      sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+ +      int sd_weight, sd_flags = 0;
+ +
+ +#ifdef CONFIG_NUMA
+ +      /*
+ +       * Ugly hack to pass state to sd_numa_mask()...
+ +       */
+ +      sched_domains_curr_level = tl->numa_level;
+ +#endif
+ +
+ +      sd_weight = cpumask_weight(tl->mask(cpu));
+ +
+ +      if (tl->sd_flags)
+ +              sd_flags = (*tl->sd_flags)();
+ +      if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ +                      "wrong sd_flags in topology description\n"))
+ +              sd_flags &= ~TOPOLOGY_SD_FLAGS;
   
         *sd = (struct sched_domain){
                 .min_interval           = sd_weight,
                 .max_interval           = 2*sd_weight,
                 .busy_factor            = 32,
                 .imbalance_pct          = 125,
- -              .cache_nice_tries       = 2,
- -              .busy_idx               = 3,
- -              .idle_idx               = 2,
+ +
+ +              .cache_nice_tries       = 0,
+ +              .busy_idx               = 0,
+ +              .idle_idx               = 0,
                 .newidle_idx            = 0,
                 .wake_idx               = 0,
                 .forkexec_idx           = 0,
   
                 .flags                  = 1*SD_LOAD_BALANCE
                                         | 1*SD_BALANCE_NEWIDLE
- -                                      | 0*SD_BALANCE_EXEC
- -                                      | 0*SD_BALANCE_FORK
+ +                                      | 1*SD_BALANCE_EXEC
+ +                                      | 1*SD_BALANCE_FORK
                                         | 0*SD_BALANCE_WAKE
- -                                      | 0*SD_WAKE_AFFINE
+ +                                      | 1*SD_WAKE_AFFINE
                                         | 0*SD_SHARE_CPUPOWER
                                         | 0*SD_SHARE_PKG_RESOURCES
- -                                      | 1*SD_SERIALIZE
+ +                                      | 0*SD_SERIALIZE
                                         | 0*SD_PREFER_SIBLING
- -                                      | 1*SD_NUMA
- -                                      | sd_local_flags(level)
+ +                                      | 0*SD_NUMA
+ +                                      | sd_flags
                                         ,
+ +
                 .last_balance           = jiffies,
                 .balance_interval       = sd_weight,
+ +              .smt_gain               = 0,
+ +              .max_newidle_lb_cost    = 0,
+ +              .next_decay_max_lb_cost = jiffies,
+ +#ifdef CONFIG_SCHED_DEBUG
+ +              .name                   = tl->name,
+ +#endif
         };
- -      SD_INIT_NAME(sd, NUMA);
- -      sd->private = &tl->data;
   
         /*
- -       * Ugly hack to pass state to sd_numa_mask()...
+ +       * Convert topological properties into behaviour.
          */
- -      sched_domains_curr_level = tl->numa_level;
+ +
+ +      if (sd->flags & SD_SHARE_CPUPOWER) {
+ +              sd->imbalance_pct = 110;
+ +              sd->smt_gain = 1178; /* ~15% */
+ +
+ +      } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ +              sd->imbalance_pct = 117;
+ +              sd->cache_nice_tries = 1;
+ +              sd->busy_idx = 2;
+ +
+ +#ifdef CONFIG_NUMA
+ +      } else if (sd->flags & SD_NUMA) {
+ +              sd->cache_nice_tries = 2;
+ +              sd->busy_idx = 3;
+ +              sd->idle_idx = 2;
+ +
+ +              sd->flags |= SD_SERIALIZE;
+ +              if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ +                      sd->flags &= ~(SD_BALANCE_EXEC |
+ +                                     SD_BALANCE_FORK |
+ +                                     SD_WAKE_AFFINE);
+ +              }
+ +
+ +#endif
+ +      } else {
+ +              sd->flags |= SD_PREFER_SIBLING;
+ +              sd->cache_nice_tries = 1;
+ +              sd->busy_idx = 2;
+ +              sd->idle_idx = 1;
+ +      }
+ +
+ +      sd->private = &tl->data;
   
         return sd;
   }
   
+ +/*
+ + * Topology list, bottom-up.
+ + */
+ +static struct sched_domain_topology_level default_topology[] = {
+ +#ifdef CONFIG_SCHED_SMT
+ +      { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ +#endif
+ +#ifdef CONFIG_SCHED_MC
+ +      { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ +#endif
+ +      { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ +      { NULL, },
+ +};
+ +
+ +struct sched_domain_topology_level *sched_domain_topology = default_topology;
+ +
+ +#define for_each_sd_topology(tl)                      \
+ +      for (tl = sched_domain_topology; tl->mask; tl++)
+ +
+ +void set_sched_topology(struct sched_domain_topology_level *tl)
+ +{
+ +      sched_domain_topology = tl;
+ +}
+ +
+ +#ifdef CONFIG_NUMA
+ +
   static const struct cpumask *sd_numa_mask(int cpu)
   {
         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@@ -6212,10 -6172,7 +6212,10 @@@ static void sched_init_numa(void
                 }
         }
   
- -      tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ +      /* Compute default topology size */
+ +      for (i = 0; sched_domain_topology[i].mask; i++);
+ +
+ +      tl = kzalloc((i + level) *
                         sizeof(struct sched_domain_topology_level), GFP_KERNEL);
         if (!tl)
                 return;
@@@ -6223,19 -6180,18 +6223,19 @@@
         /*
          * Copy the default topology bits..
          */
- -      for (i = 0; default_topology[i].init; i++)
- -              tl[i] = default_topology[i];
+ +      for (i = 0; sched_domain_topology[i].mask; i++)
+ +              tl[i] = sched_domain_topology[i];
   
         /*
          * .. and append 'j' levels of NUMA goodness.
          */
         for (j = 0; j < level; i++, j++) {
                 tl[i] = (struct sched_domain_topology_level){
- -                      .init = sd_numa_init,
                         .mask = sd_numa_mask,
+ +                      .sd_flags = cpu_numa_flags,
                         .flags = SDTL_OVERLAP,
                         .numa_level = j,
+ +                      SD_INIT_NAME(NUMA)
                 };
         }
   
@@@ -6393,7 -6349,7 +6393,7 @@@ struct sched_domain *build_sched_domain
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
   {
- -      struct sched_domain *sd = tl->init(tl, cpu);
+ +      struct sched_domain *sd = sd_init(tl, cpu);
         if (!sd)
                 return child;
   
diff --combined kernel/workqueue.c

index c30c01b32ecea6d9fea573551bb1b71fb6e543d2,8edc87185427cb17fa02ed93498fcf6f8301cb7e..a4bab46cd38e1ee177fe30bcf51d9d5861cac3ed
--- 1/kernel/workqueue.c
--- 2/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@@ -100,10 -100,10 +100,10 @@@ enum 
   
         /*
          * Rescue workers are used only on emergencies and shared by
- -       * all cpus.  Give -20.
+ +       * all cpus.  Give MIN_NICE.
          */
- -      RESCUER_NICE_LEVEL      = -20,
- -      HIGHPRI_NICE_LEVEL      = -20,
+ +      RESCUER_NICE_LEVEL      = MIN_NICE,
+ +      HIGHPRI_NICE_LEVEL      = MIN_NICE,
   
         WQ_NAME_LEN             = 24,
   };
@@@ -1916,6 -1916,12 +1916,12 @@@ static void send_mayday(struct work_str
   
         /* mayday mayday mayday */
         if (list_empty(&pwq->mayday_node)) {
+               /*
+                * If @pwq is for an unbound wq, its base ref may be put at
+                * any time due to an attribute change.  Pin @pwq until the
+                * rescuer is done with it.
+                */
+               get_pwq(pwq);
                 list_add_tail(&pwq->mayday_node, &wq->maydays);
                 wake_up_process(wq->rescuer->task);
         }
@@@ -2398,6 -2404,7 +2404,7 @@@ static int rescuer_thread(void *__rescu
         struct worker *rescuer = __rescuer;
         struct workqueue_struct *wq = rescuer->rescue_wq;
         struct list_head *scheduled = &rescuer->scheduled;
+       bool should_stop;
   
         set_user_nice(current, RESCUER_NICE_LEVEL);
   
@@@ -2409,11 -2416,15 +2416,15 @@@
   repeat:
         set_current_state(TASK_INTERRUPTIBLE);
   
-       if (kthread_should_stop()) {
-               __set_current_state(TASK_RUNNING);
-               rescuer->task->flags &= ~PF_WQ_WORKER;
-               return 0;
-       }
+       /*
+        * By the time the rescuer is requested to stop, the workqueue
+        * shouldn't have any work pending, but @wq->maydays may still have
+        * pwq(s) queued.  This can happen by non-rescuer workers consuming
+        * all the work items before the rescuer got to them.  Go through
+        * @wq->maydays processing before acting on should_stop so that the
+        * list is always empty on exit.
+        */
+       should_stop = kthread_should_stop();
   
         /* see whether any pwq is asking for help */
         spin_lock_irq(&wq_mayday_lock);
@@@ -2444,6 -2455,12 +2455,12 @@@
   
                 process_scheduled_works(rescuer);
   
+               /*
+                * Put the reference grabbed by send_mayday().  @pool won't
+                * go away while we're holding its lock.
+                */
+               put_pwq(pwq);
+ 
                 /*
                  * Leave this pool.  If keep_working() is %true, notify a
                  * regular worker; otherwise, we end up with 0 concurrency
@@@ -2459,6 -2476,12 +2476,12 @@@
   
         spin_unlock_irq(&wq_mayday_lock);
   
+       if (should_stop) {
+               __set_current_state(TASK_RUNNING);
+               rescuer->task->flags &= ~PF_WQ_WORKER;
+               return 0;
+       }
+ 
         /* rescuers should never participate in concurrency management */
         WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
         schedule();
@@@ -4100,7 -4123,8 +4123,8 @@@ static void wq_update_unbound_numa(stru
         if (!pwq) {
                 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
                            wq->name);
-               goto out_unlock;
+               mutex_lock(&wq->mutex);
+               goto use_dfl_pwq;
         }
   
         /*
diff --combined mm/memory.c

index 9c2dc659f6f6d593ce8db2276ec0c6160ac376d5,037b812a953141f3dc77b1f7402b29bb54cd9e44..e302ae1dcce05bb50be6c40fa836990bdd9ac522
--- 1/mm/memory.c
--- 2/mm/memory.c
+++ b/mm/memory.c
@@@ -232,17 -232,18 +232,18 @@@ void tlb_gather_mmu(struct mmu_gather *
   #endif
   }
   
- void tlb_flush_mmu(struct mmu_gather *tlb)
+ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
   {
-       struct mmu_gather_batch *batch;
- 
-       if (!tlb->need_flush)
-               return;
         tlb->need_flush = 0;
         tlb_flush(tlb);
   #ifdef CONFIG_HAVE_RCU_TABLE_FREE
         tlb_table_flush(tlb);
   #endif
+ }
+ 
+ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+ {
+       struct mmu_gather_batch *batch;
   
         for (batch = &tlb->local; batch; batch = batch->next) {
                 free_pages_and_swap_cache(batch->pages, batch->nr);
@@@ -251,6 -252,14 +252,14 @@@
         tlb->active = &tlb->local;
   }
   
+ void tlb_flush_mmu(struct mmu_gather *tlb)
+ {
+       if (!tlb->need_flush)
+               return;
+       tlb_flush_mmu_tlbonly(tlb);
+       tlb_flush_mmu_free(tlb);
+ }
+ 
   /* tlb_finish_mmu
    *    Called at the end of the shootdown operation to free up any resources
    *    that were required.
@@@ -1127,8 -1136,10 +1136,10 @@@ again
                         if (PageAnon(page))
                                 rss[MM_ANONPAGES]--;
                         else {
-                               if (pte_dirty(ptent))
+                               if (pte_dirty(ptent)) {
+                                       force_flush = 1;
                                         set_page_dirty(page);
+                               }
                                 if (pte_young(ptent) &&
                                     likely(!(vma->vm_flags & VM_SEQ_READ)))
                                         mark_page_accessed(page);
@@@ -1137,9 -1148,10 +1148,10 @@@
                         page_remove_rmap(page);
                         if (unlikely(page_mapcount(page) < 0))
                                 print_bad_pte(vma, addr, ptent, page);
-                       force_flush = !__tlb_remove_page(tlb, page);
-                       if (force_flush)
+                       if (unlikely(!__tlb_remove_page(tlb, page))) {
+                               force_flush = 1;
                                 break;
+                       }
                         continue;
                 }
                 /*
@@@ -1174,18 -1186,11 +1186,11 @@@
   
         add_mm_rss_vec(mm, rss);
         arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(start_pte, ptl);
   
-       /*
-        * mmu_gather ran out of room to batch pages, we break out of
-        * the PTE lock to avoid doing the potential expensive TLB invalidate
-        * and page-free while holding it.
-        */
+       /* Do the actual TLB flush before dropping ptl */
         if (force_flush) {
                 unsigned long old_end;
   
-               force_flush = 0;
- 
                 /*
                  * Flush the TLB just for the previous segment,
                  * then update the range to be the remaining
@@@ -1193,11 -1198,21 +1198,21 @@@
                  */
                 old_end = tlb->end;
                 tlb->end = addr;
- 
-               tlb_flush_mmu(tlb);
- 
+               tlb_flush_mmu_tlbonly(tlb);
                 tlb->start = addr;
                 tlb->end = old_end;
+       }
+       pte_unmap_unlock(start_pte, ptl);
+ 
+       /*
+        * If we forced a TLB flush (either due to running out of
+        * batch buffers or because we needed to flush dirty TLB
+        * entries before releasing the ptl), free the batched
+        * memory too. Restart if we didn't do everything.
+        */
+       if (force_flush) {
+               force_flush = 0;
+               tlb_flush_mmu_free(tlb);
   
                 if (addr != end)
                         goto again;
@@@ -1955,12 -1970,17 +1970,17 @@@ int fixup_user_fault(struct task_struc
                      unsigned long address, unsigned int fault_flags)
   {
         struct vm_area_struct *vma;
+       vm_flags_t vm_flags;
         int ret;
   
         vma = find_extend_vma(mm, address);
         if (!vma || address < vma->vm_start)
                 return -EFAULT;
   
+       vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+       if (!(vm_flags & vma->vm_flags))
+               return -EFAULT;
+ 
         ret = handle_mm_fault(mm, vma, address, fault_flags);
         if (ret & VM_FAULT_ERROR) {
                 if (ret & VM_FAULT_OOM)
@@@ -3900,6 -3920,9 +3920,6 @@@ static int __handle_mm_fault(struct mm_
                 }
         }
   
- -      /* THP should already have been handled */
- -      BUG_ON(pmd_numa(*pmd));
- -
         /*
          * Use __pte_alloc instead of pte_alloc_map, because we can't
          * run pte_offset_map on the pmd, if an huge pmd could
author	Ingo Molnar <[email protected]>
	Thu, 22 May 2014 08:28:56 +0000 (10:28 +0200)
committer	Ingo Molnar <[email protected]>
	Thu, 22 May 2014 08:28:56 +0000 (10:28 +0200)
		1	2
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/workqueue.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memory.c	patch \|	diff1 \|	diff2 \|	blob \| history