}
#endif /* CONFIG_SCHED_HRTICK */
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+#endif
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
if (test_tsk_need_resched(p))
return;
- set_tsk_need_resched(p);
-
cpu = task_cpu(p);
+
if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
set_preempt_need_resched();
return;
}
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
}
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
- asmlinkage void schedule_tail(struct task_struct *prev)
+ asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p && p != RETRY_TASK))
- return p;
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
again:
blk_schedule_flush_plug(tsk);
}
- asmlinkage void __sched schedule(void)
+ asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_CONTEXT_TRACKING
- asmlinkage void __sched schedule_user(void)
+ asmlinkage __visible void __sched schedule_user(void)
{
/*
* If we come here after a random call to set_need_resched(),
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
- asmlinkage void __sched notrace preempt_schedule(void)
+ asmlinkage __visible void __sched notrace preempt_schedule(void)
{
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
*/
- asmlinkage void __sched preempt_schedule_irq(void)
+ asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
static void __setscheduler_params(struct task_struct *p,
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING);
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
__setup("isolcpus=", isolated_cpu_setup);
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
- return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
-};
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
sa_none,
};
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP 0x01
-
-struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- int numa_level;
- struct sd_data data;
-};
-
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
}
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
-}
-
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type) sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type) do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type) \
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
- return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
-#endif
- { sd_init_CPU, cpu_cpu_mask, },
- { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->init; tl++)
-
#ifdef CONFIG_NUMA
-
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
+#endif
-static inline int sd_local_flags(int level)
-{
- if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
- return 0;
-
- return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUPOWER | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int level = tl->numa_level;
- int sd_weight = cpumask_weight(
- sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
- .cache_nice_tries = 2,
- .busy_idx = 3,
- .idle_idx = 2,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
- | 0*SD_BALANCE_EXEC
- | 0*SD_BALANCE_FORK
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
- | 0*SD_WAKE_AFFINE
+ | 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
- | 1*SD_SERIALIZE
+ | 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
- | sd_local_flags(level)
+ | 0*SD_NUMA
+ | sd_flags
,
+
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
};
- SD_INIT_NAME(sd, NUMA);
- sd->private = &tl->data;
/*
- * Ugly hack to pass state to sd_numa_mask()...
+ * Convert topological properties into behaviour.
*/
- sched_domains_curr_level = tl->numa_level;
+
+ if (sd->flags & SD_SHARE_CPUPOWER) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
return sd;
}
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}
}
- tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
/*
* Copy the default topology bits..
*/
- for (i = 0; default_topology[i].init; i++)
- tl[i] = default_topology[i];
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
- .init = sd_numa_init,
.mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
+ SD_INIT_NAME(NUMA)
};
}
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
/*
* Rescue workers are used only on emergencies and shared by
- * all cpus. Give -20.
+ * all cpus. Give MIN_NICE.
*/
- RESCUER_NICE_LEVEL = -20,
- HIGHPRI_NICE_LEVEL = -20,
+ RESCUER_NICE_LEVEL = MIN_NICE,
+ HIGHPRI_NICE_LEVEL = MIN_NICE,
WQ_NAME_LEN = 24,
};
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
process_scheduled_works(rescuer);
+ /*
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're holding its lock.
+ */
+ put_pwq(pwq);
+
/*
* Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
spin_unlock_irq(&wq_mayday_lock);
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
if (!pwq) {
pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- goto out_unlock;
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
}
/*
#endif
}
- void tlb_flush_mmu(struct mmu_gather *tlb)
+ static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
- struct mmu_gather_batch *batch;
-
- if (!tlb->need_flush)
- return;
tlb->need_flush = 0;
tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
+ }
+
+ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+ {
+ struct mmu_gather_batch *batch;
for (batch = &tlb->local; batch; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
tlb->active = &tlb->local;
}
+ void tlb_flush_mmu(struct mmu_gather *tlb)
+ {
+ if (!tlb->need_flush)
+ return;
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+ }
+
/* tlb_finish_mmu
* Called at the end of the shootdown operation to free up any resources
* that were required.
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
- if (pte_dirty(ptent))
+ if (pte_dirty(ptent)) {
+ force_flush = 1;
set_page_dirty(page);
+ }
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush)
+ if (unlikely(!__tlb_remove_page(tlb, page))) {
+ force_flush = 1;
break;
+ }
continue;
}
/*
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- /*
- * mmu_gather ran out of room to batch pages, we break out of
- * the PTE lock to avoid doing the potential expensive TLB invalidate
- * and page-free while holding it.
- */
+ /* Do the actual TLB flush before dropping ptl */
if (force_flush) {
unsigned long old_end;
- force_flush = 0;
-
/*
* Flush the TLB just for the previous segment,
* then update the range to be the remaining
*/
old_end = tlb->end;
tlb->end = addr;
-
- tlb_flush_mmu(tlb);
-
+ tlb_flush_mmu_tlbonly(tlb);
tlb->start = addr;
tlb->end = old_end;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+
+ /*
+ * If we forced a TLB flush (either due to running out of
+ * batch buffers or because we needed to flush dirty TLB
+ * entries before releasing the ptl), free the batched
+ * memory too. Restart if we didn't do everything.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu_free(tlb);
if (addr != end)
goto again;
unsigned long address, unsigned int fault_flags)
{
struct vm_area_struct *vma;
+ vm_flags_t vm_flags;
int ret;
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
+ vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+ if (!(vm_flags & vma->vm_flags))
+ return -EFAULT;
+
ret = handle_mm_fault(mm, vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
}
}
- /* THP should already have been handled */
- BUG_ON(pmd_numa(*pmd));
-
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could