Merge branch 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author Linus Torvalds <[email protected]>
Thu, 5 Nov 2015 22:51:32 +0000 (14:51 -0800)
committer Linus Torvalds <[email protected]>
Thu, 5 Nov 2015 22:51:32 +0000 (14:51 -0800)
Pull cgroup updates from Tejun Heo:
 "The cgroup core saw several significant updates this cycle:

   - percpu_rwsem for threadgroup locking is reinstated.  This was
     temporarily dropped due to down_write latency issues.  Oleg's
     rework of percpu_rwsem which is scheduled to be merged in this
     merge window resolves the issue.

   - On the v2 hierarchy, when controllers are enabled and disabled, all
     operations are atomic and can fail and revert cleanly.  This allows
     ->can_attach() failure which is necessary for cpu RT slices.

   - Tasks now stay associated with the original cgroups after exit
     until released.  This allows tracking resources held by zombies
     (e.g.  pids) and makes it easy to find out where zombies came from
     on the v2 hierarchy.  The pids controller was broken before these
     changes as zombies escaped the limits; unfortunately, updating this
     behavior required too many invasive changes and I don't think it's
     a good idea to backport them, so the pids controller on 4.3, the
     first version which included the pids controller, will stay broken
     at least until I'm sure about the cgroup core changes.

   - Optimization of a couple common tests using static_key"

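As a concrete illustration of the second point above (atomic controller
enable/disable on the v2 hierarchy), here is a minimal userspace sketch, not
part of this series; the mount point, cgroup name and controller set are
assumptions:

/*
 * Hypothetical example: ask for pids and memory to be enabled for the
 * children of "parent" on a v2 hierarchy assumed at /sys/fs/cgroup.  Per the
 * pull request text above, if the operation cannot complete (for instance a
 * controller's ->can_attach() rejects the implied migration), the write
 * returns an error and the previous state is restored.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/parent/cgroup.subtree_control";
	const char *req = "+pids +memory";
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, req, strlen(req)) < 0)
		perror("subtree_control");	/* nothing is left half-enabled */
	close(fd);
	return 0;
}
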
* 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (38 commits)
  cgroup: fix race condition around termination check in css_task_iter_next()
  blkcg: don't create "io.stat" on the root cgroup
  cgroup: drop cgroup__DEVEL__legacy_files_on_dfl
  cgroup: replace error handling in cgroup_init() with WARN_ON()s
  cgroup: add cgroup_subsys->free() method and use it to fix pids controller
  cgroup: keep zombies associated with their original cgroups
  cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock
  cgroup: don't hold css_set_rwsem across css task iteration
  cgroup: reorganize css_task_iter functions
  cgroup: factor out css_set_move_task()
  cgroup: keep css_set and task lists in chronological order
  cgroup: make cgroup_destroy_locked() test cgroup_is_populated()
  cgroup: make css_sets pin the associated cgroups
  cgroup: relocate cgroup_[try]get/put()
  cgroup: move check_for_release() invocation
  cgroup: replace cgroup_has_tasks() with cgroup_is_populated()
  cgroup: make cgroup->nr_populated count the number of populated css_sets
  cgroup: remove an unused parameter from cgroup_task_migrate()
  cgroup: fix too early usage of static_branch_disable()
  cgroup: make cgroup_update_dfl_csses() migrate all target processes atomically
  ...

block/blk-cgroup.c
include/linux/backing-dev.h
include/linux/init_task.h
include/linux/jump_label.h
include/linux/memcontrol.h
include/linux/sched.h
kernel/events/core.c
kernel/fork.c
kernel/sched/core.c
mm/memcontrol.c
mm/vmscan.c

diff --combined block/blk-cgroup.c
index 55512dd626336eae49b758def08d601bc3515b74,4fa54161d423097da1441a41004ce416bf547280..5bcdfc10c23a6340367c4b9781496a49b5c81efe
@@@ -370,9 -370,6 +370,9 @@@ static void blkg_destroy_all(struct req
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }
 +
 +      q->root_blkg = NULL;
 +      q->root_rl.blkg = NULL;
  }
  
  /*
@@@ -899,6 -896,7 +899,7 @@@ static int blkcg_print_stat(struct seq_
  struct cftype blkcg_files[] = {
        {
                .name = "stat",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = blkcg_print_stat,
        },
        { }     /* terminate */
diff --combined include/linux/backing-dev.h
index c85f74946a8bab65ff3f16cddea6a4446b0a4799,08d9a8eac42c35fb103747ff105091f4af0f27bd..c82794f20110420582d496ae478bc600f9400233
  #include <linux/sched.h>
  #include <linux/blkdev.h>
  #include <linux/writeback.h>
- #include <linux/memcontrol.h>
  #include <linux/blk-cgroup.h>
  #include <linux/backing-dev-defs.h>
  #include <linux/slab.h>
  
  int __must_check bdi_init(struct backing_dev_info *bdi);
 -void bdi_destroy(struct backing_dev_info *bdi);
 +void bdi_exit(struct backing_dev_info *bdi);
  
  __printf(3, 4)
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 +void bdi_unregister(struct backing_dev_info *bdi);
 +
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 +void bdi_destroy(struct backing_dev_info *bdi);
 +
  void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
                        bool range_cyclic, enum wb_reason reason);
  void wb_start_background_writeback(struct bdi_writeback *wb);
@@@ -267,8 -262,8 +266,8 @@@ static inline bool inode_cgwb_enabled(s
  {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
  
-       return cgroup_on_dfl(mem_cgroup_root_css->cgroup) &&
-               cgroup_on_dfl(blkcg_root_css->cgroup) &&
+       return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+               cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                bdi_cap_account_dirty(bdi) &&
                (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
@@@ -412,6 -407,61 +411,6 @@@ static inline void unlocked_inode_to_wb
        rcu_read_unlock();
  }
  
 -struct wb_iter {
 -      int                     start_memcg_id;
 -      struct radix_tree_iter  tree_iter;
 -      void                    **slot;
 -};
 -
 -static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 -                                                 struct backing_dev_info *bdi)
 -{
 -      struct radix_tree_iter *titer = &iter->tree_iter;
 -
 -      WARN_ON_ONCE(!rcu_read_lock_held());
 -
 -      if (iter->start_memcg_id >= 0) {
 -              iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
 -              iter->start_memcg_id = -1;
 -      } else {
 -              iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
 -      }
 -
 -      if (!iter->slot)
 -              iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
 -      if (iter->slot)
 -              return *iter->slot;
 -      return NULL;
 -}
 -
 -static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
 -                                                 struct backing_dev_info *bdi,
 -                                                 int start_memcg_id)
 -{
 -      iter->start_memcg_id = start_memcg_id;
 -
 -      if (start_memcg_id)
 -              return __wb_iter_next(iter, bdi);
 -      else
 -              return &bdi->wb;
 -}
 -
 -/**
 - * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
 - * @wb_cur: cursor struct bdi_writeback pointer
 - * @bdi: bdi to walk wb's of
 - * @iter: pointer to struct wb_iter to be used as iteration buffer
 - * @start_memcg_id: memcg ID to start iteration from
 - *
 - * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
 - * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
 - * to be used as temp storage during iteration.  rcu_read_lock() must be
 - * held throughout iteration.
 - */
 -#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)            \
 -      for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
 -           (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 -
  #else /* CONFIG_CGROUP_WRITEBACK */
  
  static inline bool inode_cgwb_enabled(struct inode *inode)
@@@ -471,6 -521,14 +470,6 @@@ static inline void wb_blkcg_offline(str
  {
  }
  
 -struct wb_iter {
 -      int             next_id;
 -};
 -
 -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
 -      for ((iter)->next_id = (start_blkcg_id);                        \
 -           ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
 -
  static inline int inode_congested(struct inode *inode, int cong_bits)
  {
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
diff --combined include/linux/init_task.h
index 810a34f604247065d78c2820ed34b0bf8d3aabf6,d0b380ee7d67abbd421bf69fdd63ff10b2aa88b1..1c1ff7e4faa4bf158166b789605107f6a65baf44
  extern struct files_struct init_files;
  extern struct fs_struct init_fs;
  
- #ifdef CONFIG_CGROUPS
- #define INIT_GROUP_RWSEM(sig)                                         \
-       .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
- #else
- #define INIT_GROUP_RWSEM(sig)
- #endif
  #ifdef CONFIG_CPUSETS
  #define INIT_CPUSET_SEQ(tsk)                                                  \
        .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
        .rlim           = INIT_RLIMITS,                                 \
        .cputimer       = {                                             \
                .cputime_atomic = INIT_CPUTIME_ATOMIC,                  \
 -              .running        = 0,                                    \
 +              .running        = false,                                \
 +              .checking_timer = false,                                \
        },                                                              \
        INIT_PREV_CPUTIME(sig)                                          \
        .cred_guard_mutex =                                             \
                 __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
-       INIT_GROUP_RWSEM(sig)                                           \
  }
  
  extern struct nsproxy init_nsproxy;
diff --combined include/linux/jump_label.h
index f1094238ab2a0f0fddeb40e3c7aadde7c2a89015,c9ca050de8466afdd28189f42598620912c4b53d..8dde55974f186bca7c1488866aaacdd805c347b3
@@@ -21,8 -21,8 +21,8 @@@
   *
   * DEFINE_STATIC_KEY_TRUE(key);
   * DEFINE_STATIC_KEY_FALSE(key);
 - * static_key_likely()
 - * statick_key_unlikely()
 + * static_branch_likely()
 + * static_branch_unlikely()
   *
   * Jump labels provide an interface to generate dynamic branches using
   * self-modifying code. Assuming toolchain and architecture support, if we
   * statement, setting the key to true requires us to patch in a jump
   * to the out-of-line of true branch.
   *
 - * In addtion to static_branch_{enable,disable}, we can also reference count
 + * In addition to static_branch_{enable,disable}, we can also reference count
   * the key or branch direction via static_branch_{inc,dec}. Thus,
   * static_branch_inc() can be thought of as a 'make more true' and
 - * static_branch_dec() as a 'make more false'. The inc()/dec()
 - * interface is meant to be used exclusively from the inc()/dec() for a given
 - * key.
 + * static_branch_dec() as a 'make more false'.
   *
   * Since this relies on modifying code, the branch modifying functions
   * must be considered absolute slow paths (machine wide synchronization etc.).
@@@ -214,11 -216,6 +214,6 @@@ static inline int jump_label_apply_nops
  #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
  #define jump_label_enabled static_key_enabled
  
- static inline bool static_key_enabled(struct static_key *key)
- {
-       return static_key_count(key) > 0;
- }
  static inline void static_key_enable(struct static_key *key)
  {
        int count = static_key_count(key);
@@@ -265,6 -262,17 +260,17 @@@ struct static_key_false 
  #define DEFINE_STATIC_KEY_FALSE(name) \
        struct static_key_false name = STATIC_KEY_FALSE_INIT
  
+ extern bool ____wrong_branch_error(void);
+ #define static_key_enabled(x)                                                 \
+ ({                                                                            \
+       if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&     \
+           !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
+           !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+               ____wrong_branch_error();                                       \
+       static_key_count((struct static_key *)x) > 0;                           \
+ })
  #ifdef HAVE_JUMP_LABEL
  
  /*
   * See jump_label_type() / jump_label_init_type().
   */
  
- extern bool ____wrong_branch_error(void);
  #define static_branch_likely(x)                                                       \
  ({                                                                            \
        bool branch;                                                            \
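
The jump_label.h hunks above update the documentation and helpers around the
static-branch API (DEFINE_STATIC_KEY_TRUE/FALSE, static_branch_likely/
unlikely, static_branch_{enable,disable,inc,dec}) that several changes in
this merge rely on.  A rough usage sketch follows; the key, counter and
function names are made up for illustration and are not from this diff:

#include <linux/jump_label.h>
#include <linux/types.h>

/* Branch defaults to false; readers compile to a patched NOP until enabled. */
static DEFINE_STATIC_KEY_FALSE(my_feature_key);
static unsigned long my_feature_hits;

static inline void my_hot_path(void)
{
	/* out-of-line branch, only taken once the key has been enabled */
	if (static_branch_unlikely(&my_feature_key))
		my_feature_hits++;
}

static void my_feature_set(bool on)
{
	/* slow path: rewrites the branch sites (machine-wide synchronization) */
	if (on)
		static_branch_enable(&my_feature_key);
	else
		static_branch_disable(&my_feature_key);
}
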
diff --combined include/linux/memcontrol.h
index 3e3318ddfc0e3e09a0e15825f78eb6052d628d78,c83c699a6605b982ca8fbec56f9510d44e2c9cbd..27251ed428f7db8adaf54c58b7f9e41deda9048d
@@@ -213,6 -213,9 +213,9 @@@ struct mem_cgroup 
        /* OOM-Killer disable */
        int             oom_kill_disable;
  
+       /* handle for "memory.events" */
+       struct cgroup_file events_file;
        /* protect arrays of thresholds */
        struct mutex thresholds_lock;
  
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
 -      spinlock_t pcp_counter_lock;
  
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        struct cg_proto tcp_mem;
@@@ -285,6 -289,7 +288,7 @@@ static inline void mem_cgroup_events(st
                       unsigned int nr)
  {
        this_cpu_add(memcg->stat->events[idx], nr);
+       cgroup_file_notify(&memcg->events_file);
  }
  
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@@ -346,9 -351,7 +350,7 @@@ ino_t page_cgroup_ino(struct page *page
  
  static inline bool mem_cgroup_disabled(void)
  {
-       if (memory_cgrp_subsys.disabled)
-               return true;
-       return false;
+       return !cgroup_subsys_enabled(memory_cgrp_subsys);
  }
  
  /*
@@@ -676,9 -679,8 +678,9 @@@ enum 
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback);
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback);
  
  #else /* CONFIG_CGROUP_WRITEBACK */
  
@@@ -688,8 -690,7 +690,8 @@@ static inline struct wb_domain *mem_cgr
  }
  
  static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 -                                     unsigned long *pavail,
 +                                     unsigned long *pfilepages,
 +                                     unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
  {
diff --combined include/linux/sched.h
index c115d617739d8e6f1f388c13069ad32cfca1bb3b,a4ab9daa387c0bbcaca1923620ceb2ed74bfd84e..4effb1025fbb1555bc9c3ce6f80d98db004271a2
@@@ -599,42 -599,33 +599,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
 -#ifdef CONFIG_PREEMPT_COUNT
 -#define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
 -#else
 -#define PREEMPT_DISABLED      PREEMPT_ENABLED
 -#endif
 +#define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 +
 +/*
 + * Disable preemption until the scheduler is running -- use an unconditional
 + * value so that it also works on !PREEMPT_COUNT kernels.
 + *
 + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 + */
 +#define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
  
  /*
 - * Disable preemption until the scheduler is running.
 - * Reset by start_kernel()->sched_init()->init_idle().
 + * Initial preempt_count value; reflects the preempt_count schedule invariant
 + * which states that during context switches:
   *
 - * We include PREEMPT_ACTIVE to avoid cond_resched() from working
 - * before the scheduler is active -- see should_resched().
 + *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 + *
 + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 + * Note: See finish_task_switch().
   */
 -#define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 +#define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -771,18 -762,6 +771,6 @@@ struct signal_struct 
        unsigned audit_tty_log_passwd;
        struct tty_audit_buf *tty_audit_buf;
  #endif
- #ifdef CONFIG_CGROUPS
-       /*
-        * group_rwsem prevents new tasks from entering the threadgroup and
-        * member tasks from exiting, more specifically, setting of
-        * PF_EXITING.  fork and exit paths are protected with this rwsem
-        * using threadgroup_change_begin/end().  Users which require
-        * threadgroup to remain stable should use threadgroup_[un]lock()
-        * which also takes care of exec path.  Currently, cgroup is the
-        * only user.
-        */
-       struct rw_semaphore group_rwsem;
- #endif
  
        oom_flags_t oom_flags;
        short oom_score_adj;            /* OOM kill score adjustment */
@@@ -849,7 -828,7 +837,7 @@@ struct user_struct 
        struct hlist_node uidhash_node;
        kuid_t uid;
  
 -#ifdef CONFIG_PERF_EVENTS
 +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
        atomic_long_t locked_vm;
  #endif
  };
@@@ -1148,6 -1127,8 +1136,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
 -extern struct sched_domain_topology_level *sched_domain_topology;
 -
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1196,10 -1177,10 +1184,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
 - * 1) load_avg factors the amount of time that a sched_entity is
 - * runnable on a rq into its weight. For cfs_rq, it is the aggregated
 - * such weights of all runnable and blocked sched_entities.
 - * 2) util_avg factors frequency scaling into the amount of time
 + * 1) load_avg factors frequency scaling into the amount of time that a
 + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
 + * aggregated such weights of all runnable and blocked sched_entities.
 + * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1349,12 -1330,10 +1337,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
  
diff --combined kernel/events/core.c
index 39db20c6248e47c940bd8721c41ade530e1eb5c9,e9874949c78734d2c662845aedb69d79ccd5b839..1a734e0adfa78259dac6cae57239a2d5f992a9ac
@@@ -196,7 -196,7 +196,7 @@@ static int perf_sample_period_ns __read
  static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  
 -void update_perf_cpu_limits(void)
 +static void update_perf_cpu_limits(void)
  {
        u64 tmp = perf_sample_period_ns;
  
@@@ -472,7 -472,7 +472,7 @@@ perf_cgroup_set_timestamp(struct task_s
   * mode SWOUT : schedule out everything
   * mode SWIN : schedule in based on cgroup for next
   */
 -void perf_cgroup_switch(struct task_struct *task, int mode)
 +static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
@@@ -1243,7 -1243,11 +1243,7 @@@ static inline void perf_event__state_in
                                              PERF_EVENT_STATE_INACTIVE;
  }
  
 -/*
 - * Called at perf_event creation and when events are attached/detached from a
 - * group.
 - */
 -static void perf_event__read_size(struct perf_event *event)
 +static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
  {
        int entry = sizeof(u64); /* value */
        int size = 0;
                entry += sizeof(u64);
  
        if (event->attr.read_format & PERF_FORMAT_GROUP) {
 -              nr += event->group_leader->nr_siblings;
 +              nr += nr_siblings;
                size += sizeof(u64);
        }
  
        event->read_size = size;
  }
  
 -static void perf_event__header_size(struct perf_event *event)
 +static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
  {
        struct perf_sample_data *data;
 -      u64 sample_type = event->attr.sample_type;
        u16 size = 0;
  
 -      perf_event__read_size(event);
 -
        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);
  
        event->header_size = size;
  }
  
 +/*
 + * Called at perf_event creation and when events are attached/detached from a
 + * group.
 + */
 +static void perf_event__header_size(struct perf_event *event)
 +{
 +      __perf_event_read_size(event,
 +                             event->group_leader->nr_siblings);
 +      __perf_event_header_size(event, event->attr.sample_type);
 +}
 +
  static void perf_event__id_header_size(struct perf_event *event)
  {
        struct perf_sample_data *data;
        event->id_header_size = size;
  }
  
 +static bool perf_event_validate_size(struct perf_event *event)
 +{
 +      /*
 +       * The values computed here will be over-written when we actually
 +       * attach the event.
 +       */
 +      __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
 +      __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
 +      perf_event__id_header_size(event);
 +
 +      /*
 +       * Sum the lot; should not exceed the 64k limit we have on records.
 +       * Conservative limit to allow for callchains and other variable fields.
 +       */
 +      if (event->read_size + event->header_size +
 +          event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
 +              return false;
 +
 +      return true;
 +}
 +
  static void perf_group_attach(struct perf_event *event)
  {
        struct perf_event *group_leader = event->group_leader, *pos;
@@@ -1939,7 -1914,7 +1939,7 @@@ group_sched_in(struct perf_event *group
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
  
 -      pmu->start_txn(pmu);
 +      pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
@@@ -3209,22 -3184,14 +3209,22 @@@ void perf_event_exec(void
        rcu_read_unlock();
  }
  
 +struct perf_read_data {
 +      struct perf_event *event;
 +      bool group;
 +      int ret;
 +};
 +
  /*
   * Cross CPU call to read the hardware event
   */
  static void __perf_event_read(void *info)
  {
 -      struct perf_event *event = info;
 +      struct perf_read_data *data = info;
 +      struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct pmu *pmu = event->pmu;
  
        /*
         * If this is a task context, we need to check whether it is
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
 +
        update_event_times(event);
 -      if (event->state == PERF_EVENT_STATE_ACTIVE)
 -              event->pmu->read(event);
 +      if (event->state != PERF_EVENT_STATE_ACTIVE)
 +              goto unlock;
 +
 +      if (!data->group) {
 +              pmu->read(event);
 +              data->ret = 0;
 +              goto unlock;
 +      }
 +
 +      pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 +
 +      pmu->read(event);
 +
 +      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +              update_event_times(sub);
 +              if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 +                      /*
 +                       * Use sibling's PMU rather than @event's since
 +                       * sibling could be on different (eg: software) PMU.
 +                       */
 +                      sub->pmu->read(sub);
 +              }
 +      }
 +
 +      data->ret = pmu->commit_txn(pmu);
 +
 +unlock:
        raw_spin_unlock(&ctx->lock);
  }
  
@@@ -3334,23 -3275,15 +3334,23 @@@ u64 perf_event_read_local(struct perf_e
        return val;
  }
  
 -static u64 perf_event_read(struct perf_event *event)
 +static int perf_event_read(struct perf_event *event, bool group)
  {
 +      int ret = 0;
 +
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
 +              struct perf_read_data data = {
 +                      .event = event,
 +                      .group = group,
 +                      .ret = 0,
 +              };
                smp_call_function_single(event->oncpu,
 -                                       __perf_event_read, event, 1);
 +                                       __perf_event_read, &data, 1);
 +              ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
 -              update_event_times(event);
 +              if (group)
 +                      update_group_times(event);
 +              else
 +                      update_event_times(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
 -      return perf_event_count(event);
 +      return ret;
  }
  
  /*
@@@ -3814,7 -3744,7 +3814,7 @@@ static void put_event(struct perf_even
         *     see the comment there.
         *
         *  2) there is a lock-inversion with mmap_sem through
 -       *     perf_event_read_group(), which takes faults while
 +       *     perf_read_group(), which takes faults while
         *     holding ctx->mutex, however this is called after
         *     the last filedesc died, so there is no possibility
         *     to trigger the AB-BA case.
@@@ -3888,18 -3818,14 +3888,18 @@@ u64 perf_event_read_value(struct perf_e
        *running = 0;
  
        mutex_lock(&event->child_mutex);
 -      total += perf_event_read(event);
 +
 +      (void)perf_event_read(event, false);
 +      total += perf_event_count(event);
 +
        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);
  
        list_for_each_entry(child, &event->child_list, child_list) {
 -              total += perf_event_read(child);
 +              (void)perf_event_read(child, false);
 +              total += perf_event_count(child);
                *enabled += child->total_time_enabled;
                *running += child->total_time_running;
        }
  }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
 -static int perf_event_read_group(struct perf_event *event,
 -                                 u64 read_format, char __user *buf)
 +static int __perf_read_group_add(struct perf_event *leader,
 +                                      u64 read_format, u64 *values)
  {
 -      struct perf_event *leader = event->group_leader, *sub;
 -      struct perf_event_context *ctx = leader->ctx;
 -      int n = 0, size = 0, ret;
 -      u64 count, enabled, running;
 -      u64 values[5];
 +      struct perf_event *sub;
 +      int n = 1; /* skip @nr */
 +      int ret;
  
 -      lockdep_assert_held(&ctx->mutex);
 +      ret = perf_event_read(leader, true);
 +      if (ret)
 +              return ret;
  
 -      count = perf_event_read_value(leader, &enabled, &running);
 +      /*
 +       * Since we co-schedule groups, {enabled,running} times of siblings
 +       * will be identical to those of the leader, so we only publish one
 +       * set.
 +       */
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 +              values[n++] += leader->total_time_enabled +
 +                      atomic64_read(&leader->child_total_time_enabled);
 +      }
  
 -      values[n++] = 1 + leader->nr_siblings;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 -              values[n++] = enabled;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 -              values[n++] = running;
 -      values[n++] = count;
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 +              values[n++] += leader->total_time_running +
 +                      atomic64_read(&leader->child_total_time_running);
 +      }
 +
 +      /*
 +       * Write {count,id} tuples for every sibling.
 +       */
 +      values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      size = n * sizeof(u64);
 +      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +              values[n++] += perf_event_count(sub);
 +              if (read_format & PERF_FORMAT_ID)
 +                      values[n++] = primary_event_id(sub);
 +      }
  
 -      if (copy_to_user(buf, values, size))
 -              return -EFAULT;
 +      return 0;
 +}
  
 -      ret = size;
 +static int perf_read_group(struct perf_event *event,
 +                                 u64 read_format, char __user *buf)
 +{
 +      struct perf_event *leader = event->group_leader, *child;
 +      struct perf_event_context *ctx = leader->ctx;
 +      int ret;
 +      u64 *values;
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 -              n = 0;
 +      lockdep_assert_held(&ctx->mutex);
  
 -              values[n++] = perf_event_read_value(sub, &enabled, &running);
 -              if (read_format & PERF_FORMAT_ID)
 -                      values[n++] = primary_event_id(sub);
 +      values = kzalloc(event->read_size, GFP_KERNEL);
 +      if (!values)
 +              return -ENOMEM;
  
 -              size = n * sizeof(u64);
 +      values[0] = 1 + leader->nr_siblings;
  
 -              if (copy_to_user(buf + ret, values, size)) {
 -                      return -EFAULT;
 -              }
 +      /*
 +       * By locking the child_mutex of the leader we effectively
 +       * lock the child list of all siblings.. XXX explain how.
 +       */
 +      mutex_lock(&leader->child_mutex);
  
 -              ret += size;
 +      ret = __perf_read_group_add(leader, read_format, values);
 +      if (ret)
 +              goto unlock;
 +
 +      list_for_each_entry(child, &leader->child_list, child_list) {
 +              ret = __perf_read_group_add(child, read_format, values);
 +              if (ret)
 +                      goto unlock;
        }
  
 +      mutex_unlock(&leader->child_mutex);
 +
 +      ret = event->read_size;
 +      if (copy_to_user(buf, values, event->read_size))
 +              ret = -EFAULT;
 +      goto out;
 +
 +unlock:
 +      mutex_unlock(&leader->child_mutex);
 +out:
 +      kfree(values);
        return ret;
  }
  
 -static int perf_event_read_one(struct perf_event *event,
 +static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
  {
        u64 enabled, running;
@@@ -4035,7 -3921,7 +4035,7 @@@ static bool is_event_hup(struct perf_ev
   * Read the performance event - simple non blocking version for now
   */
  static ssize_t
 -perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 +__perf_read(struct perf_event *event, char __user *buf, size_t count)
  {
        u64 read_format = event->attr.read_format;
        int ret;
  
        WARN_ON_ONCE(event->ctx->parent_ctx);
        if (read_format & PERF_FORMAT_GROUP)
 -              ret = perf_event_read_group(event, read_format, buf);
 +              ret = perf_read_group(event, read_format, buf);
        else
 -              ret = perf_event_read_one(event, read_format, buf);
 +              ret = perf_read_one(event, read_format, buf);
  
        return ret;
  }
@@@ -4068,7 -3954,7 +4068,7 @@@ perf_read(struct file *file, char __use
        int ret;
  
        ctx = perf_event_ctx_lock(event);
 -      ret = perf_read_hw(event, buf, count);
 +      ret = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, ctx);
  
        return ret;
@@@ -4099,7 -3985,7 +4099,7 @@@ static unsigned int perf_poll(struct fi
  
  static void _perf_event_reset(struct perf_event *event)
  {
 -      (void)perf_event_read(event);
 +      (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
  }
@@@ -5375,15 -5261,9 +5375,15 @@@ void perf_output_sample(struct perf_out
  
        if (sample_type & PERF_SAMPLE_RAW) {
                if (data->raw) {
 -                      perf_output_put(handle, data->raw->size);
 -                      __output_copy(handle, data->raw->data,
 -                                         data->raw->size);
 +                      u32 raw_size = data->raw->size;
 +                      u32 real_size = round_up(raw_size + sizeof(u32),
 +                                               sizeof(u64)) - sizeof(u32);
 +                      u64 zero = 0;
 +
 +                      perf_output_put(handle, real_size);
 +                      __output_copy(handle, data->raw->data, raw_size);
 +                      if (real_size - raw_size)
 +                              __output_copy(handle, &zero, real_size - raw_size);
                } else {
                        struct {
                                u32     size;
@@@ -5515,7 -5395,8 +5515,7 @@@ void perf_prepare_sample(struct perf_ev
                else
                        size += sizeof(u32);
  
 -              WARN_ON_ONCE(size & (sizeof(u64)-1));
 -              header->size += size;
 +              header->size += round_up(size, sizeof(u64));
        }
  
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@@ -7386,49 -7267,24 +7386,49 @@@ static void perf_pmu_nop_void(struct pm
  {
  }
  
 +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
 +{
 +}
 +
  static int perf_pmu_nop_int(struct pmu *pmu)
  {
        return 0;
  }
  
 -static void perf_pmu_start_txn(struct pmu *pmu)
 +static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
 +
 +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
  {
 +      __this_cpu_write(nop_txn_flags, flags);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_disable(pmu);
  }
  
  static int perf_pmu_commit_txn(struct pmu *pmu)
  {
 +      unsigned int flags = __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return 0;
 +
        perf_pmu_enable(pmu);
        return 0;
  }
  
  static void perf_pmu_cancel_txn(struct pmu *pmu)
  {
 +      unsigned int flags =  __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_enable(pmu);
  }
  
@@@ -7667,7 -7523,7 +7667,7 @@@ got_cpu_context
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
 -                      pmu->start_txn  = perf_pmu_nop_void;
 +                      pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
@@@ -7755,7 -7611,7 +7755,7 @@@ static int perf_try_init_event(struct p
        return ret;
  }
  
 -struct pmu *perf_init_event(struct perf_event *event)
 +static struct pmu *perf_init_event(struct perf_event *event)
  {
        struct pmu *pmu = NULL;
        int idx;
@@@ -8441,35 -8297,13 +8441,35 @@@ SYSCALL_DEFINE5(perf_event_open
  
        if (move_group) {
                gctx = group_leader->ctx;
 +              mutex_lock_double(&gctx->mutex, &ctx->mutex);
 +      } else {
 +              mutex_lock(&ctx->mutex);
 +      }
 +
 +      if (!perf_event_validate_size(event)) {
 +              err = -E2BIG;
 +              goto err_locked;
 +      }
 +
 +      /*
 +       * Must be under the same ctx::mutex as perf_install_in_context(),
 +       * because we need to serialize with concurrent event creation.
 +       */
 +      if (!exclusive_event_installable(event, ctx)) {
 +              /* exclusive and group stuff are assumed mutually exclusive */
 +              WARN_ON_ONCE(move_group);
 +
 +              err = -EBUSY;
 +              goto err_locked;
 +      }
 +
 +      WARN_ON_ONCE(ctx->parent_ctx);
  
 +      if (move_group) {
                /*
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
 -              mutex_lock_double(&gctx->mutex, &ctx->mutex);
 -
                perf_remove_from_context(group_leader, false);
  
                list_for_each_entry(sibling, &group_leader->sibling_list,
                        perf_remove_from_context(sibling, false);
                        put_ctx(gctx);
                }
 -      } else {
 -              mutex_lock(&ctx->mutex);
 -      }
 -
 -      WARN_ON_ONCE(ctx->parent_ctx);
  
 -      if (move_group) {
                /*
                 * Wait for everybody to stop referencing the events through
                 * the old lists, before installing it on new lists.
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
 -      }
  
 -      if (!exclusive_event_installable(event, ctx)) {
 -              err = -EBUSY;
 -              mutex_unlock(&ctx->mutex);
 -              fput(event_file);
 -              goto err_context;
 +              /*
 +               * Now that all events are installed in @ctx, nothing
 +               * references @gctx anymore, so drop the last reference we have
 +               * on it.
 +               */
 +              put_ctx(gctx);
        }
  
 +      /*
 +       * Precalculate sample_data sizes; do while holding ctx::mutex such
 +       * that we're serialized against further additions and before
 +       * perf_install_in_context() which is the point the event is active and
 +       * can use these values.
 +       */
 +      perf_event__header_size(event);
 +      perf_event__id_header_size(event);
 +
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
  
 -      if (move_group) {
 +      if (move_group)
                mutex_unlock(&gctx->mutex);
 -              put_ctx(gctx);
 -      }
        mutex_unlock(&ctx->mutex);
  
        put_online_cpus();
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
  
 -      /*
 -       * Precalculate sample_data sizes
 -       */
 -      perf_event__header_size(event);
 -      perf_event__id_header_size(event);
 -
        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
        fd_install(event_fd, event_file);
        return event_fd;
  
 +err_locked:
 +      if (move_group)
 +              mutex_unlock(&gctx->mutex);
 +      mutex_unlock(&ctx->mutex);
 +/* err_file: */
 +      fput(event_file);
  err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
@@@ -9460,17 -9293,9 +9460,9 @@@ static void perf_cgroup_attach(struct c
                task_function_call(task, __perf_cgroup_move, task);
  }
  
- static void perf_cgroup_exit(struct cgroup_subsys_state *css,
-                            struct cgroup_subsys_state *old_css,
-                            struct task_struct *task)
- {
-       task_function_call(task, __perf_cgroup_move, task);
- }
  struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc      = perf_cgroup_css_alloc,
        .css_free       = perf_cgroup_css_free,
-       .exit           = perf_cgroup_exit,
        .attach         = perf_cgroup_attach,
  };
  #endif /* CONFIG_CGROUP_PERF */
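
The perf hunks above replace perf_event_read_group() with perf_read_group()
and __perf_read_group_add(), which accumulate the whole event group (leader,
siblings and inherited children) into one flat u64 array and then issue a
single copy_to_user().  For reference, a userspace sketch of decoding such a
buffer; it assumes `fd` is a group leader opened with read_format =
PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING, and the function name is made up:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Layout with the flags above: nr, time_enabled, time_running,
 * then one { value, id } pair per event, leader first. */
static int dump_group(int fd)
{
	uint64_t buf[64];	/* enough for a small group */
	ssize_t n = read(fd, buf, sizeof(buf));

	if (n < (ssize_t)(3 * sizeof(uint64_t)))
		return -1;

	uint64_t nr = buf[0];
	if (3 + 2 * nr > sizeof(buf) / sizeof(buf[0]))
		return -1;	/* group larger than this toy buffer */

	for (uint64_t i = 0; i < nr; i++)
		printf("id %" PRIu64 " value %" PRIu64 " (enabled %" PRIu64 ", running %" PRIu64 ")\n",
		       buf[3 + 2 * i + 1], buf[3 + 2 * i], buf[1], buf[2]);
	return 0;
}
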
diff --combined kernel/fork.c
index 6ac894244d3978fb800f7a1a02912bb2901e5e84,118743bb596498edb919f09b92cbca956f479595..825ecc32454d23f4e60216bedfb2de31fe504699
@@@ -251,6 -251,7 +251,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
  
+       cgroup_free(tsk);
        task_numa_free(tsk);
        security_task_free(tsk);
        exit_creds(tsk);
@@@ -1101,7 -1102,7 +1102,7 @@@ static void posix_cpu_timers_init_group
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 -              sig->cputimer.running = 1;
 +              sig->cputimer.running = true;
        }
  
        /* The timer lists. */
@@@ -1149,10 -1150,6 +1150,6 @@@ static int copy_signal(unsigned long cl
        tty_audit_fork(sig);
        sched_autogroup_fork(sig);
  
- #ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->group_rwsem);
- #endif
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
diff --combined kernel/sched/core.c
index aa5973220ad213a960092012bf4493f296dab90b,2cad9ba9103682a14fc596f8ee813b52a70b928c..4d568ac9319eaf04c9d00673483678bc5e14f22e
@@@ -621,21 -621,18 +621,21 @@@ int get_nohz_timer_target(void
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;
  
 -      if (!idle_cpu(cpu))
 +      if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;
  
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
 -                      if (!idle_cpu(i)) {
 +                      if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
 +
 +      if (!is_housekeeping_cpu(cpu))
 +              cpu = housekeeping_any_cpu();
  unlock:
        rcu_read_unlock();
        return cpu;
@@@ -817,7 -814,7 +817,7 @@@ static void set_load_weight(struct task
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
 -      if (p->policy == SCHED_IDLE) {
 +      if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        load->inv_weight = prio_to_wmult[prio];
  }
  
 -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
 -      sched_info_queued(rq, p);
 +      if (!(flags & ENQUEUE_RESTORE))
 +              sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
  }
  
 -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
 -      sched_info_dequeued(rq, p);
 +      if (!(flags & DEQUEUE_SAVE))
 +              sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@@ -1180,7 -1175,7 +1180,7 @@@ void do_set_cpus_allowed(struct task_st
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
  }
  
  /*
@@@ -1294,7 -1289,7 +1294,7 @@@ void set_task_cpu(struct task_struct *p
  
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
 -                      p->sched_class->migrate_task_rq(p, new_cpu);
 +                      p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@@ -1335,16 -1330,12 +1335,16 @@@ static int migrate_swap_stop(void *data
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
  
 +      if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
 +              return -EAGAIN;
 +
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
  
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
 +
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
  
@@@ -1580,15 -1571,13 +1580,15 @@@ static int select_fallback_rq(int cpu, 
                        goto out;
                }
  
 +              /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
 -                      /* No more Mr. Nice Guy. */
 -                      cpuset_cpus_allowed_fallback(p);
 -                      state = possible;
 -                      break;
 -
 +                      if (IS_ENABLED(CONFIG_CPUSETS)) {
 +                              cpuset_cpus_allowed_fallback(p);
 +                              state = possible;
 +                              break;
 +                      }
 +                      /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
@@@ -1700,7 -1689,7 +1700,7 @@@ ttwu_stat(struct task_struct *p, int cp
  #endif /* CONFIG_SCHEDSTATS */
  }
  
 -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@@ -2122,17 -2111,23 +2122,17 @@@ static void __sched_fork(unsigned long 
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
 +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
 +
  #ifdef CONFIG_NUMA_BALANCING
 -#ifdef CONFIG_SCHED_DEBUG
 +
  void set_numabalancing_state(bool enabled)
  {
        if (enabled)
 -              sched_feat_set("NUMA");
 +              static_branch_enable(&sched_numa_balancing);
        else
 -              sched_feat_set("NO_NUMA");
 +              static_branch_disable(&sched_numa_balancing);
  }
 -#else
 -__read_mostly bool numabalancing_enabled;
 -
 -void set_numabalancing_state(bool enabled)
 -{
 -      numabalancing_enabled = enabled;
 -}
 -#endif /* CONFIG_SCHED_DEBUG */
  
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
  {
        struct ctl_table t;
        int err;
 -      int state = numabalancing_enabled;
 +      int state = static_branch_likely(&sched_numa_balancing);
  
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
@@@ -2351,8 -2346,6 +2351,8 @@@ void wake_up_new_task(struct task_struc
        struct rq *rq;
  
        raw_spin_lock_irqsave(&p->pi_lock, flags);
 +      /* Initialize new task's runnable average */
 +      init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
 -      /* Initialize new task's runnable average */
 -      init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
 -      if (p->sched_class->task_woken)
 +      if (p->sched_class->task_woken) {
 +              /*
 +               * Nothing relies on rq->lock after this, so its fine to
 +               * drop it.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
                p->sched_class->task_woken(rq, p);
 +              lockdep_pin_lock(&rq->lock);
 +      }
  #endif
        task_rq_unlock(rq, p, &flags);
  }
@@@ -2485,6 -2473,7 +2485,6 @@@ static inline voi
  prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
  {
 -      trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@@ -2518,22 -2507,6 +2518,22 @@@ static struct rq *finish_task_switch(st
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
  
 +      /*
 +       * The previous task will have left us with a preempt_count of 2
 +       * because it left us after:
 +       *
 +       *      schedule()
 +       *        preempt_disable();                    // 1
 +       *        __schedule()
 +       *          raw_spin_lock_irq(&rq->lock)        // 2
 +       *
 +       * Also, see FORK_PREEMPT_COUNT.
 +       */
 +      if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
 +                    "corrupted preempt_count: %s/%d/0x%x\n",
 +                    current->comm, current->pid, preempt_count()))
 +              preempt_count_set(FORK_PREEMPT_COUNT);
 +
        rq->prev_mm = NULL;
  
        /*
         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
         * schedule one last time. The schedule call will never return, and
         * the scheduled task must drop that reference.
 -       * The test for TASK_DEAD must occur while the runqueue locks are
 -       * still held, otherwise prev could be scheduled on another cpu, die
 -       * there before we look at prev->state, and then the reference would
 -       * be dropped twice.
 -       *              Manfred Spraul <[email protected]>
 +       *
 +       * We must observe prev->state before clearing prev->on_cpu (in
 +       * finish_lock_switch), otherwise a concurrent wakeup can get prev
 +       * running on another CPU and we could race with its RUNNING -> DEAD
 +       * transition, resulting in a double drop.
         */
        prev_state = prev->state;
        vtime_task_switch(prev);
@@@ -2618,15 -2591,8 +2618,15 @@@ asmlinkage __visible void schedule_tail
  {
        struct rq *rq;
  
 -      /* finish_task_switch() drops rq->lock and enables preemtion */
 -      preempt_disable();
 +      /*
 +       * New tasks start with FORK_PREEMPT_COUNT, see there and
 +       * finish_task_switch() for details.
 +       *
 +       * finish_task_switch() will drop rq->lock() and lower preempt_count
 +       * and the preempt_enable() will end up enabling preemption (on
 +       * PREEMPT_COUNT kernels).
 +       */
 +
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@@ -2700,20 -2666,13 +2700,20 @@@ unsigned long nr_running(void
  
  /*
   * Check if only the current task is running on the cpu.
 + *
 + * Caution: this function does not check that the caller has disabled
 + * preemption, thus the result might have a time-of-check-to-time-of-use
 + * race.  The caller is responsible to use it correctly, for example:
 + *
 + * - from a non-preemptable section (of course)
 + *
 + * - from a thread that is bound to a single CPU
 + *
 + * - in a loop with very short iterations (e.g. a polling loop)
   */
  bool single_task_running(void)
  {
 -      if (cpu_rq(smp_processor_id())->nr_running == 1)
 -              return true;
 -      else
 -              return false;
 +      return raw_rq()->nr_running == 1;
  }
  EXPORT_SYMBOL(single_task_running);
  
@@@ -2984,13 -2943,15 +2984,13 @@@ static noinline void __schedule_bug(str
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
 -      BUG_ON(unlikely(task_stack_end_corrupted(prev)));
 +      BUG_ON(task_stack_end_corrupted(prev));
  #endif
 -      /*
 -       * Test if we are atomic. Since do_exit() needs to call into
 -       * schedule() atomically, we ignore that path. Otherwise whine
 -       * if we are scheduling when we should not.
 -       */
 -      if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 +
 +      if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
 +              preempt_count_set(PREEMPT_DISABLED);
 +      }
        rcu_sleep_check();
  
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@@ -3076,7 -3037,7 +3076,7 @@@ again
   *
   * WARNING: must be called with preemption disabled!
   */
 -static void __sched __schedule(void)
 +static void __sched notrace __schedule(bool preempt)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        rcu_note_context_switch();
        prev = rq->curr;
  
 +      /*
 +       * do_exit() calls schedule() with preemption disabled as an exception;
 +       * however we must fix that up, otherwise the next task will see an
 +       * inconsistent (higher) preempt count.
 +       *
 +       * It also avoids the below schedule_debug() test from complaining
 +       * about this.
 +       */
 +      if (unlikely(prev->state == TASK_DEAD))
 +              preempt_enable_no_resched_notrace();
 +
        schedule_debug(prev);
  
        if (sched_feat(HRTICK))
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
        switch_count = &prev->nivcsw;
 -      if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 +      if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
                rq->curr = next;
                ++*switch_count;
  
 +              trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@@ -3182,7 -3131,7 +3182,7 @@@ asmlinkage __visible void __sched sched
        sched_submit_work(tsk);
        do {
                preempt_disable();
 -              __schedule();
 +              __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
  }
@@@ -3222,9 -3171,9 +3222,9 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
 -              preempt_active_enter();
 -              __schedule();
 -              preempt_active_exit();
 +              preempt_disable_notrace();
 +              __schedule(true);
 +              preempt_enable_no_resched_notrace();
  
                /*
                 * Check again in case we missed a preemption opportunity
@@@ -3275,17 -3224,24 +3275,17 @@@ asmlinkage __visible void __sched notra
                return;
  
        do {
 -              /*
 -               * Use raw __prempt_count() ops that don't call function.
 -               * We can't call functions before disabling preemption which
 -               * disarm preemption tracing recursions.
 -               */
 -              __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 -              barrier();
 +              preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
 -              __schedule();
 +              __schedule(true);
                exception_exit(prev_ctx);
  
 -              barrier();
 -              __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 +              preempt_enable_no_resched_notrace();
        } while (need_resched());
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
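
The comment kept above notes that user_exit() may itself be traced, and that a tracer calling preempt_enable_notrace() could recurse back in; disabling preemption first is what breaks the loop. The standalone sketch below models only that re-entry guard; traced_hook() and the plain counter are illustrative, not kernel APIs.

    #include <stdio.h>

    static int preempt_count;

    static void preempt_schedule_notrace(void);

    /* Stand-in for a traced hook such as a user_exit() callback. */
    static void traced_hook(void)
    {
            /* A tracer might try to reschedule from inside the hook. */
            preempt_schedule_notrace();
    }

    static void preempt_schedule_notrace(void)
    {
            if (preempt_count)      /* already non-preemptible: bail out */
                    return;

            preempt_count++;        /* blocks re-entry from the hook */
            traced_hook();          /* would recurse forever without the guard */
            puts("scheduled once");
            preempt_count--;
    }

    int main(void)
    {
            preempt_schedule_notrace();
            return 0;
    }
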
@@@ -3308,11 -3264,11 +3308,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
 -              preempt_active_enter();
 +              preempt_disable();
                local_irq_enable();
 -              __schedule();
 +              __schedule(true);
                local_irq_disable();
 -              preempt_active_exit();
 +              sched_preempt_enable_no_resched();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3340,7 -3296,7 +3340,7 @@@ EXPORT_SYMBOL(default_wake_function)
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
 -      int oldprio, queued, running, enqueue_flag = 0;
 +      int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
  
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
 -                      enqueue_flag = ENQUEUE_REPLENISH;
 +                      enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
 -                      enqueue_flag = ENQUEUE_HEAD;
 +                      enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@@ -3450,7 -3406,7 +3450,7 @@@ void set_user_nice(struct task_struct *
        }
        queued = task_on_rq_queued(p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
        delta = p->prio - old_prio;
  
        if (queued) {
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -3780,7 -3736,10 +3780,7 @@@ recheck
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
 -              if (policy != SCHED_DEADLINE &&
 -                              policy != SCHED_FIFO && policy != SCHED_RR &&
 -                              policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 -                              policy != SCHED_IDLE)
 +              if (!valid_policy(policy))
                        return -EINVAL;
        }
  
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
 -              if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
 +              if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
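
The open-coded policy comparisons are replaced by helpers such as valid_policy() and idle_policy(). A plausible standalone shape for them is sketched below; the kernel's actual definitions live in its scheduler headers and may differ in detail, so treat this purely as an illustration of the intent.

    #include <stdbool.h>
    #include <stdio.h>

    /* Policy numbers as commonly exposed by the uapi headers. */
    #define SCHED_NORMAL    0
    #define SCHED_FIFO      1
    #define SCHED_RR        2
    #define SCHED_BATCH     3
    #define SCHED_IDLE      5
    #define SCHED_DEADLINE  6

    static bool idle_policy(int policy) { return policy == SCHED_IDLE; }
    static bool fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; }
    static bool rt_policy(int policy)   { return policy == SCHED_FIFO || policy == SCHED_RR; }
    static bool dl_policy(int policy)   { return policy == SCHED_DEADLINE; }

    static bool valid_policy(int policy)
    {
            return idle_policy(policy) || fair_policy(policy) ||
                   rt_policy(policy) || dl_policy(policy);
    }

    int main(void)
    {
            printf("valid_policy(SCHED_RR)=%d idle_policy(SCHED_IDLE)=%d\n",
                   valid_policy(SCHED_RR), idle_policy(SCHED_IDLE));
            return 0;
    }
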
@@@ -3961,7 -3920,7 +3961,7 @@@ change
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
 +              int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
 -              enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
 +              if (oldprio <= p->prio)
 +                      enqueue_flags |= ENQUEUE_HEAD;
 +
 +              enqueue_task(rq, p, enqueue_flags);
        }
  
        check_class_changed(rq, p, prev_class, oldprio);
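
Throughout this diff the bare 0 passed to dequeue_task()/enqueue_task() becomes DEQUEUE_SAVE/ENQUEUE_RESTORE, so the scheduling classes can tell a temporary attribute-change cycle from a real sleep or wakeup, with extras such as ENQUEUE_HEAD or ENQUEUE_REPLENISH OR-ed on top. The sketch below illustrates that flag composition with made-up flag values and a toy enqueue_task(); it is not the kernel's definition.

    #include <stdio.h>

    /* Illustrative flag values; the kernel defines its own set internally. */
    #define ENQUEUE_WAKEUP          0x01
    #define ENQUEUE_HEAD            0x02
    #define ENQUEUE_REPLENISH       0x04
    #define ENQUEUE_RESTORE         0x08

    static void enqueue_task(int flags)
    {
            if (flags & ENQUEUE_RESTORE)
                    printf("restore after attribute change");
            else
                    printf("genuine wakeup");
            if (flags & ENQUEUE_HEAD)
                    printf(", queue at head");
            putchar('\n');
    }

    int main(void)
    {
            int flags = ENQUEUE_RESTORE;

            /* Priority was raised from user space: also queue at the head. */
            flags |= ENQUEUE_HEAD;
            enqueue_task(flags);
            enqueue_task(ENQUEUE_WAKEUP);
            return 0;
    }
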
@@@ -4057,7 -4012,6 +4057,7 @@@ int sched_setscheduler_nocheck(struct t
  {
        return _sched_setscheduler(p, policy, param, false);
  }
 +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
  
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@@ -4970,15 -4924,7 +4970,15 @@@ void init_idle(struct task_struct *idle
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
  
 -      do_set_cpus_allowed(idle, cpumask_of(cpu));
 +#ifdef CONFIG_SMP
 +      /*
 +       * It's possible that init_idle() gets called multiple times on a
 +       * task; in that case do_set_cpus_allowed() will not do the right
 +       * thing.
 +       *
 +       * Since this is boot we can also forgo the serialization.
 +       */
 +      set_cpus_allowed_common(idle, cpumask_of(cpu));
 +#endif
        /*
         * We're having a chicken and egg problem, even though we are
         * holding rq->lock, the cpu isn't yet set to this cpu so the
  
        rq->curr = rq->idle = idle;
        idle->on_rq = TASK_ON_RQ_QUEUED;
 -#if defined(CONFIG_SMP)
 +#ifdef CONFIG_SMP
        idle->on_cpu = 1;
  #endif
        raw_spin_unlock(&rq->lock);
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
        vtime_init_idle(idle, cpu);
 -#if defined(CONFIG_SMP)
 +#ifdef CONFIG_SMP
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
  #endif
  }
@@@ -5129,7 -5075,7 +5129,7 @@@ void sched_setnuma(struct task_struct *
        running = task_current(rq, p);
  
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -5232,47 -5178,24 +5232,47 @@@ static void migrate_tasks(struct rq *de
                        break;
  
                /*
 -               * Ensure rq->lock covers the entire task selection
 -               * until the migration.
 +               * pick_next_task assumes pinned rq->lock.
                 */
                lockdep_pin_lock(&rq->lock);
                next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
  
 +              /*
 +               * The rules for changing task_struct::cpus_allowed require
 +               * holding both pi_lock and rq->lock, such that holding
 +               * either stabilizes the mask.
 +               *
 +               * Dropping rq->lock is not quite as disastrous as it usually
 +               * is because the CPU is !cpu_active at this point, so
 +               * load-balance will not interfere; we are also running
 +               * inside stop-machine.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
 +              raw_spin_unlock(&rq->lock);
 +              raw_spin_lock(&next->pi_lock);
 +              raw_spin_lock(&rq->lock);
 +
 +              /*
 +               * Since we're inside stop-machine, _nothing_ should have
 +               * changed the task; WARN if something did, because in that
 +               * case the rq->lock drop above was unsafe as well.
 +               */
 +              if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
 +                      raw_spin_unlock(&next->pi_lock);
 +                      continue;
 +              }
 +
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
  
 -              lockdep_unpin_lock(&rq->lock);
                rq = __migrate_task(rq, next, dest_cpu);
                if (rq != dead_rq) {
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
                }
 +              raw_spin_unlock(&next->pi_lock);
        }
  
        rq->stop = stop;
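
migrate_tasks() now drops rq->lock, takes the task's pi_lock, retakes rq->lock and only then re-validates the task before migrating it, because pi_lock nests outside rq->lock. A userspace pthread sketch of that ordering-and-recheck dance follows (build with -pthread); the lock names and the task_still_on_rq flag are illustrative only.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool task_still_on_rq = true;    /* state that must be re-validated */

    static void migrate_one(void)
    {
            pthread_mutex_lock(&rq_lock);

            /* pi_lock nests outside rq->lock: drop and retake in order. */
            pthread_mutex_unlock(&rq_lock);
            pthread_mutex_lock(&pi_lock);
            pthread_mutex_lock(&rq_lock);

            /* The world may have changed while rq_lock was dropped. */
            if (!task_still_on_rq) {
                    puts("task moved away, skip it");
                    goto out;
            }
            puts("both locks held, state rechecked: safe to migrate");
    out:
            pthread_mutex_unlock(&rq_lock);
            pthread_mutex_unlock(&pi_lock);
    }

    int main(void)
    {
            migrate_one();
            return 0;
    }
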
@@@ -5559,27 -5482,21 +5559,27 @@@ static void set_cpu_rq_start_time(void
  static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
  {
 +      int cpu = (long)hcpu;
 +
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
 +
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
 -               *
 -               * Thus, fall-through and help the starting CPU along.
                 */
 +              set_cpu_active(cpu, true);
 +              stop_machine_unpark(cpu);
 +              return NOTIFY_OK;
 +
        case CPU_DOWN_FAILED:
 -              set_cpu_active((long)hcpu, true);
 +              set_cpu_active(cpu, true);
                return NOTIFY_OK;
 +
        default:
                return NOTIFY_DONE;
        }
@@@ -6511,8 -6428,7 +6511,8 @@@ static struct sched_domain_topology_lev
        { NULL, },
  };
  
 -struct sched_domain_topology_level *sched_domain_topology = default_topology;
 +static struct sched_domain_topology_level *sched_domain_topology =
 +      default_topology;
  
  #define for_each_sd_topology(tl)                      \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@@ -7281,6 -7197,9 +7281,6 @@@ void __init sched_init_smp(void
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
 -      /* nohz_full won't take effect without isolating the cpus. */
 -      tick_nohz_full_add_cpus_to(cpu_isolated_map);
 -
        sched_init_numa();
  
        /*
@@@ -7513,7 -7432,7 +7513,7 @@@ void __init sched_init(void
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  static inline int preempt_count_equals(int preempt_offset)
  {
 -      int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 +      int nested = preempt_count() + rcu_preempt_depth();
  
        return (nested == preempt_offset);
  }
@@@ -7760,7 -7679,7 +7760,7 @@@ void sched_move_task(struct task_struc
        queued = task_on_rq_queued(tsk);
  
        if (queued)
 -              dequeue_task(rq, tsk, 0);
 +              dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
  
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
 -              tsk->sched_class->task_move_group(tsk, queued);
 +              tsk->sched_class->task_move_group(tsk);
        else
  #endif
                set_task_rq(tsk, task_cpu(tsk));
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, tsk, 0);
 +              enqueue_task(rq, tsk, ENQUEUE_RESTORE);
  
        task_rq_unlock(rq, tsk, &flags);
  }
@@@ -8244,13 -8163,6 +8244,6 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
- static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
-                           struct cgroup_subsys_state *old_css,
-                           struct task_struct *task)
- {
-       sched_move_task(task);
- }
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -8582,7 -8494,6 +8575,6 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
-       .exit           = cpu_cgroup_exit,
        .legacy_cftypes = cpu_files,
        .early_init     = 1,
  };
diff --combined mm/memcontrol.c
index c57c4423c68837d14816c5ff230435e1567e7c20,0ddd0ff2b52ecb0811f87dff872d9db8b4ce3579..b732edfddb767025185f27c8879903591c2b0c82
@@@ -434,7 -434,7 +434,7 @@@ struct cgroup_subsys_state *mem_cgroup_
  
        memcg = page->mem_cgroup;
  
-       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+       if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
  
        rcu_read_unlock();
@@@ -644,14 -644,12 +644,14 @@@ mem_cgroup_largest_soft_limit_node(stru
  }
  
  /*
 + * Return the page count for a single (non-recursive) @memcg.
 + *
   * Implementation Note: reading percpu statistics for memcg.
   *
   * Both of vmstat[] and percpu_counter has threshold and do periodic
   * synchronization to implement "quick" read. There are trade-off between
   * reading cost and precision of value. Then, we may have a chance to implement
 - * a periodic synchronizion of counter in memcg's counter.
 + * a periodic synchronization of counter in memcg's counter.
   *
   * But this _read() function is used for user interface now. The user accounts
   * memory usage by memory cgroup and he _always_ requires exact value because
   *
   * If there are kernel internal actions which can make use of some not-exact
   * value, and reading all cpu value can be performance bottleneck in some
 - * common workload, threashold and synchonization as vmstat[] should be
 + * common workload, threshold and synchronization as vmstat[] should be
   * implemented.
   */
 -static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 -                               enum mem_cgroup_stat_index idx)
 +static unsigned long
 +mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
  {
        long val = 0;
        int cpu;
  
 +      /* Per-cpu values can be negative, use a signed accumulator */
        for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
 +      /*
 +       * Summing races with updates, so val may be negative.  Avoid exposing
 +       * transient negative values.
 +       */
 +      if (val < 0)
 +              val = 0;
        return val;
  }
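
mem_cgroup_read_stat() now does the negative-value clamping itself and returns an unsigned long, so callers such as tree_stat() no longer have to. A small standalone model of that summation, using made-up per-cpu values:

    #include <stdio.h>

    #define NR_CPUS 4

    /* Per-cpu deltas can be transiently negative (an uncharge raced ahead). */
    static long percpu_count[NR_CPUS] = { 7, -3, 2, -1 };

    static unsigned long read_stat(void)
    {
            long val = 0;   /* signed accumulator: individual values may be < 0 */
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    val += percpu_count[cpu];

            /* Summing races with updates; never expose a negative total. */
            return val < 0 ? 0 : (unsigned long)val;
    }

    int main(void)
    {
            printf("stat = %lu\n", read_stat());    /* prints 5 */
            return 0;
    }
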
  
@@@ -1263,7 -1254,7 +1263,7 @@@ void mem_cgroup_print_oom_info(struct m
                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                                continue;
 -                      pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
 +                      pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
                                K(mem_cgroup_read_stat(iter, i)));
                }
  
@@@ -2828,11 -2819,14 +2828,11 @@@ static unsigned long tree_stat(struct m
                               enum mem_cgroup_stat_index idx)
  {
        struct mem_cgroup *iter;
 -      long val = 0;
 +      unsigned long val = 0;
  
 -      /* Per-cpu values can be negative, use a signed accumulator */
        for_each_mem_cgroup_tree(iter, memcg)
                val += mem_cgroup_read_stat(iter, idx);
  
 -      if (val < 0) /* race ? */
 -              val = 0;
        return val;
  }
  
@@@ -2926,7 -2920,7 +2926,7 @@@ static int memcg_activate_kmem(struct m
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
-       if (cgroup_has_tasks(memcg->css.cgroup) ||
+       if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@@ -3175,7 -3169,7 +3175,7 @@@ static int memcg_stat_show(struct seq_f
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
 -              seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
 +              seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
        }
  
                           (u64)memsw * PAGE_SIZE);
  
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 -              long long val = 0;
 +              unsigned long long val = 0;
  
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
 -              seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
 +              seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
        }
  
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
@@@ -3387,7 -3381,6 +3387,7 @@@ static int __mem_cgroup_usage_register_
        ret = page_counter_memparse(args, "-1", &threshold);
        if (ret)
                return ret;
 +      threshold <<= PAGE_SHIFT;
  
        mutex_lock(&memcg->thresholds_lock);
  
@@@ -3741,43 -3734,44 +3741,43 @@@ struct wb_domain *mem_cgroup_wb_domain(
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
 - * @pavail: out parameter for number of available pages
 + * @pfilepages: out parameter for number of file pages
 + * @pheadroom: out parameter for number of allocatable pages according to memcg
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
 - * Determine the numbers of available, dirty, and writeback pages in @wb's
 - * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
 - * more involved.
 + * Determine the numbers of file, headroom, dirty, and writeback pages in
 + * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 + * is a bit more involved.
   *
 - * A memcg's headroom is "min(max, high) - used".  The available memory is
 - * calculated as the lowest headroom of itself and the ancestors plus the
 - * number of pages already being used for file pages.  Note that this
 - * doesn't consider the actual amount of available memory in the system.
 - * The caller should further cap *@pavail accordingly.
 + * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 + * headroom is calculated as the lowest headroom of itself and the
 + * ancestors.  Note that this doesn't consider the actual amount of
 + * available memory in the system.  The caller should further cap
 + * *@pheadroom accordingly.
   */
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback)
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 -      unsigned long head_room = PAGE_COUNTER_MAX;
 -      unsigned long file_pages;
  
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 +      *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 +                                                   (1 << LRU_ACTIVE_FILE));
 +      *pheadroom = PAGE_COUNTER_MAX;
  
 -      file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 -                                                  (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
  
 -              head_room = min(head_room, ceiling - min(ceiling, used));
 +              *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
 -
 -      *pavail = file_pages + head_room;
  }
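
The reworked mem_cgroup_wb_stats() reports file pages and headroom separately, where headroom is the tightest "min(max, high) - used" seen on the ancestry walk. The standalone sketch below reproduces only that minimum computation with invented numbers; whether the walk includes the root differs slightly from the kernel loop.

    #include <stdio.h>

    #define PAGE_COUNTER_MAX        (~0UL)

    struct memcg {
            unsigned long limit;    /* memory.max    */
            unsigned long high;     /* memory.high   */
            unsigned long used;     /* current usage */
            struct memcg *parent;
    };

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    /* Headroom is the tightest "min(max, high) - used" along the ancestry. */
    static unsigned long wb_headroom(struct memcg *memcg)
    {
            unsigned long headroom = PAGE_COUNTER_MAX;

            for (; memcg; memcg = memcg->parent) {
                    unsigned long ceiling = min_ul(memcg->limit, memcg->high);
                    unsigned long used = memcg->used;

                    headroom = min_ul(headroom, ceiling - min_ul(ceiling, used));
            }
            return headroom;
    }

    int main(void)
    {
            struct memcg root  = { .limit = 1000, .high = 1000, .used = 100 };
            struct memcg child = { .limit = 400,  .high = 300,  .used = 250,
                                   .parent = &root };

            printf("headroom = %lu pages\n", wb_headroom(&child));  /* 50 */
            return 0;
    }
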
  
  #else /* CONFIG_CGROUP_WRITEBACK */
@@@ -4066,8 -4060,7 +4066,7 @@@ static struct cftype mem_cgroup_legacy_
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
-               .flags = CFTYPE_NO_PREFIX,
-               .mode = S_IWUGO,
+               .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@@ -4185,6 -4178,7 +4184,6 @@@ static struct mem_cgroup *mem_cgroup_al
        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
                goto out_free_stat;
  
 -      spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
  
  out_free_stat:
@@@ -4834,7 -4828,7 +4833,7 @@@ static int mem_cgroup_can_attach(struc
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
-       struct task_struct *p;
+       struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
        if (!move_flags)
                return 0;
  
-       p = cgroup_taskset_first(tset);
+       /*
+        * Multi-process migrations only happen on the default hierarchy
+        * where charge immigration is not used.  Perform charge
+        * immigration if @tset contains a leader and whine if there are
+        * multiple.
+        */
+       p = NULL;
+       cgroup_taskset_for_each_leader(leader, tset) {
+               WARN_ON_ONCE(p);
+               p = leader;
+       }
+       if (!p)
+               return 0;
        from = mem_cgroup_from_task(p);
  
        VM_BUG_ON(from == memcg);
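
On the unified hierarchy a migration set can now carry whole processes, so can_attach() picks the single thread-group leader from the taskset and warns if it unexpectedly sees more than one. Below is a toy userspace version of that selection loop; the struct and helper are fabricated for illustration.

    #include <stdio.h>

    struct task {
            const char *comm;
            int is_leader;
    };

    /*
     * Pick the single thread-group leader out of a migration set and warn
     * if the set unexpectedly carries more than one.
     */
    static struct task *pick_leader(struct task *set, int n)
    {
            struct task *p = NULL;
            int i;

            for (i = 0; i < n; i++) {
                    if (!set[i].is_leader)
                            continue;
                    if (p)
                            fprintf(stderr, "WARN: multiple leaders in taskset\n");
                    p = &set[i];
            }
            return p;
    }

    int main(void)
    {
            struct task set[] = {
                    { "worker", 0 }, { "main", 1 }, { "worker", 0 },
            };
            struct task *leader = pick_leader(set, 3);

            printf("leader = %s\n", leader ? leader->comm : "(none)");
            return 0;
    }
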
@@@ -5064,7 -5071,7 +5076,7 @@@ static void mem_cgroup_bind(struct cgro
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_on_dfl(root_css->cgroup))
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
@@@ -5208,6 -5215,7 +5220,7 @@@ static struct cftype memory_files[] = 
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
diff --combined mm/vmscan.c
index 7f63a9381f71ebbb0c1f9bdda94a913c930280f0,2d978b28a410b25df1acde351630dee387efbbe5..e7057af54b6e267558a99749fac80dc77dd7855f
@@@ -175,7 -175,7 +175,7 @@@ static bool sane_reclaim(struct scan_co
        if (!memcg)
                return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
-       if (cgroup_on_dfl(memcg->css.cgroup))
 -      if (memcg->css.cgroup)
++      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return true;
  #endif
        return false;