From: Tejun Heo
Date: Fri, 22 Nov 2013 23:32:25 +0000 (-0500)
Subject: cgroup: Merge branch 'memcg_event' into for-3.14
X-Git-Tag: v3.14-rc1~136^2~35
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/edab95103d3a1eb5e3faf977eae4ad0b5bf5669c?hp=-c

cgroup: Merge branch 'memcg_event' into for-3.14

Merge v3.12 based patch series to move cgroup_event implementation to
memcg into for-3.14. The following two commits cause a conflict in
kernel/cgroup.c

  2ff2a7d03bbe4 ("cgroup: kill css_id")
  79bd9814e5ec9 ("cgroup, memcg: move cgroup_event implementation to memcg")

Each patch removes a struct definition from kernel/cgroup.c. As the two
are adjacent, they cause a context conflict. Easily resolved by removing
both structs.

Signed-off-by: Tejun Heo
---
edab95103d3a1eb5e3faf977eae4ad0b5bf5669c
diff --combined include/linux/cgroup.h
index 39c1d9469677,8d9fa8967c9e..492fa01ec2d3
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -29,7 -29,6 +29,6 @@@ struct cgroup_subsys
 struct inode;
 struct cgroup;
 struct css_id;
- struct eventfd_ctx;
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
@@@ -239,10 -238,6 +238,6 @@@ struct cgroup
 struct rcu_head rcu_head;
 struct work_struct destroy_work;
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
 /* directory xattrs */
 struct simple_xattrs xattrs;
 };
@@@ -506,25 -501,6 +501,6 @@@ struct cftype
 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
 int (*release)(struct inode *inode, struct file *file);
-
- /*
- * register_event() callback will be used to add new userspace
- * waiter for changes related to the cftype. Implement it if
- * you want to provide this functionality. Use eventfd_signal()
- * on eventfd to send notification to userspace.
- */
- int (*register_event)(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd,
- const char *args);
- /*
- * unregister_event() callback will be called when userspace
- * closes the eventfd or on cgroup removing.
- * This callback must be implemented, if you want provide
- * notification functionality.
- */
- void (*unregister_event)(struct cgroup_subsys_state *css,
- struct cftype *cft,
- struct eventfd_ctx *eventfd);
 };
 /*
@@@ -612,6 -588,11 +588,6 @@@ struct cgroup_subsys
 int subsys_id;
 int disabled;
 int early_init;
- /*
- * True if this subsys uses ID. ID is not available before cgroup_init()
- * (not available in early_init time.)
- */
- bool use_id;
 /*
 * If %false, this subsystem is properly hierarchical -
@@@ -637,6 -618,9 +613,6 @@@
 */
 struct cgroupfs_root *root;
 struct list_head sibling;
- /* used when use_id == true */
- struct idr idr;
- spinlock_t id_lock;
 /* list of cftype_sets */
 struct list_head cftsets;
@@@ -867,6 -851,35 +843,6 @@@ int css_scan_tasks(struct cgroup_subsys
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
-/*
- * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
- * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
- * CSS ID is assigned at cgroup allocation (create) automatically
- * and removed when subsys calls free_css_id() function. This is because
- * the lifetime of cgroup_subsys_state is subsys's matter.
- *
- * Looking up and scanning function should be called under rcu_read_lock().
- * Taking cgroup_mutex is not necessary for following calls.
- * But the css returned by this routine can be "not populated yet" or "being - * destroyed". The caller should check css and cgroup's status. - */ - -/* - * Typically Called at ->destroy(), or somewhere the subsys frees - * cgroup_subsys_state. - */ -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); - -/* Find a cgroup_subsys_state which has given ID */ - -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); - -/* Returns true if root is ancestor of cg */ -bool css_is_ancestor(struct cgroup_subsys_state *cg, - const struct cgroup_subsys_state *root); - -/* Get id and depth of css */ -unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); diff --combined init/Kconfig index 79383d3aa5dc,3ca5b8110b0c..93f344337172 --- a/init/Kconfig +++ b/init/Kconfig @@@ -284,7 -284,7 +284,7 @@@ config AUDI config AUDITSYSCALL bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) + depends on AUDIT && (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) default y if SECURITY_SELINUX help Enable low-overhead system-call auditing infrastructure that @@@ -301,6 -301,20 +301,6 @@@ config AUDIT_TRE depends on AUDITSYSCALL select FSNOTIFY -config AUDIT_LOGINUID_IMMUTABLE - bool "Make audit loginuid immutable" - depends on AUDIT - help - The config option toggles if a task setting its loginuid requires - CAP_SYS_AUDITCONTROL or if that task should require no special permissions - but should instead only allow setting its loginuid if it was never - previously set. On systems which use systemd or a similar central - process to restart login services this should be set to true. On older - systems in which an admin would typically have to directly stop and - start processes this should be set to false. Setting this to true allows - one to drop potentially dangerous capabilites from the login tasks, - but may not be backwards compatible with older init systems. - source "kernel/irq/Kconfig" source "kernel/time/Kconfig" @@@ -340,8 -354,7 +340,8 @@@ config VIRT_CPU_ACCOUNTING_NATIV config VIRT_CPU_ACCOUNTING_GEN bool "Full dynticks CPU time accounting" - depends on HAVE_CONTEXT_TRACKING && 64BIT + depends on HAVE_CONTEXT_TRACKING + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select VIRT_CPU_ACCOUNTING select CONTEXT_TRACKING help @@@ -831,7 -844,7 +831,7 @@@ config NUMA_BALANCING_DEFAULT_ENABLE default y depends on NUMA_BALANCING help - If set, autonumic NUMA balancing will be enabled if running on a NUMA + If set, automatic NUMA balancing will be enabled if running on a NUMA machine. config NUMA_BALANCING @@@ -842,13 -855,12 +842,12 @@@ help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when - it is references to the node the task is running on. + it has references to the node the task is running on. This system will be inactive on UMA systems. 
menuconfig CGROUPS boolean "Control Group support" - depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@@ -915,6 -927,7 +914,7 @@@ config MEMC bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER + select EVENTFD help Provides a memory resource controller that manages both anonymous memory and page cache. (See Documentation/cgroups/memory.txt) @@@ -1154,7 -1167,6 +1154,6 @@@ config UIDGID_STRICT_TYPE_CHECK config SCHED_AUTOGROUP bool "Automatic process group scheduling" - select EVENTFD select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED @@@ -1655,18 -1667,6 +1654,18 @@@ config BASE_SMAL default 0 if BASE_FULL default 1 if !BASE_FULL +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. + menuconfig MODULES bool "Enable loadable module support" option modules @@@ -1740,7 -1740,6 +1739,7 @@@ config MODULE_SRCVERSION_AL config MODULE_SIG bool "Module signature verification" depends on MODULES + select SYSTEM_TRUSTED_KEYRING select KEYS select CRYPTO select ASYMMETRIC_KEY_TYPE diff --combined kernel/cgroup.c index a7b98ee35ef7,c0248e16461d..be42967f4f1a --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@@ -56,11 -56,8 +56,8 @@@ #include #include #include /* TODO: replace with more sophisticated array */ - #include - #include #include /* used in cgroup_attach_task */ #include - #include #include @@@ -89,14 -86,6 +86,14 @@@ static DEFINE_MUTEX(cgroup_mutex) static DEFINE_MUTEX(cgroup_root_mutex); +/* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + /* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are @@@ -132,36 -121,38 +129,6 @@@ struct cfent struct simple_xattrs xattrs; }; --/* - * cgroup_event represents events which userspace want to receive. - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. -- */ - struct cgroup_event { -#define CSS_ID_MAX (65535) -struct css_id { -- /* - * css which the event belongs to. - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. - * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_tryget() - * should be used for avoiding race. -- */ - struct cgroup_subsys_state *css; - struct cgroup_subsys_state __rcu *css; -- /* - * Control file which the event associated. - * ID of this css. -- */ - struct cftype *cft; - unsigned short id; -- /* - * eventfd to signal userspace about the event. - * Depth in hierarchy which this ID belongs to. 
-- */ - struct eventfd_ctx *eventfd; - unsigned short depth; -- /* - * Each of these stored in a list by the cgroup. - * ID is freed by RCU. (and lookup routine is RCU safe.) -- */ - struct list_head list; - struct rcu_head rcu_head; -- /* - * All fields below needed to unregister event when - * userspace closes eventfd. - * Hierarchy of CSS ID belongs to. -- */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; - unsigned short stack[0]; /* Array of Length (depth+1) */ --}; -- /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@@ -363,6 -354,9 +330,6 @@@ struct cgrp_cset_link static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@@ -814,6 -808,8 +781,6 @@@ static struct backing_dev_info cgroup_b .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); - static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@@ -879,7 -875,7 +846,7 @@@ static void cgroup_free_rcu(struct rcu_ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@@ -903,6 -899,11 +870,6 @@@ iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@@ -1351,8 -1352,6 +1318,6 @@@ static void init_cgroup_housekeeping(st INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } @@@ -1489,7 -1488,7 +1454,7 @@@ static int cgroup_get_rootdir(struct su { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = @@@ -2626,16 -2625,6 +2591,6 @@@ static const struct inode_operations cg .removexattr = cgroup_removexattr, }; - /* - * Check if a file is a control file - */ - static inline struct cftype *__file_cft(struct file *file) - { - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); - } - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { @@@ -3915,202 -3904,6 +3870,6 @@@ static void cgroup_dput(struct cgroup * deactivate_super(sb); } - /* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ - static void cgroup_event_remove(struct work_struct *work) - { - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); - } - - /* - * Gets called on POLLHUP on eventfd when user closes it. 
- * - * Called with wqh->lock held and interrupts disabled. - */ - static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) - { - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; - } - - static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) - { - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); - } - - /* - * Parse input and register new cgroup event handler. - * - * Input must be in format ' '. - * Interpretation of args is defined by control file implementation. - */ - static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) - { - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. 
- */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - - out_put_css: - css_put(event->css); - out_put_cfile: - fdput(cfile); - out_put_eventfd: - eventfd_ctx_put(event->eventfd); - out_put_efile: - fdput(efile); - out_kfree: - kfree(event); - - return ret; - } - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@@ -4135,11 -3928,6 +3894,6 @@@ static struct cftype cgroup_base_files[ .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, @@@ -4206,6 -3994,21 +3960,6 @@@ static int cgroup_populate_dir(struct c goto err; } } - - /* This cgroup is ready now */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (id) - rcu_assign_pointer(id->css, css); - } - return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@@ -4257,7 -4060,7 +4011,7 @@@ static void css_free_rcu_fn(struct rcu_ * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@@ -4274,6 -4077,7 +4028,6 @@@ static void init_css(struct cgroup_subs css->cgroup = cgrp; css->ss = ss; css->flags = 0; - css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@@ -4405,6 -4209,12 +4159,6 @@@ static long cgroup_create(struct cgrou goto err_free_all; init_css(css, ss, cgrp); - - if (ss->use_id) { - err = alloc_css_id(css); - if (err) - goto err_free_all; - } } /* @@@ -4547,7 -4357,7 +4301,7 @@@ static void css_killed_ref_fn(struct pe container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@@ -4610,7 -4420,6 +4364,6 @@@ static int cgroup_destroy_locked(struc __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; struct cgroup *child; bool empty; @@@ -4685,18 -4494,6 +4438,6 @@@ dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. 
- */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; @@@ -4869,6 -4666,12 +4610,6 @@@ int __init_or_module cgroup_load_subsys /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_css() because it sets css->id. */ - if (ss->use_id) { - ret = cgroup_init_idr(ss, css); - if (ret) - goto err_unload; - } /* * Now we need to entangle the css into the existing css_sets. unlike @@@ -4934,6 -4737,9 +4675,6 @@@ void cgroup_unload_subsys(struct cgroup offline_css(cgroup_css(cgroup_dummy_top, ss)); - if (ss->use_id) - idr_destroy(&ss->idr); - /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@@ -4960,7 -4766,8 +4701,7 @@@ /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. note that this - * also takes care of freeing the css_id. + * the cgrp->subsys pointer to find their state. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@@ -5031,6 -4838,8 +4772,6 @@@ int __init cgroup_init(void for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@@ -5071,22 -4880,6 +4812,22 @@@ out return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy @@@ -5466,6 -5259,181 +5207,6 @@@ static int __init cgroup_disable(char * } __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_raw(css->id); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. 
- */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - - child_id = rcu_dereference(child->id); - if (!child_id) - return false; - root_id = rcu_dereference(root->id); - if (!root_id) - return false; - if (child_id->depth < root_id->depth) - return false; - if (child_id->stack[root_id->depth] != root_id->id) - return false; - return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int ret, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - - idr_preload(GFP_KERNEL); - spin_lock(&ss->id_lock); - /* Don't use 0. allocates an ID of 1-65535 */ - ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); - spin_unlock(&ss->id_lock); - idr_preload_end(); - - /* Returns error when there are no free spaces for new ID.*/ - if (ret < 0) - goto err_out; - - newid->id = ret; - newid->depth = depth; - return newid; -err_out: - kfree(newid); - return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - spin_lock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - RCU_INIT_POINTER(newid->css, rootcss); - RCU_INIT_POINTER(rootcss->id, newid); - return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ - struct cgroup_subsys_state *parent_css = css_parent(child_css); - struct css_id *child_id, *parent_id; - int i, depth; - - parent_id = rcu_dereference_protected(parent_css->id, true); - depth = parent_id->depth + 1; - - child_id = get_new_cssid(child_css->ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. 
Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest diff --combined mm/memcontrol.c index f1a0ae6e11b8,ec8582b3a232..7aa0d405b148 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -45,6 -45,7 +45,7 @@@ #include #include #include + #include #include #include #include @@@ -55,11 -56,11 +56,12 @@@ #include #include #include + #include #include "internal.h" #include #include #include +#include "slab.h" #include @@@ -227,6 -228,46 +229,46 @@@ struct mem_cgroup_eventfd_list struct eventfd_ctx *eventfd; }; + /* + * cgroup_event represents events which userspace want to receive. + */ + struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; + }; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@@ -313,7 -354,7 +355,7 @@@ struct mem_cgroup atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) - struct tcp_memcontrol tcp_mem; + struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) /* analogous to slab_common's slab_caches list. per-memcg */ @@@ -331,6 -372,10 +373,10 @@@ atomic_t numainfo_updating; #endif + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@@ -490,39 -535,11 +536,34 @@@ struct cgroup_subsys_state *vmpressure_ return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } - struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) - { - return &mem_cgroup_from_css(css)->vmpressure; - } - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. + */ +#define MEM_CGROUP_ID_MAX USHRT_MAX + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + /* + * The ID of the root cgroup is 0, but memcg treat 0 as an + * invalid ID, so we return (cgroup_id + 1). 
+ */ + return memcg->css.cgroup->id + 1; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id - 1, &mem_cgroup_subsys); + return mem_cgroup_from_css(css); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) @@@ -575,13 -592,13 +616,13 @@@ struct cg_proto *tcp_proto_cgroup(struc if (!memcg || mem_cgroup_is_root(memcg)) return NULL; - return &memcg->tcp_mem.cg_proto; + return &memcg->tcp_mem; } EXPORT_SYMBOL(tcp_proto_cgroup); static void disarm_sock_keys(struct mem_cgroup *memcg) { - if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) + if (!memcg_proto_activated(&memcg->tcp_mem)) return; static_key_slow_dec(&memcg_socket_limit_enabled); } @@@ -594,11 -611,16 +635,11 @@@ static void disarm_sock_keys(struct mem #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. - * There are two main reasons for not using the css_id for this: - * 1) this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. - * - * 2) In order not to violate the cgroup API, we would like to do all memory - * allocation in ->create(). At that point, we haven't yet allocated the - * css_id. Having a separate index prevents us from messing with the cgroup - * core for this + * The main reason for not using cgroup id for this: + * this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. * * The current size of the caches array is stored in * memcg_limited_groups_array_size. It will double each time we have to @@@ -613,14 -635,14 +654,14 @@@ int memcg_limited_groups_array_size * cgroups is a reasonable guess. In the future, it could be a parameter or * tunable, but that is strictly not necessary. * - * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the - * css_id space is not getting any smaller, and we don't have to necessarily + * cgrp_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. 
*/ #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE 65535 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* * A lot of the calls to the cache allocation functions are expected to be @@@ -1427,7 -1449,7 +1468,7 @@@ bool __mem_cgroup_same_or_subtree(cons return true; if (!root_memcg->use_hierarchy || !memcg) return false; - return css_is_ancestor(&memcg->css, &root_memcg->css); + return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); } static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, @@@ -2845,10 -2867,15 +2886,10 @@@ static void __mem_cgroup_cancel_local_c */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { - struct cgroup_subsys_state *css; - /* ID 0 is unused ID */ if (!id) return NULL; - css = css_lookup(&mem_cgroup_subsys, id); - if (!css) - return NULL; - return mem_cgroup_from_css(css); + return mem_cgroup_from_id(id); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@@ -2969,7 -2996,7 +3010,7 @@@ static struct kmem_cache *memcg_params_ VM_BUG_ON(p->is_root_cache); cachep = p->root_cache; - return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; + return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } #ifdef CONFIG_SLABINFO @@@ -2998,14 -3025,21 +3039,14 @@@ static int memcg_charge_kmem(struct mem struct res_counter *fail_res; struct mem_cgroup *_memcg; int ret = 0; - bool may_oom; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - /* - * Conditions under which we can wait for the oom_killer. Those are - * the same conditions tested by the core page allocator - */ - may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); - _memcg = memcg; ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, may_oom); + &_memcg, oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* @@@ -3145,7 -3179,7 +3186,7 @@@ int memcg_update_cache_size(struct kmem { struct memcg_cache_params *cur_params = s->memcg_params; - VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(s)); if (num_groups > memcg_limited_groups_array_size) { int i; @@@ -3406,7 -3440,7 +3447,7 @@@ static struct kmem_cache *memcg_create_ idx = memcg_cache_id(memcg); mutex_lock(&memcg_cache_mutex); - new_cachep = cachep->memcg_params->memcg_caches[idx]; + new_cachep = cache_from_memcg_idx(cachep, idx); if (new_cachep) { css_put(&memcg->css); goto out; @@@ -3452,8 -3486,8 +3493,8 @@@ void kmem_cache_destroy_memcg_children( * we'll take the set_limit_mutex to protect ourselves against this. */ mutex_lock(&set_limit_mutex); - for (i = 0; i < memcg_limited_groups_array_size; i++) { - c = s->memcg_params->memcg_caches[i]; + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); if (!c) continue; @@@ -3586,8 -3620,8 +3627,8 @@@ struct kmem_cache *__memcg_kmem_get_cac * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (likely(cachep->memcg_params->memcg_caches[idx])) { - cachep = cachep->memcg_params->memcg_caches[idx]; + if (likely(cache_from_memcg_idx(cachep, idx))) { + cachep = cache_from_memcg_idx(cachep, idx); goto out; } @@@ -4357,7 -4391,7 +4398,7 @@@ mem_cgroup_uncharge_swapcache(struct pa * css_get() was called in uncharge(). 
*/ if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, css_id(&memcg->css)); + swap_cgroup_record(ent, mem_cgroup_id(memcg)); } #endif @@@ -4409,8 -4443,8 +4450,8 @@@ static int mem_cgroup_move_swap_account { unsigned short old_id, new_id; - old_id = css_id(&from->css); - new_id = css_id(&to->css); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@@ -5383,50 -5417,45 +5424,50 @@@ static int mem_cgroup_move_charge_write static int memcg_numa_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; int nid; - unsigned long total_nr, file_nr, anon_nr, unevictable_nr; - unsigned long node_nr; + unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); - seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); - seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_FILE); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); - seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_ANON); - seq_printf(m, " N%d=%lu", nid, node_nr); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); + seq_printf(m, "%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + struct mem_cgroup *iter; + + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); + seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_node_nr_lru_pages( + iter, nid, stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); } - seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - BIT(LRU_UNEVICTABLE)); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); return 0; } #endif /* CONFIG_NUMA */ @@@ -5648,13 -5677,11 +5689,11 @@@ static void mem_cgroup_oom_notify(struc mem_cgroup_oom_notify_cb(iter); } - static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct 
mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@@ -5731,13 -5758,23 +5770,23 @@@ unlock return ret; } - static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) + { + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); + } + + static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) + { + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); + } + + static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@@ -5810,14 -5847,23 +5859,23 @@@ unlock mutex_unlock(&memcg->thresholds_lock); } - static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) + { + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); + } + + static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) + { + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); + } + + static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; - enum res_type type = MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@@ -5835,14 -5881,10 +5893,10 @@@ return 0; } - static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; - enum res_type type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@@ -5959,13 -6001,233 +6013,233 @@@ static void kmem_cgroup_css_offline(str } #endif + /* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + + /* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ + static void memcg_event_remove(struct work_struct *work) + { + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. 
*/ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); + } + + /* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ + static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) + { + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; + } + + static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) + { + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); + } + + /* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ + static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) + { + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + const char *name; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. 
+ */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. + */ + rcu_read_lock(); + + ret = -EINVAL; + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, + &mem_cgroup_subsys); + if (cfile_css == css && css_tryget(css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + + ret = event->register_event(memcg, event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + + out_put_css: + css_put(css); + out_put_cfile: + fdput(cfile); + out_put_eventfd: + eventfd_ctx_put(event->eventfd); + out_put_efile: + fdput(efile); + out_kfree: + kfree(event); + + return ret; + } + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@@ -6005,6 -6267,12 +6279,12 @@@ .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, + { + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, @@@ -6019,14 -6287,10 +6299,10 @@@ .name = "oom_control", .read_map = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", - .register_event = vmpressure_register_event, - .unregister_event = vmpressure_unregister_event, }, #ifdef CONFIG_NUMA { @@@ -6074,8 -6338,6 +6350,6 @@@ static struct cftype memsw_cgroup_files .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@@ -6178,6 -6440,7 +6452,6 @@@ static void __mem_cgroup_free(struct me size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); @@@ -6265,6 -6528,8 +6539,8 @@@ mem_cgroup_css_alloc(struct cgroup_subs mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); 
vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); return &memcg->css; @@@ -6280,9 -6545,6 +6556,9 @@@ mem_cgroup_css_online(struct cgroup_sub struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; + if (css->cgroup->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; + if (!parent) return 0; @@@ -6340,6 -6602,19 +6616,19 @@@ static void mem_cgroup_invalidate_recla static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); kmem_cgroup_css_offline(memcg); @@@ -6554,7 -6829,7 +6843,7 @@@ static enum mc_target_type get_mctgt_ty } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@@ -6605,10 -6880,10 +6894,10 @@@ static int mem_cgroup_count_precharge_p pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@@ -6797,9 -7072,9 +7086,9 @@@ static int mem_cgroup_move_charge_pte_r * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@@ -6816,7 -7091,7 +7105,7 @@@ } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@@ -6974,6 -7249,7 +7263,6 @@@ struct cgroup_subsys mem_cgroup_subsys .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, - .use_id = 1, }; #ifdef CONFIG_MEMCG_SWAP