From: Tejun Heo
Date: Fri, 22 Nov 2013 23:32:25 +0000 (-0500)
Subject: cgroup: Merge branch 'memcg_event' into for-3.14
X-Git-Tag: v3.14-rc1~136^2~35
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/edab95103d3a1eb5e3faf977eae4ad0b5bf5669c?hp=-c

cgroup: Merge branch 'memcg_event' into for-3.14

Merge v3.12 based patch series to move cgroup_event implementation to
memcg into for-3.14. The following two commits cause a conflict in
kernel/cgroup.c

  2ff2a7d03bbe4 ("cgroup: kill css_id")
  79bd9814e5ec9 ("cgroup, memcg: move cgroup_event implementation to memcg")

Each patch removes a struct definition from kernel/cgroup.c. As the two
are adjacent, they cause a context conflict. Easily resolved by removing
both structs.

Signed-off-by: Tejun Heo
---
edab95103d3a1eb5e3faf977eae4ad0b5bf5669c
diff --combined include/linux/cgroup.h
index 39c1d9469677,8d9fa8967c9e..492fa01ec2d3
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -29,7 -29,6 +29,6 @@@ struct cgroup_subsys
 struct inode;
 struct cgroup;
 struct css_id;
- struct eventfd_ctx;
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
@@@ -239,10 -238,6 +238,6 @@@ struct cgroup
 struct rcu_head rcu_head;
 struct work_struct destroy_work;
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
 /* directory xattrs */
 struct simple_xattrs xattrs;
 };
@@@ -506,25 -501,6 +501,6 @@@ struct cftype
 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
 int (*release)(struct inode *inode, struct file *file);
-
- /*
- * register_event() callback will be used to add new userspace
- * waiter for changes related to the cftype. Implement it if
- * you want to provide this functionality. Use eventfd_signal()
- * on eventfd to send notification to userspace.
- */
- int (*register_event)(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd,
- const char *args);
- /*
- * unregister_event() callback will be called when userspace
- * closes the eventfd or on cgroup removing.
- * This callback must be implemented, if you want provide
- * notification functionality.
- */
- void (*unregister_event)(struct cgroup_subsys_state *css,
- struct cftype *cft,
- struct eventfd_ctx *eventfd);
 };
 /*
@@@ -612,6 -588,11 +588,6 @@@ struct cgroup_subsys
 int subsys_id;
 int disabled;
 int early_init;
- /*
- * True if this subsys uses ID. ID is not available before cgroup_init()
- * (not available in early_init time.)
- */
- bool use_id;
 /*
 * If %false, this subsystem is properly hierarchical -
@@@ -637,6 -618,9 +613,6 @@@
 */
 struct cgroupfs_root *root;
 struct list_head sibling;
- /* used when use_id == true */
- struct idr idr;
- spinlock_t id_lock;
 /* list of cftype_sets */
 struct list_head cftsets;
@@@ -867,6 -851,35 +843,6 @@@ int css_scan_tasks(struct cgroup_subsys
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
-/*
- * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
- * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
- * CSS ID is assigned at cgroup allocation (create) automatically
- * and removed when subsys calls free_css_id() function. This is because
- * the lifetime of cgroup_subsys_state is subsys's matter.
- *
- * Looking up and scanning function should be called under rcu_read_lock().
- * Taking cgroup_mutex is not necessary for following calls.
- * But the css returned by this routine can be "not populated yet" or "being - * destroyed". The caller should check css and cgroup's status. - */ - -/* - * Typically Called at ->destroy(), or somewhere the subsys frees - * cgroup_subsys_state. - */ -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); - -/* Find a cgroup_subsys_state which has given ID */ - -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); - -/* Returns true if root is ancestor of cg */ -bool css_is_ancestor(struct cgroup_subsys_state *cg, - const struct cgroup_subsys_state *root); - -/* Get id and depth of css */ -unsigned short css_id(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); diff --combined init/Kconfig index 79383d3aa5dc,3ca5b8110b0c..93f344337172 --- a/init/Kconfig +++ b/init/Kconfig @@@ -284,7 -284,7 +284,7 @@@ config AUDI config AUDITSYSCALL bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) + depends on AUDIT && (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) default y if SECURITY_SELINUX help Enable low-overhead system-call auditing infrastructure that @@@ -301,6 -301,20 +301,6 @@@ config AUDIT_TRE depends on AUDITSYSCALL select FSNOTIFY -config AUDIT_LOGINUID_IMMUTABLE - bool "Make audit loginuid immutable" - depends on AUDIT - help - The config option toggles if a task setting its loginuid requires - CAP_SYS_AUDITCONTROL or if that task should require no special permissions - but should instead only allow setting its loginuid if it was never - previously set. On systems which use systemd or a similar central - process to restart login services this should be set to true. On older - systems in which an admin would typically have to directly stop and - start processes this should be set to false. Setting this to true allows - one to drop potentially dangerous capabilites from the login tasks, - but may not be backwards compatible with older init systems. - source "kernel/irq/Kconfig" source "kernel/time/Kconfig" @@@ -340,8 -354,7 +340,8 @@@ config VIRT_CPU_ACCOUNTING_NATIV config VIRT_CPU_ACCOUNTING_GEN bool "Full dynticks CPU time accounting" - depends on HAVE_CONTEXT_TRACKING && 64BIT + depends on HAVE_CONTEXT_TRACKING + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select VIRT_CPU_ACCOUNTING select CONTEXT_TRACKING help @@@ -831,7 -844,7 +831,7 @@@ config NUMA_BALANCING_DEFAULT_ENABLE default y depends on NUMA_BALANCING help - If set, autonumic NUMA balancing will be enabled if running on a NUMA + If set, automatic NUMA balancing will be enabled if running on a NUMA machine. config NUMA_BALANCING @@@ -842,13 -855,12 +842,12 @@@ help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when - it is references to the node the task is running on. + it has references to the node the task is running on. This system will be inactive on UMA systems. 
menuconfig CGROUPS boolean "Control Group support" - depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@@ -915,6 -927,7 +914,7 @@@ config MEMC bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER + select EVENTFD help Provides a memory resource controller that manages both anonymous memory and page cache. (See Documentation/cgroups/memory.txt) @@@ -1154,7 -1167,6 +1154,6 @@@ config UIDGID_STRICT_TYPE_CHECK config SCHED_AUTOGROUP bool "Automatic process group scheduling" - select EVENTFD select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED @@@ -1655,18 -1667,6 +1654,18 @@@ config BASE_SMAL default 0 if BASE_FULL default 1 if !BASE_FULL +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. + menuconfig MODULES bool "Enable loadable module support" option modules @@@ -1740,7 -1740,6 +1739,7 @@@ config MODULE_SRCVERSION_AL config MODULE_SIG bool "Module signature verification" depends on MODULES + select SYSTEM_TRUSTED_KEYRING select KEYS select CRYPTO select ASYMMETRIC_KEY_TYPE diff --combined kernel/cgroup.c index a7b98ee35ef7,c0248e16461d..be42967f4f1a --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@@ -56,11 -56,8 +56,8 @@@ #include #include #include /* TODO: replace with more sophisticated array */ - #include - #include #include /* used in cgroup_attach_task */ #include - #include #include @@@ -89,14 -86,6 +86,14 @@@ static DEFINE_MUTEX(cgroup_mutex) static DEFINE_MUTEX(cgroup_root_mutex); +/* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + /* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are @@@ -132,36 -121,38 +129,6 @@@ struct cfent struct simple_xattrs xattrs; }; --/* - * cgroup_event represents events which userspace want to receive. - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. -- */ - struct cgroup_event { -#define CSS_ID_MAX (65535) -struct css_id { -- /* - * css which the event belongs to. - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. - * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_tryget() - * should be used for avoiding race. -- */ - struct cgroup_subsys_state *css; - struct cgroup_subsys_state __rcu *css; -- /* - * Control file which the event associated. - * ID of this css. -- */ - struct cftype *cft; - unsigned short id; -- /* - * eventfd to signal userspace about the event. - * Depth in hierarchy which this ID belongs to. 
-- */ - struct eventfd_ctx *eventfd; - unsigned short depth; -- /* - * Each of these stored in a list by the cgroup. - * ID is freed by RCU. (and lookup routine is RCU safe.) -- */ - struct list_head list; - struct rcu_head rcu_head; -- /* - * All fields below needed to unregister event when - * userspace closes eventfd. - * Hierarchy of CSS ID belongs to. -- */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; - unsigned short stack[0]; /* Array of Length (depth+1) */ --}; -- /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@@ -363,6 -354,9 +330,6 @@@ struct cgrp_cset_link static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@@ -814,6 -808,8 +781,6 @@@ static struct backing_dev_info cgroup_b .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); - static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@@ -879,7 -875,7 +846,7 @@@ static void cgroup_free_rcu(struct rcu_ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@@ -903,6 -899,11 +870,6 @@@ iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@@ -1351,8 -1352,6 +1318,6 @@@ static void init_cgroup_housekeeping(st INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } @@@ -1489,7 -1488,7 +1454,7 @@@ static int cgroup_get_rootdir(struct su { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = @@@ -2626,16 -2625,6 +2591,6 @@@ static const struct inode_operations cg .removexattr = cgroup_removexattr, }; - /* - * Check if a file is a control file - */ - static inline struct cftype *__file_cft(struct file *file) - { - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); - } - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { @@@ -3915,202 -3904,6 +3870,6 @@@ static void cgroup_dput(struct cgroup * deactivate_super(sb); } - /* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ - static void cgroup_event_remove(struct work_struct *work) - { - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); - } - - /* - * Gets called on POLLHUP on eventfd when user closes it. 
- * - * Called with wqh->lock held and interrupts disabled. - */ - static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) - { - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; - } - - static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) - { - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); - } - - /* - * Parse input and register new cgroup event handler. - * - * Input must be in format ' '. - * Interpretation of args is defined by control file implementation. - */ - static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) - { - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. 
- */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - - out_put_css: - css_put(event->css); - out_put_cfile: - fdput(cfile); - out_put_eventfd: - eventfd_ctx_put(event->eventfd); - out_put_efile: - fdput(efile); - out_kfree: - kfree(event); - - return ret; - } - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@@ -4135,11 -3928,6 +3894,6 @@@ static struct cftype cgroup_base_files[ .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, @@@ -4206,6 -3994,21 +3960,6 @@@ static int cgroup_populate_dir(struct c goto err; } } - - /* This cgroup is ready now */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (id) - rcu_assign_pointer(id->css, css); - } - return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@@ -4257,7 -4060,7 +4011,7 @@@ static void css_free_rcu_fn(struct rcu_ * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@@ -4274,6 -4077,7 +4028,6 @@@ static void init_css(struct cgroup_subs css->cgroup = cgrp; css->ss = ss; css->flags = 0; - css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@@ -4405,6 -4209,12 +4159,6 @@@ static long cgroup_create(struct cgrou goto err_free_all; init_css(css, ss, cgrp); - - if (ss->use_id) { - err = alloc_css_id(css); - if (err) - goto err_free_all; - } } /* @@@ -4547,7 -4357,7 +4301,7 @@@ static void css_killed_ref_fn(struct pe container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@@ -4610,7 -4420,6 +4364,6 @@@ static int cgroup_destroy_locked(struc __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; struct cgroup *child; bool empty; @@@ -4685,18 -4494,6 +4438,6 @@@ dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. 
- */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; @@@ -4869,6 -4666,12 +4610,6 @@@ int __init_or_module cgroup_load_subsys /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_css() because it sets css->id. */ - if (ss->use_id) { - ret = cgroup_init_idr(ss, css); - if (ret) - goto err_unload; - } /* * Now we need to entangle the css into the existing css_sets. unlike @@@ -4934,6 -4737,9 +4675,6 @@@ void cgroup_unload_subsys(struct cgroup offline_css(cgroup_css(cgroup_dummy_top, ss)); - if (ss->use_id) - idr_destroy(&ss->idr); - /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@@ -4960,7 -4766,8 +4701,7 @@@ /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. note that this - * also takes care of freeing the css_id. + * the cgrp->subsys pointer to find their state. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@@ -5031,6 -4838,8 +4772,6 @@@ int __init cgroup_init(void for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@@ -5071,22 -4880,6 +4812,22 @@@ out return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy @@@ -5466,6 -5259,181 +5207,6 @@@ static int __init cgroup_disable(char * } __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_raw(css->id); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. 
- */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - - child_id = rcu_dereference(child->id); - if (!child_id) - return false; - root_id = rcu_dereference(root->id); - if (!root_id) - return false; - if (child_id->depth < root_id->depth) - return false; - if (child_id->stack[root_id->depth] != root_id->id) - return false; - return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int ret, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - - idr_preload(GFP_KERNEL); - spin_lock(&ss->id_lock); - /* Don't use 0. allocates an ID of 1-65535 */ - ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); - spin_unlock(&ss->id_lock); - idr_preload_end(); - - /* Returns error when there are no free spaces for new ID.*/ - if (ret < 0) - goto err_out; - - newid->id = ret; - newid->depth = depth; - return newid; -err_out: - kfree(newid); - return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - spin_lock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - RCU_INIT_POINTER(newid->css, rootcss); - RCU_INIT_POINTER(rootcss->id, newid); - return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ - struct cgroup_subsys_state *parent_css = css_parent(child_css); - struct css_id *child_id, *parent_id; - int i, depth; - - parent_id = rcu_dereference_protected(parent_css->id, true); - depth = parent_id->depth + 1; - - child_id = get_new_cssid(child_css->ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. 
Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest diff --combined mm/memcontrol.c index f1a0ae6e11b8,ec8582b3a232..7aa0d405b148 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@@ -45,6 -45,7 +45,7 @@@ #include #include #include + #include #include #include #include @@@ -55,11 -56,11 +56,12 @@@ #include #include #include + #include #include "internal.h" #include #include #include +#include "slab.h" #include @@@ -227,6 -228,46 +229,46 @@@ struct mem_cgroup_eventfd_list struct eventfd_ctx *eventfd; }; + /* + * cgroup_event represents events which userspace want to receive. + */ + struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; + }; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@@ -313,7 -354,7 +355,7 @@@ struct mem_cgroup atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) - struct tcp_memcontrol tcp_mem; + struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) /* analogous to slab_common's slab_caches list. per-memcg */ @@@ -331,6 -372,10 +373,10 @@@ atomic_t numainfo_updating; #endif + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@@ -490,39 -535,11 +536,34 @@@ struct cgroup_subsys_state *vmpressure_ return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } - struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) - { - return &mem_cgroup_from_css(css)->vmpressure; - } - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. + */ +#define MEM_CGROUP_ID_MAX USHRT_MAX + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + /* + * The ID of the root cgroup is 0, but memcg treat 0 as an + * invalid ID, so we return (cgroup_id + 1). 
+ */ + return memcg->css.cgroup->id + 1; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id - 1, &mem_cgroup_subsys); + return mem_cgroup_from_css(css); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) @@@ -575,13 -592,13 +616,13 @@@ struct cg_proto *tcp_proto_cgroup(struc if (!memcg || mem_cgroup_is_root(memcg)) return NULL; - return &memcg->tcp_mem.cg_proto; + return &memcg->tcp_mem; } EXPORT_SYMBOL(tcp_proto_cgroup); static void disarm_sock_keys(struct mem_cgroup *memcg) { - if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) + if (!memcg_proto_activated(&memcg->tcp_mem)) return; static_key_slow_dec(&memcg_socket_limit_enabled); } @@@ -594,11 -611,16 +635,11 @@@ static void disarm_sock_keys(struct mem #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. - * There are two main reasons for not using the css_id for this: - * 1) this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. - * - * 2) In order not to violate the cgroup API, we would like to do all memory - * allocation in ->create(). At that point, we haven't yet allocated the - * css_id. Having a separate index prevents us from messing with the cgroup - * core for this + * The main reason for not using cgroup id for this: + * this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. * * The current size of the caches array is stored in * memcg_limited_groups_array_size. It will double each time we have to @@@ -613,14 -635,14 +654,14 @@@ int memcg_limited_groups_array_size * cgroups is a reasonable guess. In the future, it could be a parameter or * tunable, but that is strictly not necessary. * - * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the - * css_id space is not getting any smaller, and we don't have to necessarily + * cgrp_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. 
*/ #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE 65535 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* * A lot of the calls to the cache allocation functions are expected to be @@@ -1427,7 -1449,7 +1468,7 @@@ bool __mem_cgroup_same_or_subtree(cons return true; if (!root_memcg->use_hierarchy || !memcg) return false; - return css_is_ancestor(&memcg->css, &root_memcg->css); + return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); } static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, @@@ -2845,10 -2867,15 +2886,10 @@@ static void __mem_cgroup_cancel_local_c */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { - struct cgroup_subsys_state *css; - /* ID 0 is unused ID */ if (!id) return NULL; - css = css_lookup(&mem_cgroup_subsys, id); - if (!css) - return NULL; - return mem_cgroup_from_css(css); + return mem_cgroup_from_id(id); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@@ -2969,7 -2996,7 +3010,7 @@@ static struct kmem_cache *memcg_params_ VM_BUG_ON(p->is_root_cache); cachep = p->root_cache; - return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; + return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } #ifdef CONFIG_SLABINFO @@@ -2998,14 -3025,21 +3039,14 @@@ static int memcg_charge_kmem(struct mem struct res_counter *fail_res; struct mem_cgroup *_memcg; int ret = 0; - bool may_oom; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - /* - * Conditions under which we can wait for the oom_killer. Those are - * the same conditions tested by the core page allocator - */ - may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); - _memcg = memcg; ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, may_oom); + &_memcg, oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* @@@ -3145,7 -3179,7 +3186,7 @@@ int memcg_update_cache_size(struct kmem { struct memcg_cache_params *cur_params = s->memcg_params; - VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(s)); if (num_groups > memcg_limited_groups_array_size) { int i; @@@ -3406,7 -3440,7 +3447,7 @@@ static struct kmem_cache *memcg_create_ idx = memcg_cache_id(memcg); mutex_lock(&memcg_cache_mutex); - new_cachep = cachep->memcg_params->memcg_caches[idx]; + new_cachep = cache_from_memcg_idx(cachep, idx); if (new_cachep) { css_put(&memcg->css); goto out; @@@ -3452,8 -3486,8 +3493,8 @@@ void kmem_cache_destroy_memcg_children( * we'll take the set_limit_mutex to protect ourselves against this. */ mutex_lock(&set_limit_mutex); - for (i = 0; i < memcg_limited_groups_array_size; i++) { - c = s->memcg_params->memcg_caches[i]; + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); if (!c) continue; @@@ -3586,8 -3620,8 +3627,8 @@@ struct kmem_cache *__memcg_kmem_get_cac * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (likely(cachep->memcg_params->memcg_caches[idx])) { - cachep = cachep->memcg_params->memcg_caches[idx]; + if (likely(cache_from_memcg_idx(cachep, idx))) { + cachep = cache_from_memcg_idx(cachep, idx); goto out; } @@@ -4357,7 -4391,7 +4398,7 @@@ mem_cgroup_uncharge_swapcache(struct pa * css_get() was called in uncharge(). 
*/ if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, css_id(&memcg->css)); + swap_cgroup_record(ent, mem_cgroup_id(memcg)); } #endif @@@ -4409,8 -4443,8 +4450,8 @@@ static int mem_cgroup_move_swap_account { unsigned short old_id, new_id; - old_id = css_id(&from->css); - new_id = css_id(&to->css); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@@ -5383,50 -5417,45 +5424,50 @@@ static int mem_cgroup_move_charge_write static int memcg_numa_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; int nid; - unsigned long total_nr, file_nr, anon_nr, unevictable_nr; - unsigned long node_nr; + unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); - seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); - seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_FILE); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); - seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_ANON); - seq_printf(m, " N%d=%lu", nid, node_nr); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); + seq_printf(m, "%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + struct mem_cgroup *iter; + + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); + seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_node_nr_lru_pages( + iter, nid, stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); } - seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - BIT(LRU_UNEVICTABLE)); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); return 0; } #endif /* CONFIG_NUMA */ @@@ -5648,13 -5677,11 +5689,11 @@@ static void mem_cgroup_oom_notify(struc mem_cgroup_oom_notify_cb(iter); } - static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct 
mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@@ -5731,13 -5758,23 +5770,23 @@@ unlock return ret; } - static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) + { + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); + } + + static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) + { + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); + } + + static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@@ -5810,14 -5847,23 +5859,23 @@@ unlock mutex_unlock(&memcg->thresholds_lock); } - static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) + { + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); + } + + static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) + { + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); + } + + static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; - enum res_type type = MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@@ -5835,14 -5881,10 +5893,10 @@@ return 0; } - static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; - enum res_type type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@@ -5959,13 -6001,233 +6013,233 @@@ static void kmem_cgroup_css_offline(str } #endif + /* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + + /* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ + static void memcg_event_remove(struct work_struct *work) + { + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. 
*/ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); + } + + /* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ + static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) + { + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; + } + + static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) + { + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); + } + + /* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ + static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) + { + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + const char *name; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. 
+ */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. + */ + rcu_read_lock(); + + ret = -EINVAL; + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, + &mem_cgroup_subsys); + if (cfile_css == css && css_tryget(css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + + ret = event->register_event(memcg, event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + + out_put_css: + css_put(css); + out_put_cfile: + fdput(cfile); + out_put_eventfd: + eventfd_ctx_put(event->eventfd); + out_put_efile: + fdput(efile); + out_kfree: + kfree(event); + + return ret; + } + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@@ -6005,6 -6267,12 +6279,12 @@@ .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, + { + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, @@@ -6019,14 -6287,10 +6299,10 @@@ .name = "oom_control", .read_map = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", - .register_event = vmpressure_register_event, - .unregister_event = vmpressure_unregister_event, }, #ifdef CONFIG_NUMA { @@@ -6074,8 -6338,6 +6350,6 @@@ static struct cftype memsw_cgroup_files .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@@ -6178,6 -6440,7 +6452,6 @@@ static void __mem_cgroup_free(struct me size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); @@@ -6265,6 -6528,8 +6539,8 @@@ mem_cgroup_css_alloc(struct cgroup_subs mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); 
vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); return &memcg->css; @@@ -6280,9 -6545,6 +6556,9 @@@ mem_cgroup_css_online(struct cgroup_sub struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; + if (css->cgroup->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; + if (!parent) return 0; @@@ -6340,6 -6602,19 +6616,19 @@@ static void mem_cgroup_invalidate_recla static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); kmem_cgroup_css_offline(memcg); @@@ -6554,7 -6829,7 +6843,7 @@@ static enum mc_target_type get_mctgt_ty } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@@ -6605,10 -6880,10 +6894,10 @@@ static int mem_cgroup_count_precharge_p pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@@ -6797,9 -7072,9 +7086,9 @@@ static int mem_cgroup_move_charge_pte_r * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@@ -6816,7 -7091,7 +7105,7 @@@ } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@@ -6974,6 -7249,7 +7263,6 @@@ struct cgroup_subsys mem_cgroup_subsys .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, - .use_id = 1, }; #ifdef CONFIG_MEMCG_SWAP