Commit | Line | Data |
---|---|---|
1b1e1344 RG |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | ||
d12f6d22 RG |
3 | #include <linux/memcontrol.h> |
4 | #include <linux/swap.h> | |
5 | #include <linux/mm_inline.h> | |
e548ad4a RG |
6 | #include <linux/pagewalk.h> |
7 | #include <linux/backing-dev.h> | |
8 | #include <linux/swap_cgroup.h> | |
66d60c42 RG |
9 | #include <linux/eventfd.h> |
10 | #include <linux/poll.h> | |
11 | #include <linux/sort.h> | |
12 | #include <linux/file.h> | |
ea1e8796 | 13 | #include <linux/seq_buf.h> |
d12f6d22 | 14 | |
e548ad4a RG |
15 | #include "internal.h" |
16 | #include "swap.h" | |
1b1e1344 | 17 | #include "memcontrol-v1.h" |
d12f6d22 RG |
18 | |
19 | /* | |
20 | * Cgroups above their limits are maintained in a RB-Tree, independent of | |
21 | * their hierarchy representation | |
22 | */ | |
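/*
 * Each per-node tree is ordered by usage_in_excess; the cached
 * rb_rightmost entry is the memcg furthest over its soft limit and is
 * picked first by soft limit reclaim.
 */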
23 | ||
24 | struct mem_cgroup_tree_per_node { | |
25 | struct rb_root rb_root; | |
26 | struct rb_node *rb_rightmost; | |
27 | spinlock_t lock; | |
28 | }; | |
29 | ||
30 | struct mem_cgroup_tree { | |
31 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | |
32 | }; | |
33 | ||
34 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | |
35 | ||
36 | /* | |
37 | * Maximum loops in mem_cgroup_soft_reclaim(), used for soft | |
38 | * limit reclaim to prevent infinite loops, if they ever occur. | |
39 | */ | |
40 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | |
41 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | |
42 | ||
e548ad4a RG |
43 | /* Stuff for move charges at task migration. */ |
44 | /* | |
45 | * Types of charges to be moved. | |
46 | */ | |
5316b497 RG |
47 | #define MOVE_ANON 0x1ULL |
48 | #define MOVE_FILE 0x2ULL | |
e548ad4a RG |
49 | #define MOVE_MASK (MOVE_ANON | MOVE_FILE) |
50 | ||
51 | /* "mc" and its members are protected by cgroup_mutex */ | |
52 | static struct move_charge_struct { | |
53 | spinlock_t lock; /* for from, to */ | |
54 | struct mm_struct *mm; | |
55 | struct mem_cgroup *from; | |
56 | struct mem_cgroup *to; | |
57 | unsigned long flags; | |
58 | unsigned long precharge; | |
59 | unsigned long moved_charge; | |
60 | unsigned long moved_swap; | |
61 | struct task_struct *moving_task; /* a task moving charges */ | |
62 | wait_queue_head_t waitq; /* a waitq for other context */ | |
63 | } mc = { | |
64 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | |
65 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | |
66 | }; | |
67 | ||
66d60c42 RG |
68 | /* for OOM */ |
69 | struct mem_cgroup_eventfd_list { | |
70 | struct list_head list; | |
71 | struct eventfd_ctx *eventfd; | |
72 | }; | |
73 | ||
74 | /* | |
75 | * cgroup_event represents events which userspace wants to receive. | |
76 | */ | |
77 | struct mem_cgroup_event { | |
78 | /* | |
79 | * memcg which the event belongs to. | |
80 | */ | |
81 | struct mem_cgroup *memcg; | |
82 | /* | |
83 | * eventfd to signal userspace about the event. | |
84 | */ | |
85 | struct eventfd_ctx *eventfd; | |
86 | /* | |
87 | * Each of these is stored in a list by the cgroup. | |
88 | */ | |
89 | struct list_head list; | |
90 | /* | |
91 | * register_event() callback will be used to add a new userspace | |
92 | * waiter for changes related to this event. Use eventfd_signal() | |
93 | * on eventfd to send notification to userspace. | |
94 | */ | |
95 | int (*register_event)(struct mem_cgroup *memcg, | |
96 | struct eventfd_ctx *eventfd, const char *args); | |
97 | /* | |
98 | * unregister_event() callback will be called when userspace closes | |
99 | * the eventfd or on cgroup removal. This callback must be set | |
100 | * if you want to provide notification functionality. | |
101 | */ | |
102 | void (*unregister_event)(struct mem_cgroup *memcg, | |
103 | struct eventfd_ctx *eventfd); | |
104 | /* | |
105 | * All fields below are needed to unregister the event when | |
106 | * userspace closes the eventfd. | |
107 | */ | |
108 | poll_table pt; | |
109 | wait_queue_head_t *wqh; | |
110 | wait_queue_entry_t wait; | |
111 | struct work_struct remove; | |
112 | }; | |
113 | ||
ea1e8796 RG |
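/*
 * MEMFILE_PRIVATE() packs a resource type (e.g. _MEM, _MEMSWAP) in the
 * upper bits and a RES_* attribute in the low 16 bits of a single
 * value; MEMFILE_TYPE() and MEMFILE_ATTR() unpack them again.
 */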
114 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
115 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) | |
116 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | |
117 | ||
118 | enum { | |
119 | RES_USAGE, | |
120 | RES_LIMIT, | |
121 | RES_MAX_USAGE, | |
122 | RES_FAILCNT, | |
123 | RES_SOFT_LIMIT, | |
124 | }; | |
125 | ||
292fc2e0 RG |
126 | #ifdef CONFIG_LOCKDEP |
127 | static struct lockdep_map memcg_oom_lock_dep_map = { | |
128 | .name = "memcg_oom_lock", | |
129 | }; | |
130 | #endif | |
131 | ||
132 | DEFINE_SPINLOCK(memcg_oom_lock); | |
66d60c42 | 133 | |
d12f6d22 RG |
134 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, |
135 | struct mem_cgroup_tree_per_node *mctz, | |
136 | unsigned long new_usage_in_excess) | |
137 | { | |
138 | struct rb_node **p = &mctz->rb_root.rb_node; | |
139 | struct rb_node *parent = NULL; | |
140 | struct mem_cgroup_per_node *mz_node; | |
141 | bool rightmost = true; | |
142 | ||
143 | if (mz->on_tree) | |
144 | return; | |
145 | ||
146 | mz->usage_in_excess = new_usage_in_excess; | |
147 | if (!mz->usage_in_excess) | |
148 | return; | |
149 | while (*p) { | |
150 | parent = *p; | |
151 | mz_node = rb_entry(parent, struct mem_cgroup_per_node, | |
152 | tree_node); | |
153 | if (mz->usage_in_excess < mz_node->usage_in_excess) { | |
154 | p = &(*p)->rb_left; | |
155 | rightmost = false; | |
156 | } else { | |
157 | p = &(*p)->rb_right; | |
158 | } | |
159 | } | |
160 | ||
161 | if (rightmost) | |
162 | mctz->rb_rightmost = &mz->tree_node; | |
163 | ||
164 | rb_link_node(&mz->tree_node, parent, p); | |
165 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | |
166 | mz->on_tree = true; | |
167 | } | |
168 | ||
169 | static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, | |
170 | struct mem_cgroup_tree_per_node *mctz) | |
171 | { | |
172 | if (!mz->on_tree) | |
173 | return; | |
174 | ||
175 | if (&mz->tree_node == mctz->rb_rightmost) | |
176 | mctz->rb_rightmost = rb_prev(&mz->tree_node); | |
177 | ||
178 | rb_erase(&mz->tree_node, &mctz->rb_root); | |
179 | mz->on_tree = false; | |
180 | } | |
181 | ||
182 | static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, | |
183 | struct mem_cgroup_tree_per_node *mctz) | |
184 | { | |
185 | unsigned long flags; | |
186 | ||
187 | spin_lock_irqsave(&mctz->lock, flags); | |
188 | __mem_cgroup_remove_exceeded(mz, mctz); | |
189 | spin_unlock_irqrestore(&mctz->lock, flags); | |
190 | } | |
191 | ||
192 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | |
193 | { | |
194 | unsigned long nr_pages = page_counter_read(&memcg->memory); | |
195 | unsigned long soft_limit = READ_ONCE(memcg->soft_limit); | |
196 | unsigned long excess = 0; | |
197 | ||
198 | if (nr_pages > soft_limit) | |
199 | excess = nr_pages - soft_limit; | |
200 | ||
201 | return excess; | |
202 | } | |
203 | ||
34926e10 | 204 | static void memcg1_update_tree(struct mem_cgroup *memcg, int nid) |
d12f6d22 RG |
205 | { |
206 | unsigned long excess; | |
207 | struct mem_cgroup_per_node *mz; | |
208 | struct mem_cgroup_tree_per_node *mctz; | |
209 | ||
210 | if (lru_gen_enabled()) { | |
211 | if (soft_limit_excess(memcg)) | |
212 | lru_gen_soft_reclaim(memcg, nid); | |
213 | return; | |
214 | } | |
215 | ||
216 | mctz = soft_limit_tree.rb_tree_per_node[nid]; | |
217 | if (!mctz) | |
218 | return; | |
219 | /* | |
220 | * Necessary to update all ancestors when hierarchy is used, | |
221 | * because their event counter is not touched. | |
222 | */ | |
223 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | |
224 | mz = memcg->nodeinfo[nid]; | |
225 | excess = soft_limit_excess(memcg); | |
226 | /* | |
227 | * We have to update the tree if mz is on RB-tree or | |
228 | * mem is over its softlimit. | |
229 | */ | |
230 | if (excess || mz->on_tree) { | |
231 | unsigned long flags; | |
232 | ||
233 | spin_lock_irqsave(&mctz->lock, flags); | |
234 | /* if on-tree, remove it */ | |
235 | if (mz->on_tree) | |
236 | __mem_cgroup_remove_exceeded(mz, mctz); | |
237 | /* | |
238 | * Insert again. mz->usage_in_excess will be updated. | |
239 | * If excess is 0, no tree ops. | |
240 | */ | |
241 | __mem_cgroup_insert_exceeded(mz, mctz, excess); | |
242 | spin_unlock_irqrestore(&mctz->lock, flags); | |
243 | } | |
244 | } | |
245 | } | |
246 | ||
87024f58 | 247 | void memcg1_remove_from_trees(struct mem_cgroup *memcg) |
d12f6d22 RG |
248 | { |
249 | struct mem_cgroup_tree_per_node *mctz; | |
250 | struct mem_cgroup_per_node *mz; | |
251 | int nid; | |
252 | ||
253 | for_each_node(nid) { | |
254 | mz = memcg->nodeinfo[nid]; | |
255 | mctz = soft_limit_tree.rb_tree_per_node[nid]; | |
256 | if (mctz) | |
257 | mem_cgroup_remove_exceeded(mz, mctz); | |
258 | } | |
259 | } | |
260 | ||
261 | static struct mem_cgroup_per_node * | |
262 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) | |
263 | { | |
264 | struct mem_cgroup_per_node *mz; | |
265 | ||
266 | retry: | |
267 | mz = NULL; | |
268 | if (!mctz->rb_rightmost) | |
269 | goto done; /* Nothing to reclaim from */ | |
270 | ||
271 | mz = rb_entry(mctz->rb_rightmost, | |
272 | struct mem_cgroup_per_node, tree_node); | |
273 | /* | |
274 | * Remove the node now but someone else can add it back, | |
275 | * we will add it back at the end of reclaim to its correct | |
276 | * position in the tree. | |
277 | */ | |
278 | __mem_cgroup_remove_exceeded(mz, mctz); | |
279 | if (!soft_limit_excess(mz->memcg) || | |
280 | !css_tryget(&mz->memcg->css)) | |
281 | goto retry; | |
282 | done: | |
283 | return mz; | |
284 | } | |
285 | ||
286 | static struct mem_cgroup_per_node * | |
287 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) | |
288 | { | |
289 | struct mem_cgroup_per_node *mz; | |
290 | ||
291 | spin_lock_irq(&mctz->lock); | |
292 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | |
293 | spin_unlock_irq(&mctz->lock); | |
294 | return mz; | |
295 | } | |
296 | ||
297 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |
298 | pg_data_t *pgdat, | |
299 | gfp_t gfp_mask, | |
300 | unsigned long *total_scanned) | |
301 | { | |
302 | struct mem_cgroup *victim = NULL; | |
303 | int total = 0; | |
304 | int loop = 0; | |
305 | unsigned long excess; | |
306 | unsigned long nr_scanned; | |
307 | struct mem_cgroup_reclaim_cookie reclaim = { | |
308 | .pgdat = pgdat, | |
309 | }; | |
310 | ||
311 | excess = soft_limit_excess(root_memcg); | |
312 | ||
313 | while (1) { | |
314 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | |
315 | if (!victim) { | |
316 | loop++; | |
317 | if (loop >= 2) { | |
318 | /* | |
319 | * If we have not been able to reclaim | |
320 | * anything, it might be because there are | |
321 | * no reclaimable pages under this hierarchy | |
322 | */ | |
323 | if (!total) | |
324 | break; | |
325 | /* | |
326 | * We want to do more targeted reclaim. | |
327 | * excess >> 2 is not so large that we reclaim | |
328 | * too much, nor so small that we keep coming | |
329 | * back to reclaim from this cgroup. | |
330 | */ | |
331 | if (total >= (excess >> 2) || | |
332 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | |
333 | break; | |
334 | } | |
335 | continue; | |
336 | } | |
337 | total += mem_cgroup_shrink_node(victim, gfp_mask, false, | |
338 | pgdat, &nr_scanned); | |
339 | *total_scanned += nr_scanned; | |
340 | if (!soft_limit_excess(root_memcg)) | |
341 | break; | |
342 | } | |
343 | mem_cgroup_iter_break(root_memcg, victim); | |
344 | return total; | |
345 | } | |
346 | ||
87024f58 | 347 | unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, |
d12f6d22 RG |
348 | gfp_t gfp_mask, |
349 | unsigned long *total_scanned) | |
350 | { | |
351 | unsigned long nr_reclaimed = 0; | |
352 | struct mem_cgroup_per_node *mz, *next_mz = NULL; | |
353 | unsigned long reclaimed; | |
354 | int loop = 0; | |
355 | struct mem_cgroup_tree_per_node *mctz; | |
356 | unsigned long excess; | |
357 | ||
358 | if (lru_gen_enabled()) | |
359 | return 0; | |
360 | ||
361 | if (order > 0) | |
362 | return 0; | |
363 | ||
364 | mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; | |
365 | ||
366 | /* | |
367 | * Do not even bother to check the largest node if the root | |
368 | * is empty. Do it lockless to prevent lock bouncing. Races | |
369 | * are acceptable as soft limit is best effort anyway. | |
370 | */ | |
371 | if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) | |
372 | return 0; | |
373 | ||
374 | /* | |
375 | * This loop can run for a while, especially if mem_cgroups continuously | |
376 | * keep exceeding their soft limit and putting the system under | |
377 | * pressure. | |
378 | */ | |
379 | do { | |
380 | if (next_mz) | |
381 | mz = next_mz; | |
382 | else | |
383 | mz = mem_cgroup_largest_soft_limit_node(mctz); | |
384 | if (!mz) | |
385 | break; | |
386 | ||
387 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, | |
388 | gfp_mask, total_scanned); | |
389 | nr_reclaimed += reclaimed; | |
390 | spin_lock_irq(&mctz->lock); | |
391 | ||
392 | /* | |
393 | * If we failed to reclaim anything from this memory cgroup | |
394 | * it is time to move on to the next cgroup | |
395 | */ | |
396 | next_mz = NULL; | |
397 | if (!reclaimed) | |
398 | next_mz = __mem_cgroup_largest_soft_limit_node(mctz); | |
399 | ||
400 | excess = soft_limit_excess(mz->memcg); | |
401 | /* | |
402 | * One school of thought says that we should not add | |
403 | * back the node to the tree if reclaim returns 0. | |
404 | * But our reclaim could return 0 simply because, due | |
405 | * to priority, we are exposing a smaller subset of | |
406 | * memory to reclaim from. Consider this a longer | |
407 | * term TODO. | |
408 | */ | |
409 | /* If excess == 0, no tree ops */ | |
410 | __mem_cgroup_insert_exceeded(mz, mctz, excess); | |
411 | spin_unlock_irq(&mctz->lock); | |
412 | css_put(&mz->memcg->css); | |
413 | loop++; | |
414 | /* | |
415 | * Could not reclaim anything and there are no more | |
416 | * mem cgroups to try or we seem to be looping without | |
417 | * reclaiming anything. | |
418 | */ | |
419 | if (!nr_reclaimed && | |
420 | (next_mz == NULL || | |
421 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | |
422 | break; | |
423 | } while (!nr_reclaimed); | |
424 | if (next_mz) | |
425 | css_put(&next_mz->memcg->css); | |
426 | return nr_reclaimed; | |
427 | } | |
428 | ||
e548ad4a RG |
429 | /* |
430 | * A routine for checking whether "mem" is under move_account() or not. | |
431 | * | |
432 | * Checks whether a cgroup is mc.from or mc.to or under the hierarchy | |
433 | * of moving cgroups. This is for waiting at high memory pressure | |
434 | * caused by "move". | |
435 | */ | |
436 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | |
437 | { | |
438 | struct mem_cgroup *from; | |
439 | struct mem_cgroup *to; | |
440 | bool ret = false; | |
441 | /* | |
442 | * Unlike task_move routines, we access mc.to, mc.from not under | |
443 | * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. | |
444 | */ | |
445 | spin_lock(&mc.lock); | |
446 | from = mc.from; | |
447 | to = mc.to; | |
448 | if (!from) | |
449 | goto unlock; | |
450 | ||
451 | ret = mem_cgroup_is_descendant(from, memcg) || | |
452 | mem_cgroup_is_descendant(to, memcg); | |
453 | unlock: | |
454 | spin_unlock(&mc.lock); | |
455 | return ret; | |
456 | } | |
457 | ||
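/*
 * If another context is currently moving charges into or out of
 * @memcg's hierarchy, wait for it to finish. Returns true if a move
 * involving @memcg was in progress.
 */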
b9eaacb1 | 458 | bool memcg1_wait_acct_move(struct mem_cgroup *memcg) |
e548ad4a RG |
459 | { |
460 | if (mc.moving_task && current != mc.moving_task) { | |
461 | if (mem_cgroup_under_move(memcg)) { | |
462 | DEFINE_WAIT(wait); | |
463 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | |
464 | /* moving charge context might have finished. */ | |
465 | if (mc.moving_task) | |
466 | schedule(); | |
467 | finish_wait(&mc.waitq, &wait); | |
468 | return true; | |
469 | } | |
470 | } | |
471 | return false; | |
472 | } | |
473 | ||
474 | /** | |
475 | * folio_memcg_lock - Bind a folio to its memcg. | |
476 | * @folio: The folio. | |
477 | * | |
478 | * This function prevents unlocked LRU folios from being moved to | |
479 | * another cgroup. | |
480 | * | |
481 | * It ensures the lifetime of the bound memcg. The caller is responsible | |
482 | * for the lifetime of the folio. | |
483 | */ | |
484 | void folio_memcg_lock(struct folio *folio) | |
485 | { | |
486 | struct mem_cgroup *memcg; | |
487 | unsigned long flags; | |
488 | ||
489 | /* | |
490 | * The RCU lock is held throughout the transaction. The fast | |
491 | * path can get away without acquiring the memcg->move_lock | |
492 | * because page moving starts with an RCU grace period. | |
493 | */ | |
494 | rcu_read_lock(); | |
495 | ||
496 | if (mem_cgroup_disabled()) | |
497 | return; | |
498 | again: | |
499 | memcg = folio_memcg(folio); | |
500 | if (unlikely(!memcg)) | |
501 | return; | |
502 | ||
503 | #ifdef CONFIG_PROVE_LOCKING | |
504 | local_irq_save(flags); | |
505 | might_lock(&memcg->move_lock); | |
506 | local_irq_restore(flags); | |
507 | #endif | |
508 | ||
509 | if (atomic_read(&memcg->moving_account) <= 0) | |
510 | return; | |
511 | ||
512 | spin_lock_irqsave(&memcg->move_lock, flags); | |
513 | if (memcg != folio_memcg(folio)) { | |
514 | spin_unlock_irqrestore(&memcg->move_lock, flags); | |
515 | goto again; | |
516 | } | |
517 | ||
518 | /* | |
519 | * When charge migration first begins, we can have multiple | |
520 | * critical sections holding the fast-path RCU lock and one | |
521 | * holding the slowpath move_lock. Track the task who has the | |
522 | * move_lock for folio_memcg_unlock(). | |
523 | */ | |
524 | memcg->move_lock_task = current; | |
525 | memcg->move_lock_flags = flags; | |
526 | } | |
527 | ||
528 | static void __folio_memcg_unlock(struct mem_cgroup *memcg) | |
529 | { | |
530 | if (memcg && memcg->move_lock_task == current) { | |
531 | unsigned long flags = memcg->move_lock_flags; | |
532 | ||
533 | memcg->move_lock_task = NULL; | |
534 | memcg->move_lock_flags = 0; | |
535 | ||
536 | spin_unlock_irqrestore(&memcg->move_lock, flags); | |
537 | } | |
538 | ||
539 | rcu_read_unlock(); | |
540 | } | |
541 | ||
542 | /** | |
543 | * folio_memcg_unlock - Release the binding between a folio and its memcg. | |
544 | * @folio: The folio. | |
545 | * | |
546 | * This releases the binding created by folio_memcg_lock(). This does | |
547 | * not change the accounting of this folio to its memcg, but it does | |
548 | * permit others to change it. | |
549 | */ | |
550 | void folio_memcg_unlock(struct folio *folio) | |
551 | { | |
552 | __folio_memcg_unlock(folio_memcg(folio)); | |
553 | } | |
554 | ||
555 | #ifdef CONFIG_SWAP | |
556 | /** | |
557 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | |
558 | * @entry: swap entry to be moved | |
559 | * @from: mem_cgroup which the entry is moved from | |
560 | * @to: mem_cgroup which the entry is moved to | |
561 | * | |
562 | * It succeeds only when the swap_cgroup's record for this entry is the same | |
563 | * as the mem_cgroup's id of @from. | |
564 | * | |
565 | * Returns 0 on success, -EINVAL on failure. | |
566 | * | |
567 | * The caller must have charged to @to, IOW, called page_counter_charge() about | |
568 | * both res and memsw, and called css_get(). | |
569 | */ | |
570 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | |
571 | struct mem_cgroup *from, struct mem_cgroup *to) | |
572 | { | |
573 | unsigned short old_id, new_id; | |
574 | ||
575 | old_id = mem_cgroup_id(from); | |
576 | new_id = mem_cgroup_id(to); | |
577 | ||
578 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | |
579 | mod_memcg_state(from, MEMCG_SWAP, -1); | |
580 | mod_memcg_state(to, MEMCG_SWAP, 1); | |
581 | return 0; | |
582 | } | |
583 | return -EINVAL; | |
584 | } | |
585 | #else | |
586 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |
587 | struct mem_cgroup *from, struct mem_cgroup *to) | |
588 | { | |
589 | return -EINVAL; | |
590 | } | |
591 | #endif | |
592 | ||
ea1e8796 | 593 | static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, |
e548ad4a RG |
594 | struct cftype *cft) |
595 | { | |
596 | return mem_cgroup_from_css(css)->move_charge_at_immigrate; | |
597 | } | |
598 | ||
599 | #ifdef CONFIG_MMU | |
ea1e8796 | 600 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, |
e548ad4a RG |
601 | struct cftype *cft, u64 val) |
602 | { | |
603 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | |
604 | ||
605 | pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " | |
606 | "Please report your usecase to [email protected] if you " | |
607 | "depend on this functionality.\n"); | |
608 | ||
609 | if (val & ~MOVE_MASK) | |
610 | return -EINVAL; | |
611 | ||
612 | /* | |
613 | * No kind of locking is needed in here, because ->can_attach() will | |
614 | * check this value once in the beginning of the process, and then carry | |
615 | * on with stale data. This means that changes to this value will only | |
616 | * affect task migrations starting after the change. | |
617 | */ | |
618 | memcg->move_charge_at_immigrate = val; | |
619 | return 0; | |
620 | } | |
621 | #else | |
ea1e8796 | 622 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, |
e548ad4a RG |
623 | struct cftype *cft, u64 val) |
624 | { | |
625 | return -ENOSYS; | |
626 | } | |
627 | #endif | |
628 | ||
629 | #ifdef CONFIG_MMU | |
630 | /* Handlers for move charge at task migration. */ | |
631 | static int mem_cgroup_do_precharge(unsigned long count) | |
632 | { | |
633 | int ret; | |
634 | ||
635 | /* Try a single bulk charge without reclaim first, kswapd may wake */ | |
636 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); | |
637 | if (!ret) { | |
638 | mc.precharge += count; | |
639 | return ret; | |
640 | } | |
641 | ||
642 | /* Try charges one by one with reclaim, but do not retry */ | |
643 | while (count--) { | |
644 | ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); | |
645 | if (ret) | |
646 | return ret; | |
647 | mc.precharge++; | |
648 | cond_resched(); | |
649 | } | |
650 | return 0; | |
651 | } | |
652 | ||
653 | union mc_target { | |
654 | struct folio *folio; | |
655 | swp_entry_t ent; | |
656 | }; | |
657 | ||
658 | enum mc_target_type { | |
659 | MC_TARGET_NONE = 0, | |
660 | MC_TARGET_PAGE, | |
661 | MC_TARGET_SWAP, | |
662 | MC_TARGET_DEVICE, | |
663 | }; | |
664 | ||
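/*
 * Return the page mapped by a present pte if it is eligible for charge
 * moving according to mc.flags, with a reference taken; NULL otherwise.
 */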
665 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |
666 | unsigned long addr, pte_t ptent) | |
667 | { | |
668 | struct page *page = vm_normal_page(vma, addr, ptent); | |
669 | ||
670 | if (!page) | |
671 | return NULL; | |
672 | if (PageAnon(page)) { | |
673 | if (!(mc.flags & MOVE_ANON)) | |
674 | return NULL; | |
675 | } else { | |
676 | if (!(mc.flags & MOVE_FILE)) | |
677 | return NULL; | |
678 | } | |
679 | get_page(page); | |
680 | ||
681 | return page; | |
682 | } | |
683 | ||
684 | #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) | |
685 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |
686 | pte_t ptent, swp_entry_t *entry) | |
687 | { | |
688 | struct page *page = NULL; | |
689 | swp_entry_t ent = pte_to_swp_entry(ptent); | |
690 | ||
691 | if (!(mc.flags & MOVE_ANON)) | |
692 | return NULL; | |
693 | ||
694 | /* | |
695 | * Handle device private pages that are not accessible by the CPU, but | |
696 | * stored as special swap entries in the page table. | |
697 | */ | |
698 | if (is_device_private_entry(ent)) { | |
699 | page = pfn_swap_entry_to_page(ent); | |
700 | if (!get_page_unless_zero(page)) | |
701 | return NULL; | |
702 | return page; | |
703 | } | |
704 | ||
705 | if (non_swap_entry(ent)) | |
706 | return NULL; | |
707 | ||
708 | /* | |
709 | * Because swap_cache_get_folio() updates some statistics counter, | |
710 | * we call find_get_page() with swapper_space directly. | |
711 | */ | |
712 | page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); | |
713 | entry->val = ent.val; | |
714 | ||
715 | return page; | |
716 | } | |
717 | #else | |
718 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |
719 | pte_t ptent, swp_entry_t *entry) | |
720 | { | |
721 | return NULL; | |
722 | } | |
723 | #endif | |
724 | ||
725 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |
726 | unsigned long addr, pte_t ptent) | |
727 | { | |
728 | unsigned long index; | |
729 | struct folio *folio; | |
730 | ||
731 | if (!vma->vm_file) /* anonymous vma */ | |
732 | return NULL; | |
733 | if (!(mc.flags & MOVE_FILE)) | |
734 | return NULL; | |
735 | ||
736 | /* folio is moved even if it's not RSS of this task (page-faulted). */ | |
737 | /* shmem/tmpfs may report page out on swap: account for that too. */ | |
738 | index = linear_page_index(vma, addr); | |
739 | folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); | |
740 | if (IS_ERR(folio)) | |
741 | return NULL; | |
742 | return folio_file_page(folio, index); | |
743 | } | |
744 | ||
745 | /** | |
746 | * mem_cgroup_move_account - move account of the folio | |
747 | * @folio: The folio. | |
748 | * @compound: charge the page as compound or small page | |
749 | * @from: mem_cgroup which the folio is moved from. | |
750 | * @to: mem_cgroup which the folio is moved to. @from != @to. | |
751 | * | |
752 | * The folio must be locked and not on the LRU. | |
753 | * | |
754 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | |
755 | * from old cgroup. | |
756 | */ | |
757 | static int mem_cgroup_move_account(struct folio *folio, | |
758 | bool compound, | |
759 | struct mem_cgroup *from, | |
760 | struct mem_cgroup *to) | |
761 | { | |
762 | struct lruvec *from_vec, *to_vec; | |
763 | struct pglist_data *pgdat; | |
764 | unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; | |
765 | int nid, ret; | |
766 | ||
767 | VM_BUG_ON(from == to); | |
768 | VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); | |
769 | VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); | |
770 | VM_BUG_ON(compound && !folio_test_large(folio)); | |
771 | ||
772 | ret = -EINVAL; | |
773 | if (folio_memcg(folio) != from) | |
774 | goto out; | |
775 | ||
776 | pgdat = folio_pgdat(folio); | |
777 | from_vec = mem_cgroup_lruvec(from, pgdat); | |
778 | to_vec = mem_cgroup_lruvec(to, pgdat); | |
779 | ||
780 | folio_memcg_lock(folio); | |
781 | ||
782 | if (folio_test_anon(folio)) { | |
783 | if (folio_mapped(folio)) { | |
784 | __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); | |
785 | __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); | |
786 | if (folio_test_pmd_mappable(folio)) { | |
787 | __mod_lruvec_state(from_vec, NR_ANON_THPS, | |
788 | -nr_pages); | |
789 | __mod_lruvec_state(to_vec, NR_ANON_THPS, | |
790 | nr_pages); | |
791 | } | |
792 | } | |
793 | } else { | |
794 | __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); | |
795 | __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); | |
796 | ||
797 | if (folio_test_swapbacked(folio)) { | |
798 | __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); | |
799 | __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); | |
800 | } | |
801 | ||
802 | if (folio_mapped(folio)) { | |
803 | __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); | |
804 | __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); | |
805 | } | |
806 | ||
807 | if (folio_test_dirty(folio)) { | |
808 | struct address_space *mapping = folio_mapping(folio); | |
809 | ||
810 | if (mapping_can_writeback(mapping)) { | |
811 | __mod_lruvec_state(from_vec, NR_FILE_DIRTY, | |
812 | -nr_pages); | |
813 | __mod_lruvec_state(to_vec, NR_FILE_DIRTY, | |
814 | nr_pages); | |
815 | } | |
816 | } | |
817 | } | |
818 | ||
819 | #ifdef CONFIG_SWAP | |
820 | if (folio_test_swapcache(folio)) { | |
821 | __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); | |
822 | __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); | |
823 | } | |
824 | #endif | |
825 | if (folio_test_writeback(folio)) { | |
826 | __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); | |
827 | __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); | |
828 | } | |
829 | ||
830 | /* | |
831 | * All state has been migrated, let's switch to the new memcg. | |
832 | * | |
833 | * It is safe to change page's memcg here because the page | |
834 | * is referenced, charged, isolated, and locked: we can't race | |
835 | * with (un)charging, migration, LRU putback, or anything else | |
836 | * that would rely on a stable page's memory cgroup. | |
837 | * | |
838 | * Note that folio_memcg_lock is a memcg lock, not a page lock, | |
839 | * to save space. As soon as we switch page's memory cgroup to a | |
840 | * new memcg that isn't locked, the above state can change | |
841 | * concurrently again. Make sure we're truly done with it. | |
842 | */ | |
843 | smp_mb(); | |
844 | ||
845 | css_get(&to->css); | |
846 | css_put(&from->css); | |
847 | ||
848 | folio->memcg_data = (unsigned long)to; | |
849 | ||
850 | __folio_memcg_unlock(from); | |
851 | ||
852 | ret = 0; | |
853 | nid = folio_nid(folio); | |
854 | ||
855 | local_irq_disable(); | |
856 | mem_cgroup_charge_statistics(to, nr_pages); | |
cc7b8504 | 857 | memcg1_check_events(to, nid); |
e548ad4a | 858 | mem_cgroup_charge_statistics(from, -nr_pages); |
cc7b8504 | 859 | memcg1_check_events(from, nid); |
e548ad4a RG |
860 | local_irq_enable(); |
861 | out: | |
862 | return ret; | |
863 | } | |
864 | ||
865 | /** | |
866 | * get_mctgt_type - get target type of moving charge | |
867 | * @vma: the vma the pte to be checked belongs to | |
868 | * @addr: the address corresponding to the pte to be checked | |
869 | * @ptent: the pte to be checked | |
870 | * @target: the pointer where the target page or swap entry will be stored (can be NULL) | |
871 | * | |
872 | * Context: Called with pte lock held. | |
873 | * Return: | |
874 | * * MC_TARGET_NONE - If the pte is not a target for move charge. | |
875 | * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for | |
876 | * move charge. If @target is not NULL, the folio is stored in target->folio | |
877 | * with extra refcnt taken (Caller should release it). | |
878 | * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a | |
879 | * target for charge migration. If @target is not NULL, the entry is | |
880 | * stored in target->ent. | |
881 | * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and | |
882 | * thus not on the lru. For now such page is charged like a regular page | |
883 | * would be as it is just special memory taking the place of a regular page. | |
884 | * See Documentation/vm/hmm.txt and include/linux/hmm.h | |
885 | */ | |
886 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |
887 | unsigned long addr, pte_t ptent, union mc_target *target) | |
888 | { | |
889 | struct page *page = NULL; | |
890 | struct folio *folio; | |
891 | enum mc_target_type ret = MC_TARGET_NONE; | |
892 | swp_entry_t ent = { .val = 0 }; | |
893 | ||
894 | if (pte_present(ptent)) | |
895 | page = mc_handle_present_pte(vma, addr, ptent); | |
896 | else if (pte_none_mostly(ptent)) | |
897 | /* | |
898 | * PTE markers should be treated as a none pte here, separated | |
899 | * from other swap handling below. | |
900 | */ | |
901 | page = mc_handle_file_pte(vma, addr, ptent); | |
902 | else if (is_swap_pte(ptent)) | |
903 | page = mc_handle_swap_pte(vma, ptent, &ent); | |
904 | ||
905 | if (page) | |
906 | folio = page_folio(page); | |
907 | if (target && page) { | |
908 | if (!folio_trylock(folio)) { | |
909 | folio_put(folio); | |
910 | return ret; | |
911 | } | |
912 | /* | |
913 | * page_mapped() must be stable during the move. This | |
914 | * pte is locked, so if it's present, the page cannot | |
915 | * become unmapped. If it isn't, we have only partial | |
916 | * control over the mapped state: the page lock will | |
917 | * prevent new faults against pagecache and swapcache, | |
918 | * so an unmapped page cannot become mapped. However, | |
919 | * if the page is already mapped elsewhere, it can | |
920 | * unmap, and there is nothing we can do about it. | |
921 | * Alas, skip moving the page in this case. | |
922 | */ | |
923 | if (!pte_present(ptent) && page_mapped(page)) { | |
924 | folio_unlock(folio); | |
925 | folio_put(folio); | |
926 | return ret; | |
927 | } | |
928 | } | |
929 | ||
930 | if (!page && !ent.val) | |
931 | return ret; | |
932 | if (page) { | |
933 | /* | |
934 | * Do only a loose check w/o serialization. | |
935 | * mem_cgroup_move_account() checks whether the page is valid | |
936 | * under LRU exclusion. | |
937 | */ | |
938 | if (folio_memcg(folio) == mc.from) { | |
939 | ret = MC_TARGET_PAGE; | |
940 | if (folio_is_device_private(folio) || | |
941 | folio_is_device_coherent(folio)) | |
942 | ret = MC_TARGET_DEVICE; | |
943 | if (target) | |
944 | target->folio = folio; | |
945 | } | |
946 | if (!ret || !target) { | |
947 | if (target) | |
948 | folio_unlock(folio); | |
949 | folio_put(folio); | |
950 | } | |
951 | } | |
952 | /* | |
953 | * There is a swap entry and a page doesn't exist or isn't charged. | |
954 | * But we cannot move a tail-page in a THP. | |
955 | */ | |
956 | if (ent.val && !ret && (!page || !PageTransCompound(page)) && | |
957 | mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { | |
958 | ret = MC_TARGET_SWAP; | |
959 | if (target) | |
960 | target->ent = ent; | |
961 | } | |
962 | return ret; | |
963 | } | |
964 | ||
965 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | |
966 | /* | |
967 | * We don't consider PMD mapped swapping or file mapped pages because THP does | |
968 | * not support them for now. | |
969 | * Caller should make sure that pmd_trans_huge(pmd) is true. | |
970 | */ | |
971 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |
972 | unsigned long addr, pmd_t pmd, union mc_target *target) | |
973 | { | |
974 | struct page *page = NULL; | |
975 | struct folio *folio; | |
976 | enum mc_target_type ret = MC_TARGET_NONE; | |
977 | ||
978 | if (unlikely(is_swap_pmd(pmd))) { | |
979 | VM_BUG_ON(thp_migration_supported() && | |
980 | !is_pmd_migration_entry(pmd)); | |
981 | return ret; | |
982 | } | |
983 | page = pmd_page(pmd); | |
984 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | |
985 | folio = page_folio(page); | |
986 | if (!(mc.flags & MOVE_ANON)) | |
987 | return ret; | |
988 | if (folio_memcg(folio) == mc.from) { | |
989 | ret = MC_TARGET_PAGE; | |
990 | if (target) { | |
991 | folio_get(folio); | |
992 | if (!folio_trylock(folio)) { | |
993 | folio_put(folio); | |
994 | return MC_TARGET_NONE; | |
995 | } | |
996 | target->folio = folio; | |
997 | } | |
998 | } | |
999 | return ret; | |
1000 | } | |
1001 | #else | |
1002 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |
1003 | unsigned long addr, pmd_t pmd, union mc_target *target) | |
1004 | { | |
1005 | return MC_TARGET_NONE; | |
1006 | } | |
1007 | #endif | |
1008 | ||
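/*
 * Page-walk callback: count how many charges would have to be moved in
 * this PMD range and accumulate the result in mc.precharge.
 */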
1009 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |
1010 | unsigned long addr, unsigned long end, | |
1011 | struct mm_walk *walk) | |
1012 | { | |
1013 | struct vm_area_struct *vma = walk->vma; | |
1014 | pte_t *pte; | |
1015 | spinlock_t *ptl; | |
1016 | ||
1017 | ptl = pmd_trans_huge_lock(pmd, vma); | |
1018 | if (ptl) { | |
1019 | /* | |
1020 | * Note there cannot be MC_TARGET_DEVICE for now as we do not | |
1021 | * support transparent huge pages with MEMORY_DEVICE_PRIVATE but | |
1022 | * this might change. | |
1023 | */ | |
1024 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | |
1025 | mc.precharge += HPAGE_PMD_NR; | |
1026 | spin_unlock(ptl); | |
1027 | return 0; | |
1028 | } | |
1029 | ||
1030 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | |
1031 | if (!pte) | |
1032 | return 0; | |
1033 | for (; addr != end; pte++, addr += PAGE_SIZE) | |
1034 | if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) | |
1035 | mc.precharge++; /* increment precharge temporarily */ | |
1036 | pte_unmap_unlock(pte - 1, ptl); | |
1037 | cond_resched(); | |
1038 | ||
1039 | return 0; | |
1040 | } | |
1041 | ||
1042 | static const struct mm_walk_ops precharge_walk_ops = { | |
1043 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | |
1044 | .walk_lock = PGWALK_RDLOCK, | |
1045 | }; | |
1046 | ||
1047 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |
1048 | { | |
1049 | unsigned long precharge; | |
1050 | ||
1051 | mmap_read_lock(mm); | |
1052 | walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); | |
1053 | mmap_read_unlock(mm); | |
1054 | ||
1055 | precharge = mc.precharge; | |
1056 | mc.precharge = 0; | |
1057 | ||
1058 | return precharge; | |
1059 | } | |
1060 | ||
1061 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | |
1062 | { | |
1063 | unsigned long precharge = mem_cgroup_count_precharge(mm); | |
1064 | ||
1065 | VM_BUG_ON(mc.moving_task); | |
1066 | mc.moving_task = current; | |
1067 | return mem_cgroup_do_precharge(precharge); | |
1068 | } | |
1069 | ||
1070 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ | |
1071 | static void __mem_cgroup_clear_mc(void) | |
1072 | { | |
1073 | struct mem_cgroup *from = mc.from; | |
1074 | struct mem_cgroup *to = mc.to; | |
1075 | ||
1076 | /* we must uncharge all the leftover precharges from mc.to */ | |
1077 | if (mc.precharge) { | |
1078 | mem_cgroup_cancel_charge(mc.to, mc.precharge); | |
1079 | mc.precharge = 0; | |
1080 | } | |
1081 | /* | |
1082 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | |
1083 | * we must uncharge here. | |
1084 | */ | |
1085 | if (mc.moved_charge) { | |
1086 | mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | |
1087 | mc.moved_charge = 0; | |
1088 | } | |
1089 | /* we must fixup refcnts and charges */ | |
1090 | if (mc.moved_swap) { | |
1091 | /* uncharge swap account from the old cgroup */ | |
1092 | if (!mem_cgroup_is_root(mc.from)) | |
1093 | page_counter_uncharge(&mc.from->memsw, mc.moved_swap); | |
1094 | ||
1095 | mem_cgroup_id_put_many(mc.from, mc.moved_swap); | |
1096 | ||
1097 | /* | |
1098 | * we charged both to->memory and to->memsw, so we | |
1099 | * should uncharge to->memory. | |
1100 | */ | |
1101 | if (!mem_cgroup_is_root(mc.to)) | |
1102 | page_counter_uncharge(&mc.to->memory, mc.moved_swap); | |
1103 | ||
1104 | mc.moved_swap = 0; | |
1105 | } | |
8d49b699 RG |
1106 | memcg1_oom_recover(from); |
1107 | memcg1_oom_recover(to); | |
e548ad4a RG |
1108 | wake_up_all(&mc.waitq); |
1109 | } | |
1110 | ||
1111 | static void mem_cgroup_clear_mc(void) | |
1112 | { | |
1113 | struct mm_struct *mm = mc.mm; | |
1114 | ||
1115 | /* | |
1116 | * we must clear moving_task before waking up waiters at the end of | |
1117 | * task migration. | |
1118 | */ | |
1119 | mc.moving_task = NULL; | |
1120 | __mem_cgroup_clear_mc(); | |
1121 | spin_lock(&mc.lock); | |
1122 | mc.from = NULL; | |
1123 | mc.to = NULL; | |
1124 | mc.mm = NULL; | |
1125 | spin_unlock(&mc.lock); | |
1126 | ||
1127 | mmput(mm); | |
1128 | } | |
1129 | ||
b9eaacb1 | 1130 | int memcg1_can_attach(struct cgroup_taskset *tset) |
e548ad4a RG |
1131 | { |
1132 | struct cgroup_subsys_state *css; | |
1133 | struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ | |
1134 | struct mem_cgroup *from; | |
1135 | struct task_struct *leader, *p; | |
1136 | struct mm_struct *mm; | |
1137 | unsigned long move_flags; | |
1138 | int ret = 0; | |
1139 | ||
1140 | /* charge immigration isn't supported on the default hierarchy */ | |
1141 | if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) | |
1142 | return 0; | |
1143 | ||
1144 | /* | |
1145 | * Multi-process migrations only happen on the default hierarchy | |
1146 | * where charge immigration is not used. Perform charge | |
1147 | * immigration if @tset contains a leader and whine if there are | |
1148 | * multiple. | |
1149 | */ | |
1150 | p = NULL; | |
1151 | cgroup_taskset_for_each_leader(leader, css, tset) { | |
1152 | WARN_ON_ONCE(p); | |
1153 | p = leader; | |
1154 | memcg = mem_cgroup_from_css(css); | |
1155 | } | |
1156 | if (!p) | |
1157 | return 0; | |
1158 | ||
1159 | /* | |
1160 | * We are now committed to this value whatever it is. Changes in this | |
1161 | * tunable will only affect upcoming migrations, not the current one. | |
1162 | * So we need to save it, and keep it going. | |
1163 | */ | |
1164 | move_flags = READ_ONCE(memcg->move_charge_at_immigrate); | |
1165 | if (!move_flags) | |
1166 | return 0; | |
1167 | ||
1168 | from = mem_cgroup_from_task(p); | |
1169 | ||
1170 | VM_BUG_ON(from == memcg); | |
1171 | ||
1172 | mm = get_task_mm(p); | |
1173 | if (!mm) | |
1174 | return 0; | |
1175 | /* We move charges only when we move the owner of the mm */ | |
1176 | if (mm->owner == p) { | |
1177 | VM_BUG_ON(mc.from); | |
1178 | VM_BUG_ON(mc.to); | |
1179 | VM_BUG_ON(mc.precharge); | |
1180 | VM_BUG_ON(mc.moved_charge); | |
1181 | VM_BUG_ON(mc.moved_swap); | |
1182 | ||
1183 | spin_lock(&mc.lock); | |
1184 | mc.mm = mm; | |
1185 | mc.from = from; | |
1186 | mc.to = memcg; | |
1187 | mc.flags = move_flags; | |
1188 | spin_unlock(&mc.lock); | |
1189 | /* We set mc.moving_task later */ | |
1190 | ||
1191 | ret = mem_cgroup_precharge_mc(mm); | |
1192 | if (ret) | |
1193 | mem_cgroup_clear_mc(); | |
1194 | } else { | |
1195 | mmput(mm); | |
1196 | } | |
1197 | return ret; | |
1198 | } | |
1199 | ||
b9eaacb1 | 1200 | void memcg1_cancel_attach(struct cgroup_taskset *tset) |
e548ad4a RG |
1201 | { |
1202 | if (mc.to) | |
1203 | mem_cgroup_clear_mc(); | |
1204 | } | |
1205 | ||
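/*
 * Page-walk callback: move the charge for each target page or swap
 * entry in this PMD range from mc.from to mc.to, consuming the
 * precharge accumulated earlier.
 */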
1206 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |
1207 | unsigned long addr, unsigned long end, | |
1208 | struct mm_walk *walk) | |
1209 | { | |
1210 | int ret = 0; | |
1211 | struct vm_area_struct *vma = walk->vma; | |
1212 | pte_t *pte; | |
1213 | spinlock_t *ptl; | |
1214 | enum mc_target_type target_type; | |
1215 | union mc_target target; | |
1216 | struct folio *folio; | |
1217 | ||
1218 | ptl = pmd_trans_huge_lock(pmd, vma); | |
1219 | if (ptl) { | |
1220 | if (mc.precharge < HPAGE_PMD_NR) { | |
1221 | spin_unlock(ptl); | |
1222 | return 0; | |
1223 | } | |
1224 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | |
1225 | if (target_type == MC_TARGET_PAGE) { | |
1226 | folio = target.folio; | |
1227 | if (folio_isolate_lru(folio)) { | |
1228 | if (!mem_cgroup_move_account(folio, true, | |
1229 | mc.from, mc.to)) { | |
1230 | mc.precharge -= HPAGE_PMD_NR; | |
1231 | mc.moved_charge += HPAGE_PMD_NR; | |
1232 | } | |
1233 | folio_putback_lru(folio); | |
1234 | } | |
1235 | folio_unlock(folio); | |
1236 | folio_put(folio); | |
1237 | } else if (target_type == MC_TARGET_DEVICE) { | |
1238 | folio = target.folio; | |
1239 | if (!mem_cgroup_move_account(folio, true, | |
1240 | mc.from, mc.to)) { | |
1241 | mc.precharge -= HPAGE_PMD_NR; | |
1242 | mc.moved_charge += HPAGE_PMD_NR; | |
1243 | } | |
1244 | folio_unlock(folio); | |
1245 | folio_put(folio); | |
1246 | } | |
1247 | spin_unlock(ptl); | |
1248 | return 0; | |
1249 | } | |
1250 | ||
1251 | retry: | |
1252 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | |
1253 | if (!pte) | |
1254 | return 0; | |
1255 | for (; addr != end; addr += PAGE_SIZE) { | |
1256 | pte_t ptent = ptep_get(pte++); | |
1257 | bool device = false; | |
1258 | swp_entry_t ent; | |
1259 | ||
1260 | if (!mc.precharge) | |
1261 | break; | |
1262 | ||
1263 | switch (get_mctgt_type(vma, addr, ptent, &target)) { | |
1264 | case MC_TARGET_DEVICE: | |
1265 | device = true; | |
1266 | fallthrough; | |
1267 | case MC_TARGET_PAGE: | |
1268 | folio = target.folio; | |
1269 | /* | |
1270 | * We can have a part of the split pmd here. Moving it | |
1271 | * can be done but it would be too convoluted so simply | |
1272 | * ignore such a partial THP and keep it in original | |
1273 | * memcg. There should be somebody mapping the head. | |
1274 | */ | |
1275 | if (folio_test_large(folio)) | |
1276 | goto put; | |
1277 | if (!device && !folio_isolate_lru(folio)) | |
1278 | goto put; | |
1279 | if (!mem_cgroup_move_account(folio, false, | |
1280 | mc.from, mc.to)) { | |
1281 | mc.precharge--; | |
1282 | /* we uncharge from mc.from later. */ | |
1283 | mc.moved_charge++; | |
1284 | } | |
1285 | if (!device) | |
1286 | folio_putback_lru(folio); | |
1287 | put: /* get_mctgt_type() gets & locks the page */ | |
1288 | folio_unlock(folio); | |
1289 | folio_put(folio); | |
1290 | break; | |
1291 | case MC_TARGET_SWAP: | |
1292 | ent = target.ent; | |
1293 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { | |
1294 | mc.precharge--; | |
1295 | mem_cgroup_id_get_many(mc.to, 1); | |
1296 | /* we fixup other refcnts and charges later. */ | |
1297 | mc.moved_swap++; | |
1298 | } | |
1299 | break; | |
1300 | default: | |
1301 | break; | |
1302 | } | |
1303 | } | |
1304 | pte_unmap_unlock(pte - 1, ptl); | |
1305 | cond_resched(); | |
1306 | ||
1307 | if (addr != end) { | |
1308 | /* | |
1309 | * We have consumed all precharges we got in can_attach(). | |
1310 | * We try to charge one by one, but don't do any additional | |
1311 | * charges to mc.to if we have already failed a charge once in | |
1312 | * the attach() phase. | |
1313 | */ | |
1314 | ret = mem_cgroup_do_precharge(1); | |
1315 | if (!ret) | |
1316 | goto retry; | |
1317 | } | |
1318 | ||
1319 | return ret; | |
1320 | } | |
1321 | ||
1322 | static const struct mm_walk_ops charge_walk_ops = { | |
1323 | .pmd_entry = mem_cgroup_move_charge_pte_range, | |
1324 | .walk_lock = PGWALK_RDLOCK, | |
1325 | }; | |
1326 | ||
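/*
 * Walk the whole address space of mc.mm and move every matching charge
 * from mc.from to mc.to, consuming the precharge taken in can_attach().
 */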
1327 | static void mem_cgroup_move_charge(void) | |
1328 | { | |
1329 | lru_add_drain_all(); | |
1330 | /* | |
1331 | * Signal folio_memcg_lock() to take the memcg's move_lock | |
1332 | * while we're moving its pages to another memcg. Then wait | |
1333 | * for already started RCU-only updates to finish. | |
1334 | */ | |
1335 | atomic_inc(&mc.from->moving_account); | |
1336 | synchronize_rcu(); | |
1337 | retry: | |
1338 | if (unlikely(!mmap_read_trylock(mc.mm))) { | |
1339 | /* | |
1340 | * Someone who is holding the mmap_lock might be waiting in the | |
1341 | * waitq. So we cancel all extra charges, wake up all waiters, | |
1342 | * and retry. Because we cancel precharges, we might not be able | |
1343 | * to move enough charges, but moving charge is a best-effort | |
1344 | * feature anyway, so it wouldn't be a big problem. | |
1345 | */ | |
1346 | __mem_cgroup_clear_mc(); | |
1347 | cond_resched(); | |
1348 | goto retry; | |
1349 | } | |
1350 | /* | |
1351 | * When we have consumed all precharges and failed to do an | |
1352 | * additional charge, the page walk just aborts. | |
1353 | */ | |
1354 | walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); | |
1355 | mmap_read_unlock(mc.mm); | |
1356 | atomic_dec(&mc.from->moving_account); | |
1357 | } | |
1358 | ||
b9eaacb1 | 1359 | void memcg1_move_task(void) |
e548ad4a RG |
1360 | { |
1361 | if (mc.to) { | |
1362 | mem_cgroup_move_charge(); | |
1363 | mem_cgroup_clear_mc(); | |
1364 | } | |
1365 | } | |
1366 | ||
1367 | #else /* !CONFIG_MMU */ | |
b9eaacb1 | 1368 | int memcg1_can_attach(struct cgroup_taskset *tset) |
e548ad4a RG |
1369 | { |
1370 | return 0; | |
1371 | } | |
b9eaacb1 | 1372 | void memcg1_cancel_attach(struct cgroup_taskset *tset) |
e548ad4a RG |
1373 | { |
1374 | } | |
b9eaacb1 | 1375 | void memcg1_move_task(void) |
e548ad4a RG |
1376 | { |
1377 | } | |
1378 | #endif | |
1379 | ||
66d60c42 RG |
1380 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
1381 | { | |
1382 | struct mem_cgroup_threshold_ary *t; | |
1383 | unsigned long usage; | |
1384 | int i; | |
1385 | ||
1386 | rcu_read_lock(); | |
1387 | if (!swap) | |
1388 | t = rcu_dereference(memcg->thresholds.primary); | |
1389 | else | |
1390 | t = rcu_dereference(memcg->memsw_thresholds.primary); | |
1391 | ||
1392 | if (!t) | |
1393 | goto unlock; | |
1394 | ||
1395 | usage = mem_cgroup_usage(memcg, swap); | |
1396 | ||
1397 | /* | |
1398 | * current_threshold points to the threshold just below or equal to usage. | |
1399 | * If that is not the case, a threshold was crossed after the last | |
1400 | * call of __mem_cgroup_threshold(). | |
1401 | */ | |
1402 | i = t->current_threshold; | |
1403 | ||
1404 | /* | |
1405 | * Iterate backward over array of thresholds starting from | |
1406 | * current_threshold and check if a threshold is crossed. | |
1407 | * If none of the thresholds below usage is crossed, we read | |
1408 | * only one element of the array here. | |
1409 | */ | |
1410 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | |
1411 | eventfd_signal(t->entries[i].eventfd); | |
1412 | ||
1413 | /* i = current_threshold + 1 */ | |
1414 | i++; | |
1415 | ||
1416 | /* | |
1417 | * Iterate forward over array of thresholds starting from | |
1418 | * current_threshold+1 and check if a threshold is crossed. | |
1419 | * If none of the thresholds above usage is crossed, we read | |
1420 | * only one element of the array here. | |
1421 | */ | |
1422 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | |
1423 | eventfd_signal(t->entries[i].eventfd); | |
1424 | ||
1425 | /* Update current_threshold */ | |
1426 | t->current_threshold = i - 1; | |
1427 | unlock: | |
1428 | rcu_read_unlock(); | |
1429 | } | |
1430 | ||
1431 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | |
1432 | { | |
1433 | while (memcg) { | |
1434 | __mem_cgroup_threshold(memcg, false); | |
1435 | if (do_memsw_account()) | |
1436 | __mem_cgroup_threshold(memcg, true); | |
1437 | ||
1438 | memcg = parent_mem_cgroup(memcg); | |
1439 | } | |
1440 | } | |
1441 | ||
1442 | /* | |
1443 | * Check events in order: thresholds first, then the soft limit | |
1444 | * tree update. | |
1445 | */ | |
cc7b8504 | 1446 | void memcg1_check_events(struct mem_cgroup *memcg, int nid) |
66d60c42 RG |
1447 | { |
1448 | if (IS_ENABLED(CONFIG_PREEMPT_RT)) | |
1449 | return; | |
1450 | ||
1451 | /* threshold event is triggered in finer grain than soft limit */ | |
1452 | if (unlikely(mem_cgroup_event_ratelimit(memcg, | |
1453 | MEM_CGROUP_TARGET_THRESH))) { | |
1454 | bool do_softlimit; | |
1455 | ||
1456 | do_softlimit = mem_cgroup_event_ratelimit(memcg, | |
1457 | MEM_CGROUP_TARGET_SOFTLIMIT); | |
1458 | mem_cgroup_threshold(memcg); | |
1459 | if (unlikely(do_softlimit)) | |
1460 | memcg1_update_tree(memcg, nid); | |
1461 | } | |
1462 | } | |
1463 | ||
1464 | static int compare_thresholds(const void *a, const void *b) | |
1465 | { | |
1466 | const struct mem_cgroup_threshold *_a = a; | |
1467 | const struct mem_cgroup_threshold *_b = b; | |
1468 | ||
1469 | if (_a->threshold > _b->threshold) | |
1470 | return 1; | |
1471 | ||
1472 | if (_a->threshold < _b->threshold) | |
1473 | return -1; | |
1474 | ||
1475 | return 0; | |
1476 | } | |
1477 | ||
1478 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) | |
1479 | { | |
1480 | struct mem_cgroup_eventfd_list *ev; | |
1481 | ||
1482 | spin_lock(&memcg_oom_lock); | |
1483 | ||
1484 | list_for_each_entry(ev, &memcg->oom_notify, list) | |
1485 | eventfd_signal(ev->eventfd); | |
1486 | ||
1487 | spin_unlock(&memcg_oom_lock); | |
1488 | return 0; | |
1489 | } | |
1490 | ||
292fc2e0 | 1491 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
66d60c42 RG |
1492 | { |
1493 | struct mem_cgroup *iter; | |
1494 | ||
1495 | for_each_mem_cgroup_tree(iter, memcg) | |
1496 | mem_cgroup_oom_notify_cb(iter); | |
1497 | } | |
1498 | ||
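/*
 * Register a usage threshold: build a new, sorted copy of the threshold
 * array including the new entry, publish it with rcu_assign_pointer()
 * and keep the old array around as ->spare.
 */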
1499 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |
1500 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) | |
1501 | { | |
1502 | struct mem_cgroup_thresholds *thresholds; | |
1503 | struct mem_cgroup_threshold_ary *new; | |
1504 | unsigned long threshold; | |
1505 | unsigned long usage; | |
1506 | int i, size, ret; | |
1507 | ||
1508 | ret = page_counter_memparse(args, "-1", &threshold); | |
1509 | if (ret) | |
1510 | return ret; | |
1511 | ||
1512 | mutex_lock(&memcg->thresholds_lock); | |
1513 | ||
1514 | if (type == _MEM) { | |
1515 | thresholds = &memcg->thresholds; | |
1516 | usage = mem_cgroup_usage(memcg, false); | |
1517 | } else if (type == _MEMSWAP) { | |
1518 | thresholds = &memcg->memsw_thresholds; | |
1519 | usage = mem_cgroup_usage(memcg, true); | |
1520 | } else | |
1521 | BUG(); | |
1522 | ||
1523 | /* Check if a threshold was crossed before adding a new one */ | |
1524 | if (thresholds->primary) | |
1525 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | |
1526 | ||
1527 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; | |
1528 | ||
1529 | /* Allocate memory for new array of thresholds */ | |
1530 | new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); | |
1531 | if (!new) { | |
1532 | ret = -ENOMEM; | |
1533 | goto unlock; | |
1534 | } | |
1535 | new->size = size; | |
1536 | ||
1537 | /* Copy thresholds (if any) to new array */ | |
1538 | if (thresholds->primary) | |
1539 | memcpy(new->entries, thresholds->primary->entries, | |
1540 | flex_array_size(new, entries, size - 1)); | |
1541 | ||
1542 | /* Add new threshold */ | |
1543 | new->entries[size - 1].eventfd = eventfd; | |
1544 | new->entries[size - 1].threshold = threshold; | |
1545 | ||
1546 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | |
1547 | sort(new->entries, size, sizeof(*new->entries), | |
1548 | compare_thresholds, NULL); | |
1549 | ||
1550 | /* Find current threshold */ | |
1551 | new->current_threshold = -1; | |
1552 | for (i = 0; i < size; i++) { | |
1553 | if (new->entries[i].threshold <= usage) { | |
1554 | /* | |
1555 | * new->current_threshold will not be used until | |
1556 | * rcu_assign_pointer(), so it's safe to increment | |
1557 | * it here. | |
1558 | */ | |
1559 | ++new->current_threshold; | |
1560 | } else | |
1561 | break; | |
1562 | } | |
1563 | ||
1564 | /* Free old spare buffer and save old primary buffer as spare */ | |
1565 | kfree(thresholds->spare); | |
1566 | thresholds->spare = thresholds->primary; | |
1567 | ||
1568 | rcu_assign_pointer(thresholds->primary, new); | |
1569 | ||
1570 | /* To be sure that nobody uses thresholds */ | |
1571 | synchronize_rcu(); | |
1572 | ||
1573 | unlock: | |
1574 | mutex_unlock(&memcg->thresholds_lock); | |
1575 | ||
1576 | return ret; | |
1577 | } | |
1578 | ||
1579 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |
1580 | struct eventfd_ctx *eventfd, const char *args) | |
1581 | { | |
1582 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | |
1583 | } | |
1584 | ||
1585 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | |
1586 | struct eventfd_ctx *eventfd, const char *args) | |
1587 | { | |
1588 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | |
1589 | } | |
1590 | ||
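/*
 * Unregister a usage threshold: rebuild the array without the entries
 * matching @eventfd, reusing the ->spare buffer, and publish it with
 * rcu_assign_pointer().
 */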
1591 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |
1592 | struct eventfd_ctx *eventfd, enum res_type type) | |
1593 | { | |
1594 | struct mem_cgroup_thresholds *thresholds; | |
1595 | struct mem_cgroup_threshold_ary *new; | |
1596 | unsigned long usage; | |
1597 | int i, j, size, entries; | |
1598 | ||
1599 | mutex_lock(&memcg->thresholds_lock); | |
1600 | ||
1601 | if (type == _MEM) { | |
1602 | thresholds = &memcg->thresholds; | |
1603 | usage = mem_cgroup_usage(memcg, false); | |
1604 | } else if (type == _MEMSWAP) { | |
1605 | thresholds = &memcg->memsw_thresholds; | |
1606 | usage = mem_cgroup_usage(memcg, true); | |
1607 | } else | |
1608 | BUG(); | |
1609 | ||
1610 | if (!thresholds->primary) | |
1611 | goto unlock; | |
1612 | ||
1613 | /* Check if a threshold was crossed before removing */ | |
1614 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | |
1615 | ||
1616 | /* Calculate the new number of thresholds */ | |
1617 | size = entries = 0; | |
1618 | for (i = 0; i < thresholds->primary->size; i++) { | |
1619 | if (thresholds->primary->entries[i].eventfd != eventfd) | |
1620 | size++; | |
1621 | else | |
1622 | entries++; | |
1623 | } | |
1624 | ||
1625 | new = thresholds->spare; | |
1626 | ||
1627 | /* If no items related to eventfd have been cleared, nothing to do */ | |
1628 | if (!entries) | |
1629 | goto unlock; | |
1630 | ||
1631 | /* Set thresholds array to NULL if we don't have thresholds */ | |
1632 | if (!size) { | |
1633 | kfree(new); | |
1634 | new = NULL; | |
1635 | goto swap_buffers; | |
1636 | } | |
1637 | ||
1638 | new->size = size; | |
1639 | ||
1640 | /* Copy thresholds and find current threshold */ | |
1641 | new->current_threshold = -1; | |
1642 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { | |
1643 | if (thresholds->primary->entries[i].eventfd == eventfd) | |
1644 | continue; | |
1645 | ||
1646 | new->entries[j] = thresholds->primary->entries[i]; | |
1647 | if (new->entries[j].threshold <= usage) { | |
1648 | /* | |
1649 | * new->current_threshold will not be used | |
1650 | * until rcu_assign_pointer(), so it's safe to increment | |
1651 | * it here. | |
1652 | */ | |
1653 | ++new->current_threshold; | |
1654 | } | |
1655 | j++; | |
1656 | } | |
1657 | ||
1658 | swap_buffers: | |
1659 | /* Swap primary and spare array */ | |
1660 | thresholds->spare = thresholds->primary; | |
1661 | ||
1662 | rcu_assign_pointer(thresholds->primary, new); | |
1663 | ||
1664 | /* To be sure that nobody uses thresholds */ | |
1665 | synchronize_rcu(); | |
1666 | ||
1667 | /* If all events are unregistered, free the spare array */ | |
1668 | if (!new) { | |
1669 | kfree(thresholds->spare); | |
1670 | thresholds->spare = NULL; | |
1671 | } | |
1672 | unlock: | |
1673 | mutex_unlock(&memcg->thresholds_lock); | |
1674 | } | |
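The register and unregister paths both follow the same RCU double-buffer scheme: readers dereference thresholds->primary under rcu_read_lock(), the writer builds the replacement array in the spare buffer, publishes it with rcu_assign_pointer(), and only recycles or frees the displaced array once synchronize_rcu() guarantees no reader can still see it. A condensed kernel-style sketch of that publish-then-retire pattern, using simplified stand-in types rather than the memcg structures (and freeing the old buffer instead of keeping it as a spare):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct demo_cfg {
		int nentries;
		int entries[];
	};

	static struct demo_cfg __rcu *active_cfg;

	/* Reader side: no locks, just an RCU-protected dereference. */
	static int demo_first_entry(void)
	{
		struct demo_cfg *c;
		int val = -1;

		rcu_read_lock();
		c = rcu_dereference(active_cfg);
		if (c && c->nentries)
			val = c->entries[0];
		rcu_read_unlock();
		return val;
	}

	/* Writer side (externally serialized): publish, wait for readers, retire. */
	static void demo_replace_cfg(struct demo_cfg *new)
	{
		struct demo_cfg *old = rcu_dereference_protected(active_cfg, 1);

		rcu_assign_pointer(active_cfg, new);
		synchronize_rcu();	/* no reader can still hold a pointer to 'old' */
		kfree(old);
	}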
1675 | ||
1676 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |
1677 | struct eventfd_ctx *eventfd) | |
1678 | { | |
1679 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | |
1680 | } | |
1681 | ||
1682 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |
1683 | struct eventfd_ctx *eventfd) | |
1684 | { | |
1685 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | |
1686 | } | |
1687 | ||
1688 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | |
1689 | struct eventfd_ctx *eventfd, const char *args) | |
1690 | { | |
1691 | struct mem_cgroup_eventfd_list *event; | |
1692 | ||
1693 | event = kmalloc(sizeof(*event), GFP_KERNEL); | |
1694 | if (!event) | |
1695 | return -ENOMEM; | |
1696 | ||
1697 | spin_lock(&memcg_oom_lock); | |
1698 | ||
1699 | event->eventfd = eventfd; | |
1700 | list_add(&event->list, &memcg->oom_notify); | |
1701 | ||
1702 | /* already under OOM? */ | |
1703 | if (memcg->under_oom) | |
1704 | eventfd_signal(eventfd); | |
1705 | spin_unlock(&memcg_oom_lock); | |
1706 | ||
1707 | return 0; | |
1708 | } | |
1709 | ||
1710 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, | |
1711 | struct eventfd_ctx *eventfd) | |
1712 | { | |
1713 | struct mem_cgroup_eventfd_list *ev, *tmp; | |
1714 | ||
1715 | spin_lock(&memcg_oom_lock); | |
1716 | ||
1717 | list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { | |
1718 | if (ev->eventfd == eventfd) { | |
1719 | list_del(&ev->list); | |
1720 | kfree(ev); | |
1721 | } | |
1722 | } | |
1723 | ||
1724 | spin_unlock(&memcg_oom_lock); | |
1725 | } | |
1726 | ||
1727 | /* | |
1728 | * DO NOT USE IN NEW FILES. | |
1729 | * | |
1730 | * "cgroup.event_control" implementation. | |
1731 | * | |
1732 | * This is way over-engineered. It tries to support fully configurable | |
1733 | * events for each user. Such a level of flexibility is completely | |
1734 | * unnecessary, especially in light of the planned unified hierarchy. | |
1735 | * | |
1736 | * Please deprecate this and replace with something simpler if at all | |
1737 | * possible. | |
1738 | */ | |
1739 | ||
1740 | /* | |
1741 | * Unregister event and free resources. | |
1742 | * | |
1743 | * Gets called from workqueue. | |
1744 | */ | |
1745 | static void memcg_event_remove(struct work_struct *work) | |
1746 | { | |
1747 | struct mem_cgroup_event *event = | |
1748 | container_of(work, struct mem_cgroup_event, remove); | |
1749 | struct mem_cgroup *memcg = event->memcg; | |
1750 | ||
1751 | remove_wait_queue(event->wqh, &event->wait); | |
1752 | ||
1753 | event->unregister_event(memcg, event->eventfd); | |
1754 | ||
1755 | /* Notify userspace the event is going away. */ | |
1756 | eventfd_signal(event->eventfd); | |
1757 | ||
1758 | eventfd_ctx_put(event->eventfd); | |
1759 | kfree(event); | |
1760 | css_put(&memcg->css); | |
1761 | } | |
1762 | ||
1763 | /* | |
1764 | * Gets called on EPOLLHUP on eventfd when user closes it. | |
1765 | * | |
1766 | * Called with wqh->lock held and interrupts disabled. | |
1767 | */ | |
1768 | static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, | |
1769 | int sync, void *key) | |
1770 | { | |
1771 | struct mem_cgroup_event *event = | |
1772 | container_of(wait, struct mem_cgroup_event, wait); | |
1773 | struct mem_cgroup *memcg = event->memcg; | |
1774 | __poll_t flags = key_to_poll(key); | |
1775 | ||
1776 | if (flags & EPOLLHUP) { | |
1777 | /* | |
1778 | * If the event has been detached at cgroup removal, we | |
1779 | * can simply return knowing the other side will clean up | |
1780 | * for us. | |
1781 | * | |
1782 | * We can't race against event freeing since the other | |
1783 | * side will require wqh->lock via remove_wait_queue(), | |
1784 | * which we hold. | |
1785 | */ | |
1786 | spin_lock(&memcg->event_list_lock); | |
1787 | if (!list_empty(&event->list)) { | |
1788 | list_del_init(&event->list); | |
1789 | /* | |
1790 | * We are in atomic context, but memcg_event_remove() | |
1791 | * may sleep, so we have to call it via a workqueue. | |
1792 | */ | |
1793 | schedule_work(&event->remove); | |
1794 | } | |
1795 | spin_unlock(&memcg->event_list_lock); | |
1796 | } | |
1797 | ||
1798 | return 0; | |
1799 | } | |
1800 | ||
1801 | static void memcg_event_ptable_queue_proc(struct file *file, | |
1802 | wait_queue_head_t *wqh, poll_table *pt) | |
1803 | { | |
1804 | struct mem_cgroup_event *event = | |
1805 | container_of(pt, struct mem_cgroup_event, pt); | |
1806 | ||
1807 | event->wqh = wqh; | |
1808 | add_wait_queue(wqh, &event->wait); | |
1809 | } | |
1810 | ||
1811 | /* | |
1812 | * DO NOT USE IN NEW FILES. | |
1813 | * | |
1814 | * Parse input and register new cgroup event handler. | |
1815 | * | |
1816 | * Input must be in the format '<event_fd> <control_fd> <args>'. | |
1817 | * Interpretation of args is defined by control file implementation. | |
1818 | */ | |
ea1e8796 RG |
1819 | static ssize_t memcg_write_event_control(struct kernfs_open_file *of, |
1820 | char *buf, size_t nbytes, loff_t off) | |
66d60c42 RG |
1821 | { |
1822 | struct cgroup_subsys_state *css = of_css(of); | |
1823 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | |
1824 | struct mem_cgroup_event *event; | |
1825 | struct cgroup_subsys_state *cfile_css; | |
1826 | unsigned int efd, cfd; | |
1827 | struct fd efile; | |
1828 | struct fd cfile; | |
1829 | struct dentry *cdentry; | |
1830 | const char *name; | |
1831 | char *endp; | |
1832 | int ret; | |
1833 | ||
1834 | if (IS_ENABLED(CONFIG_PREEMPT_RT)) | |
1835 | return -EOPNOTSUPP; | |
1836 | ||
1837 | buf = strstrip(buf); | |
1838 | ||
1839 | efd = simple_strtoul(buf, &endp, 10); | |
1840 | if (*endp != ' ') | |
1841 | return -EINVAL; | |
1842 | buf = endp + 1; | |
1843 | ||
1844 | cfd = simple_strtoul(buf, &endp, 10); | |
046667c4 AV |
1845 | if (*endp == '\0') |
1846 | buf = endp; | |
1847 | else if (*endp == ' ') | |
1848 | buf = endp + 1; | |
1849 | else | |
66d60c42 | 1850 | return -EINVAL; |
66d60c42 RG |
1851 | |
1852 | event = kzalloc(sizeof(*event), GFP_KERNEL); | |
1853 | if (!event) | |
1854 | return -ENOMEM; | |
1855 | ||
1856 | event->memcg = memcg; | |
1857 | INIT_LIST_HEAD(&event->list); | |
1858 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | |
1859 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); | |
1860 | INIT_WORK(&event->remove, memcg_event_remove); | |
1861 | ||
1862 | efile = fdget(efd); | |
1863 | if (!efile.file) { | |
1864 | ret = -EBADF; | |
1865 | goto out_kfree; | |
1866 | } | |
1867 | ||
1868 | event->eventfd = eventfd_ctx_fileget(efile.file); | |
1869 | if (IS_ERR(event->eventfd)) { | |
1870 | ret = PTR_ERR(event->eventfd); | |
1871 | goto out_put_efile; | |
1872 | } | |
1873 | ||
1874 | cfile = fdget(cfd); | |
1875 | if (!cfile.file) { | |
1876 | ret = -EBADF; | |
1877 | goto out_put_eventfd; | |
1878 | } | |
1879 | ||
1880 | /* the process needs read permission on the control file */ | |
1881 | /* AV: shouldn't we check that it's been opened for read instead? */ | |
1882 | ret = file_permission(cfile.file, MAY_READ); | |
1883 | if (ret < 0) | |
1884 | goto out_put_cfile; | |
1885 | ||
1886 | /* | |
1887 | * The control file must be a regular cgroup1 file. As a regular cgroup | |
1888 | * file can't be renamed, it's safe to access its name afterwards. | |
1889 | */ | |
1890 | cdentry = cfile.file->f_path.dentry; | |
1891 | if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { | |
1892 | ret = -EINVAL; | |
1893 | goto out_put_cfile; | |
1894 | } | |
1895 | ||
1896 | /* | |
1897 | * Determine the event callbacks and set them in @event. This used | |
1898 | * to be done via struct cftype but cgroup core no longer knows | |
1899 | * about these events. The following is crude but the whole thing | |
1900 | * is for compatibility anyway. | |
1901 | * | |
1902 | * DO NOT ADD NEW FILES. | |
1903 | */ | |
1904 | name = cdentry->d_name.name; | |
1905 | ||
1906 | if (!strcmp(name, "memory.usage_in_bytes")) { | |
1907 | event->register_event = mem_cgroup_usage_register_event; | |
1908 | event->unregister_event = mem_cgroup_usage_unregister_event; | |
1909 | } else if (!strcmp(name, "memory.oom_control")) { | |
1910 | event->register_event = mem_cgroup_oom_register_event; | |
1911 | event->unregister_event = mem_cgroup_oom_unregister_event; | |
1912 | } else if (!strcmp(name, "memory.pressure_level")) { | |
1913 | event->register_event = vmpressure_register_event; | |
1914 | event->unregister_event = vmpressure_unregister_event; | |
1915 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | |
1916 | event->register_event = memsw_cgroup_usage_register_event; | |
1917 | event->unregister_event = memsw_cgroup_usage_unregister_event; | |
1918 | } else { | |
1919 | ret = -EINVAL; | |
1920 | goto out_put_cfile; | |
1921 | } | |
1922 | ||
1923 | /* | |
1924 | * Verify that @cfile belongs to @css. Also, remaining events are | |
1925 | * automatically removed on cgroup destruction but the removal is | |
1926 | * asynchronous, so take an extra ref on @css. | |
1927 | */ | |
1928 | cfile_css = css_tryget_online_from_dir(cdentry->d_parent, | |
1929 | &memory_cgrp_subsys); | |
1930 | ret = -EINVAL; | |
1931 | if (IS_ERR(cfile_css)) | |
1932 | goto out_put_cfile; | |
1933 | if (cfile_css != css) { | |
1934 | css_put(cfile_css); | |
1935 | goto out_put_cfile; | |
1936 | } | |
1937 | ||
1938 | ret = event->register_event(memcg, event->eventfd, buf); | |
1939 | if (ret) | |
1940 | goto out_put_css; | |
1941 | ||
1942 | vfs_poll(efile.file, &event->pt); | |
1943 | ||
1944 | spin_lock_irq(&memcg->event_list_lock); | |
1945 | list_add(&event->list, &memcg->event_list); | |
1946 | spin_unlock_irq(&memcg->event_list_lock); | |
1947 | ||
1948 | fdput(cfile); | |
1949 | fdput(efile); | |
1950 | ||
1951 | return nbytes; | |
1952 | ||
1953 | out_put_css: | |
1954 | css_put(css); | |
1955 | out_put_cfile: | |
1956 | fdput(cfile); | |
1957 | out_put_eventfd: | |
1958 | eventfd_ctx_put(event->eventfd); | |
1959 | out_put_efile: | |
1960 | fdput(efile); | |
1961 | out_kfree: | |
1962 | kfree(event); | |
1963 | ||
1964 | return ret; | |
1965 | } | |
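Hooking this up from userspace follows the format parsed above: create an eventfd, open the control file inside the cgroup, and write "<event_fd> <control_fd> <args>" to that cgroup's cgroup.event_control. The sketch below registers a usage threshold on memory.usage_in_bytes; the cgroup path and the 64 MiB threshold are illustrative assumptions. For memory.oom_control the trailing <args> is simply omitted, and for memory.pressure_level it is the pressure level string.

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		/* Illustrative cgroup v1 path; adjust to the real mount point/group. */
		const char *grp = "/sys/fs/cgroup/memory/demo";
		char path[256], cmd[64];
		uint64_t count;
		int efd, cfd, ctrl;

		efd = eventfd(0, 0);

		snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
		cfd = open(path, O_RDONLY);

		snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
		ctrl = open(path, O_WRONLY);

		if (efd < 0 || cfd < 0 || ctrl < 0) {
			perror("setup");
			return 1;
		}

		/* "<event_fd> <control_fd> <args>": notify on crossing 64 MiB. */
		snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 64ULL << 20);
		if (write(ctrl, cmd, strlen(cmd)) < 0) {
			perror("register");
			return 1;
		}

		/* The eventfd read blocks until the threshold is crossed. */
		if (read(efd, &count, sizeof(count)) == sizeof(count))
			printf("usage threshold crossed (count=%llu)\n",
			       (unsigned long long)count);
		return 0;
	}

The same registration string also works against memory.memsw.usage_in_bytes when swap accounting is enabled, which is handled by the memsw variants above.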
1966 | ||
b5855a26 RG |
1967 | void memcg1_memcg_init(struct mem_cgroup *memcg) |
1968 | { | |
1969 | INIT_LIST_HEAD(&memcg->oom_notify); | |
1970 | mutex_init(&memcg->thresholds_lock); | |
1971 | spin_lock_init(&memcg->move_lock); | |
1972 | INIT_LIST_HEAD(&memcg->event_list); | |
1973 | spin_lock_init(&memcg->event_list_lock); | |
1974 | } | |
1975 | ||
66d60c42 RG |
1976 | void memcg1_css_offline(struct mem_cgroup *memcg) |
1977 | { | |
1978 | struct mem_cgroup_event *event, *tmp; | |
1979 | ||
1980 | /* | |
1981 | * Unregister events and notify userspace. | |
1982 | * Notify userspace about cgroup removal only after rmdir of the cgroup | |
1983 | * directory to avoid races between userspace and the kernel. | |
1984 | */ | |
1985 | spin_lock_irq(&memcg->event_list_lock); | |
1986 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | |
1987 | list_del_init(&event->list); | |
1988 | schedule_work(&event->remove); | |
1989 | } | |
1990 | spin_unlock_irq(&memcg->event_list_lock); | |
1991 | } | |
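Both the EPOLLHUP handler earlier and memcg1_css_offline() run in contexts that cannot sleep (under the waitqueue lock or event_list_lock), while the actual teardown in memcg_event_remove() calls unregister_event() and eventfd_ctx_put(), which may sleep, so the work is deferred to a workqueue. A generic kernel-style sketch of that defer-to-workqueue pattern, with hypothetical types rather than the memcg event structures:

	#include <linux/workqueue.h>
	#include <linux/slab.h>
	#include <linux/container_of.h>

	struct deferred_release {
		struct work_struct work;
		void *resource;
	};

	static void deferred_release_fn(struct work_struct *work)
	{
		struct deferred_release *dr =
			container_of(work, struct deferred_release, work);

		/* Runs in process context on a workqueue, so sleeping is fine here. */
		kfree(dr->resource);
		kfree(dr);
	}

	/* May be called from atomic context (e.g. under a spinlock). */
	static void release_later(struct deferred_release *dr)
	{
		INIT_WORK(&dr->work, deferred_release_fn);
		schedule_work(&dr->work);
	}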
1992 | ||
292fc2e0 RG |
1993 | /* |
1994 | * Check whether the OOM killer is already running under our hierarchy. | |
1995 | * If some other context already holds the lock, return false. | |
1996 | */ | |
1997 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) | |
1998 | { | |
1999 | struct mem_cgroup *iter, *failed = NULL; | |
2000 | ||
2001 | spin_lock(&memcg_oom_lock); | |
2002 | ||
2003 | for_each_mem_cgroup_tree(iter, memcg) { | |
2004 | if (iter->oom_lock) { | |
2005 | /* | |
2006 | * this subtree of our hierarchy is already locked, | |
2007 | * so we cannot take the lock. | |
2008 | */ | |
2009 | failed = iter; | |
2010 | mem_cgroup_iter_break(memcg, iter); | |
2011 | break; | |
2012 | } else | |
2013 | iter->oom_lock = true; | |
2014 | } | |
2015 | ||
2016 | if (failed) { | |
2017 | /* | |
2018 | * OK, we failed to lock the whole subtree, so we have | |
2019 | * to clean up what we already set up, up to the failing subtree. | |
2020 | */ | |
2021 | for_each_mem_cgroup_tree(iter, memcg) { | |
2022 | if (iter == failed) { | |
2023 | mem_cgroup_iter_break(memcg, iter); | |
2024 | break; | |
2025 | } | |
2026 | iter->oom_lock = false; | |
2027 | } | |
2028 | } else | |
2029 | mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); | |
2030 | ||
2031 | spin_unlock(&memcg_oom_lock); | |
2032 | ||
2033 | return !failed; | |
2034 | } | |
2035 | ||
2036 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | |
2037 | { | |
2038 | struct mem_cgroup *iter; | |
2039 | ||
2040 | spin_lock(&memcg_oom_lock); | |
2041 | mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); | |
2042 | for_each_mem_cgroup_tree(iter, memcg) | |
2043 | iter->oom_lock = false; | |
2044 | spin_unlock(&memcg_oom_lock); | |
2045 | } | |
2046 | ||
2047 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | |
2048 | { | |
2049 | struct mem_cgroup *iter; | |
2050 | ||
2051 | spin_lock(&memcg_oom_lock); | |
2052 | for_each_mem_cgroup_tree(iter, memcg) | |
2053 | iter->under_oom++; | |
2054 | spin_unlock(&memcg_oom_lock); | |
2055 | } | |
2056 | ||
2057 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | |
2058 | { | |
2059 | struct mem_cgroup *iter; | |
2060 | ||
2061 | /* | |
2062 | * Be careful about under_oom underflows because a child memcg | |
2063 | * could have been added after mem_cgroup_mark_under_oom. | |
2064 | */ | |
2065 | spin_lock(&memcg_oom_lock); | |
2066 | for_each_mem_cgroup_tree(iter, memcg) | |
2067 | if (iter->under_oom > 0) | |
2068 | iter->under_oom--; | |
2069 | spin_unlock(&memcg_oom_lock); | |
2070 | } | |
2071 | ||
2072 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | |
2073 | ||
2074 | struct oom_wait_info { | |
2075 | struct mem_cgroup *memcg; | |
2076 | wait_queue_entry_t wait; | |
2077 | }; | |
2078 | ||
2079 | static int memcg_oom_wake_function(wait_queue_entry_t *wait, | |
2080 | unsigned mode, int sync, void *arg) | |
2081 | { | |
2082 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; | |
2083 | struct mem_cgroup *oom_wait_memcg; | |
2084 | struct oom_wait_info *oom_wait_info; | |
2085 | ||
2086 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | |
2087 | oom_wait_memcg = oom_wait_info->memcg; | |
2088 | ||
2089 | if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && | |
2090 | !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) | |
2091 | return 0; | |
2092 | return autoremove_wake_function(wait, mode, sync, arg); | |
2093 | } | |
2094 | ||
8d49b699 | 2095 | void memcg1_oom_recover(struct mem_cgroup *memcg) |
292fc2e0 RG |
2096 | { |
2097 | /* | |
2098 | * For the following lockless ->under_oom test, the only required | |
2099 | * guarantee is that it must see the state asserted by an OOM when | |
2100 | * this function is called as a result of userland actions | |
2101 | * triggered by the notification of the OOM. This is trivially | |
2102 | * achieved by invoking mem_cgroup_mark_under_oom() before | |
2103 | * triggering notification. | |
2104 | */ | |
2105 | if (memcg && memcg->under_oom) | |
2106 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | |
2107 | } | |
2108 | ||
2109 | /** | |
2110 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | |
2111 | * @handle: actually kill/wait or just clean up the OOM state | |
2112 | * | |
2113 | * This has to be called at the end of a page fault if the memcg OOM | |
2114 | * handler was enabled. | |
2115 | * | |
2116 | * Memcg supports userspace OOM handling where failed allocations must | |
2117 | * sleep on a waitqueue until the userspace task resolves the | |
2118 | * situation. Sleeping directly in the charge context with all kinds | |
2119 | * of locks held is not a good idea, instead we remember an OOM state | |
2120 | * in the task and mem_cgroup_oom_synchronize() has to be called at | |
2121 | * the end of the page fault to complete the OOM handling. | |
2122 | * | |
2123 | * Returns %true if an ongoing memcg OOM situation was detected and | |
2124 | * completed, %false otherwise. | |
2125 | */ | |
2126 | bool mem_cgroup_oom_synchronize(bool handle) | |
2127 | { | |
2128 | struct mem_cgroup *memcg = current->memcg_in_oom; | |
2129 | struct oom_wait_info owait; | |
2130 | bool locked; | |
2131 | ||
2132 | /* OOM is global, do not handle */ | |
2133 | if (!memcg) | |
2134 | return false; | |
2135 | ||
2136 | if (!handle) | |
2137 | goto cleanup; | |
2138 | ||
2139 | owait.memcg = memcg; | |
2140 | owait.wait.flags = 0; | |
2141 | owait.wait.func = memcg_oom_wake_function; | |
2142 | owait.wait.private = current; | |
2143 | INIT_LIST_HEAD(&owait.wait.entry); | |
2144 | ||
2145 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | |
2146 | mem_cgroup_mark_under_oom(memcg); | |
2147 | ||
2148 | locked = mem_cgroup_oom_trylock(memcg); | |
2149 | ||
2150 | if (locked) | |
2151 | mem_cgroup_oom_notify(memcg); | |
2152 | ||
2153 | schedule(); | |
2154 | mem_cgroup_unmark_under_oom(memcg); | |
2155 | finish_wait(&memcg_oom_waitq, &owait.wait); | |
2156 | ||
2157 | if (locked) | |
2158 | mem_cgroup_oom_unlock(memcg); | |
2159 | cleanup: | |
2160 | current->memcg_in_oom = NULL; | |
2161 | css_put(&memcg->css); | |
2162 | return true; | |
2163 | } | |
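mem_cgroup_oom_synchronize() is the kernel half of this deal; the userspace half typically disables the in-kernel OOM killer by writing "1" to memory.oom_control, registers an eventfd through cgroup.event_control, and resolves each notification by freeing memory, raising the limit, or killing a task itself. Raising memory.limit_in_bytes wakes the blocked allocators (mem_cgroup_resize_max() calls memcg1_oom_recover() when the limit is enlarged, see below). A rough sketch of such a handler loop; the helper names, paths, and the grow-by-128-MiB policy are illustrative assumptions:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>

	static long long read_ll(const char *path)
	{
		char buf[32] = "";
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		if (read(fd, buf, sizeof(buf) - 1) <= 0)
			buf[0] = '\0';
		close(fd);
		return buf[0] ? atoll(buf) : -1;
	}

	static int write_str(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);
		int ret = 0;

		if (fd < 0)
			return -1;
		if (write(fd, val, strlen(val)) < 0)
			ret = -1;
		close(fd);
		return ret;
	}

	/*
	 * Hypothetical handler loop: @efd is an eventfd already registered
	 * against memory.oom_control via cgroup.event_control (see the earlier
	 * sketch), and the kernel OOM killer was disabled by writing "1" to
	 * memory.oom_control beforehand.
	 */
	static void handle_oom_events(int efd, const char *grp)
	{
		char path[256], val[32];
		uint64_t count;

		snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", grp);

		while (read(efd, &count, sizeof(count)) == sizeof(count)) {
			long long limit = read_ll(path);

			if (limit <= 0 || limit > (1LL << 50))
				break;	/* unreadable or effectively unlimited */
			snprintf(val, sizeof(val), "%lld", limit + (128LL << 20));
			if (write_str(path, val))
				break;
			/* raising the limit wakes tasks blocked in
			 * mem_cgroup_oom_synchronize() so they can retry */
		}
	}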
2164 | ||
2165 | ||
2166 | bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) | |
2167 | { | |
2168 | /* | |
2169 | * We are in the middle of the charge context here, so we | |
2170 | * don't want to block when potentially sitting on a callstack | |
2171 | * that holds all kinds of filesystem and mm locks. | |
2172 | * | |
2173 | * cgroup1 allows disabling the OOM killer and waiting for outside | |
2174 | * handling until the charge can succeed; remember the context and put | |
2175 | * the task to sleep at the end of the page fault when all locks are | |
2176 | * released. | |
2177 | * | |
2178 | * On the other hand, the in-kernel OOM killer allows for async victim | |
2179 | * memory reclaim (oom_reaper), which means that we are not solely | |
2180 | * relying on the OOM victim to make forward progress, so we can | |
2181 | * invoke the OOM killer here. | |
2182 | * | |
2183 | * Please note that mem_cgroup_out_of_memory might fail to find a | |
2184 | * victim and then we have to bail out from the charge path. | |
2185 | */ | |
2186 | if (READ_ONCE(memcg->oom_kill_disable)) { | |
2187 | if (current->in_user_fault) { | |
2188 | css_get(&memcg->css); | |
2189 | current->memcg_in_oom = memcg; | |
2190 | } | |
2191 | return false; | |
2192 | } | |
2193 | ||
2194 | mem_cgroup_mark_under_oom(memcg); | |
2195 | ||
2196 | *locked = mem_cgroup_oom_trylock(memcg); | |
2197 | ||
2198 | if (*locked) | |
2199 | mem_cgroup_oom_notify(memcg); | |
2200 | ||
2201 | mem_cgroup_unmark_under_oom(memcg); | |
2202 | ||
2203 | return true; | |
2204 | } | |
2205 | ||
2206 | void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) | |
2207 | { | |
2208 | if (locked) | |
2209 | mem_cgroup_oom_unlock(memcg); | |
2210 | } | |
2211 | ||
ea1e8796 RG |
2212 | static DEFINE_MUTEX(memcg_max_mutex); |
2213 | ||
2214 | static int mem_cgroup_resize_max(struct mem_cgroup *memcg, | |
2215 | unsigned long max, bool memsw) | |
2216 | { | |
2217 | bool enlarge = false; | |
2218 | bool drained = false; | |
2219 | int ret; | |
2220 | bool limits_invariant; | |
2221 | struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; | |
2222 | ||
2223 | do { | |
2224 | if (signal_pending(current)) { | |
2225 | ret = -EINTR; | |
2226 | break; | |
2227 | } | |
2228 | ||
2229 | mutex_lock(&memcg_max_mutex); | |
2230 | /* | |
2231 | * Make sure that the new limit (memsw or memory limit) doesn't | |
2232 | * break our basic invariant rule memory.max <= memsw.max. | |
2233 | */ | |
2234 | limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : | |
2235 | max <= memcg->memsw.max; | |
2236 | if (!limits_invariant) { | |
2237 | mutex_unlock(&memcg_max_mutex); | |
2238 | ret = -EINVAL; | |
2239 | break; | |
2240 | } | |
2241 | if (max > counter->max) | |
2242 | enlarge = true; | |
2243 | ret = page_counter_set_max(counter, max); | |
2244 | mutex_unlock(&memcg_max_mutex); | |
2245 | ||
2246 | if (!ret) | |
2247 | break; | |
2248 | ||
2249 | if (!drained) { | |
2250 | drain_all_stock(memcg); | |
2251 | drained = true; | |
2252 | continue; | |
2253 | } | |
2254 | ||
2255 | if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, | |
68cd9050 | 2256 | memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { |
ea1e8796 RG |
2257 | ret = -EBUSY; |
2258 | break; | |
2259 | } | |
2260 | } while (true); | |
2261 | ||
2262 | if (!ret && enlarge) | |
2263 | memcg1_oom_recover(memcg); | |
2264 | ||
2265 | return ret; | |
2266 | } | |
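Seen from userspace, the loop above means that shrinking memory.limit_in_bytes (or memory.memsw.limit_in_bytes) may block while the kernel reclaims, and the write can fail with EBUSY if usage cannot be pushed under the new value, EINVAL if it would violate memory.max <= memsw.max, or EINTR on a pending signal. A small sketch of such a write; the cgroup path is an illustrative assumption:

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	/* Returns 0 on success, -1 with errno set (EBUSY, EINVAL, EINTR, ...). */
	static int set_memory_limit(const char *grp, unsigned long long bytes)
	{
		char path[256], val[32];
		int fd, ret = 0;

		snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", grp);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		snprintf(val, sizeof(val), "%llu", bytes);
		if (write(fd, val, strlen(val)) < 0)
			ret = -1;
		close(fd);
		return ret;
	}

As the page_counter_memparse(buf, "-1", ...) call in mem_cgroup_write() below suggests, the file also accepts human-readable suffixes (e.g. "512M") and "-1" for unlimited.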
2267 | ||
2268 | /* | |
2269 | * Reclaims as many pages from the given memcg as possible. | |
2270 | * | |
2271 | * The caller is responsible for holding a css reference on the memcg. | |
2272 | */ | |
2273 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | |
2274 | { | |
2275 | int nr_retries = MAX_RECLAIM_RETRIES; | |
2276 | ||
2277 | /* we call try_to_free_mem_cgroup_pages() to make this cgroup empty */ | |
2278 | lru_add_drain_all(); | |
2279 | ||
2280 | drain_all_stock(memcg); | |
2281 | ||
2282 | /* try to free all pages in this cgroup */ | |
2283 | while (nr_retries && page_counter_read(&memcg->memory)) { | |
2284 | if (signal_pending(current)) | |
2285 | return -EINTR; | |
2286 | ||
2287 | if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, | |
68cd9050 | 2288 | MEMCG_RECLAIM_MAY_SWAP, NULL)) |
ea1e8796 RG |
2289 | nr_retries--; |
2290 | } | |
2291 | ||
2292 | return 0; | |
2293 | } | |
2294 | ||
2295 | static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, | |
2296 | char *buf, size_t nbytes, | |
2297 | loff_t off) | |
2298 | { | |
2299 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | |
2300 | ||
2301 | if (mem_cgroup_is_root(memcg)) | |
2302 | return -EINVAL; | |
2303 | return mem_cgroup_force_empty(memcg) ?: nbytes; | |
2304 | } | |
2305 | ||
2306 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, | |
2307 | struct cftype *cft) | |
2308 | { | |
2309 | return 1; | |
2310 | } | |
2311 | ||
2312 | static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, | |
2313 | struct cftype *cft, u64 val) | |
2314 | { | |
2315 | if (val == 1) | |
2316 | return 0; | |
2317 | ||
2318 | pr_warn_once("Non-hierarchical mode is deprecated. " | |
2319 | "Please report your usecase to [email protected] if you " | |
2320 | "depend on this functionality.\n"); | |
2321 | ||
2322 | return -EINVAL; | |
2323 | } | |
2324 | ||
2325 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | |
2326 | struct cftype *cft) | |
2327 | { | |
2328 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | |
2329 | struct page_counter *counter; | |
2330 | ||
2331 | switch (MEMFILE_TYPE(cft->private)) { | |
2332 | case _MEM: | |
2333 | counter = &memcg->memory; | |
2334 | break; | |
2335 | case _MEMSWAP: | |
2336 | counter = &memcg->memsw; | |
2337 | break; | |
2338 | case _KMEM: | |
2339 | counter = &memcg->kmem; | |
2340 | break; | |
2341 | case _TCP: | |
2342 | counter = &memcg->tcpmem; | |
2343 | break; | |
2344 | default: | |
2345 | BUG(); | |
2346 | } | |
2347 | ||
2348 | switch (MEMFILE_ATTR(cft->private)) { | |
2349 | case RES_USAGE: | |
2350 | if (counter == &memcg->memory) | |
2351 | return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; | |
2352 | if (counter == &memcg->memsw) | |
2353 | return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; | |
2354 | return (u64)page_counter_read(counter) * PAGE_SIZE; | |
2355 | case RES_LIMIT: | |
2356 | return (u64)counter->max * PAGE_SIZE; | |
2357 | case RES_MAX_USAGE: | |
2358 | return (u64)counter->watermark * PAGE_SIZE; | |
2359 | case RES_FAILCNT: | |
2360 | return counter->failcnt; | |
2361 | case RES_SOFT_LIMIT: | |
2362 | return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; | |
2363 | default: | |
2364 | BUG(); | |
2365 | } | |
2366 | } | |
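mem_cgroup_read_u64(), mem_cgroup_write() and mem_cgroup_reset() multiplex many control files through a single handler by packing a counter type and an attribute into cftype->private with MEMFILE_PRIVATE(), and unpacking them with MEMFILE_TYPE()/MEMFILE_ATTR(). Those helpers are defined elsewhere in the v1 code; the sketch below merely assumes the conventional "type in the high bits, attribute in the low 16 bits" packing to illustrate the idea:

	/* Hypothetical equivalents of the MEMFILE_* helpers (assumed layout). */
	#define DEMO_MEMFILE_PRIVATE(type, attr)	(((type) << 16) | (attr))
	#define DEMO_MEMFILE_TYPE(val)			(((val) >> 16) & 0xffff)
	#define DEMO_MEMFILE_ATTR(val)			((val) & 0xffff)

	/*
	 * A cftype entry such as "memsw.limit_in_bytes" then carries
	 *	.private = DEMO_MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT)
	 * and the shared handler recovers both halves to pick the right
	 * page_counter and the right action.
	 */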
2367 | ||
2368 | /* | |
2369 | * This function doesn't do anything useful. Its only job is to provide a read | |
2370 | * handler for a file so that cgroup_file_mode() will add read permissions. | |
2371 | */ | |
2372 | static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, | |
2373 | __always_unused void *v) | |
2374 | { | |
2375 | return -EINVAL; | |
2376 | } | |
2377 | ||
2378 | static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) | |
2379 | { | |
2380 | int ret; | |
2381 | ||
2382 | mutex_lock(&memcg_max_mutex); | |
2383 | ||
2384 | ret = page_counter_set_max(&memcg->tcpmem, max); | |
2385 | if (ret) | |
2386 | goto out; | |
2387 | ||
2388 | if (!memcg->tcpmem_active) { | |
2389 | /* | |
2390 | * The active flag needs to be written after the static_key | |
2391 | * update. This is what guarantees that the socket activation | |
2392 | * function is the last one to run. See mem_cgroup_sk_alloc() | |
2393 | * for details, and note that we don't mark any socket as | |
2394 | * belonging to this memcg until that flag is up. | |
2395 | * | |
2396 | * We need to do this, because static_keys will span multiple | |
2397 | * sites, but we can't control their order. If we mark a socket | |
2398 | * as accounted, but the accounting functions are not patched in | |
2399 | * yet, we'll lose accounting. | |
2400 | * | |
2401 | * We never race with the readers in mem_cgroup_sk_alloc(), | |
2402 | * because when this value changes, the code to process it is not | |
2403 | * patched in yet. | |
2404 | */ | |
2405 | static_branch_inc(&memcg_sockets_enabled_key); | |
2406 | memcg->tcpmem_active = true; | |
2407 | } | |
2408 | out: | |
2409 | mutex_unlock(&memcg_max_mutex); | |
2410 | return ret; | |
2411 | } | |
2412 | ||
2413 | /* | |
2414 | * The users of this function are the RES_LIMIT and | |
2415 | * RES_SOFT_LIMIT write paths. | |
2416 | */ | |
2417 | static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |
2418 | char *buf, size_t nbytes, loff_t off) | |
2419 | { | |
2420 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | |
2421 | unsigned long nr_pages; | |
2422 | int ret; | |
2423 | ||
2424 | buf = strstrip(buf); | |
2425 | ret = page_counter_memparse(buf, "-1", &nr_pages); | |
2426 | if (ret) | |
2427 | return ret; | |
2428 | ||
2429 | switch (MEMFILE_ATTR(of_cft(of)->private)) { | |
2430 | case RES_LIMIT: | |
2431 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | |
2432 | ret = -EINVAL; | |
2433 | break; | |
2434 | } | |
2435 | switch (MEMFILE_TYPE(of_cft(of)->private)) { | |
2436 | case _MEM: | |
2437 | ret = mem_cgroup_resize_max(memcg, nr_pages, false); | |
2438 | break; | |
2439 | case _MEMSWAP: | |
2440 | ret = mem_cgroup_resize_max(memcg, nr_pages, true); | |
2441 | break; | |
2442 | case _KMEM: | |
2443 | pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " | |
2444 | "Writing any value to this file has no effect. " | |
2445 | "Please report your usecase to [email protected] if you " | |
2446 | "depend on this functionality.\n"); | |
2447 | ret = 0; | |
2448 | break; | |
2449 | case _TCP: | |
2450 | ret = memcg_update_tcp_max(memcg, nr_pages); | |
2451 | break; | |
2452 | } | |
2453 | break; | |
2454 | case RES_SOFT_LIMIT: | |
2455 | if (IS_ENABLED(CONFIG_PREEMPT_RT)) { | |
2456 | ret = -EOPNOTSUPP; | |
2457 | } else { | |
2458 | WRITE_ONCE(memcg->soft_limit, nr_pages); | |
2459 | ret = 0; | |
2460 | } | |
2461 | break; | |
2462 | } | |
2463 | return ret ?: nbytes; | |
2464 | } | |
2465 | ||
2466 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, | |
2467 | size_t nbytes, loff_t off) | |
2468 | { | |
2469 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | |
2470 | struct page_counter *counter; | |
2471 | ||
2472 | switch (MEMFILE_TYPE(of_cft(of)->private)) { | |
2473 | case _MEM: | |
2474 | counter = &memcg->memory; | |
2475 | break; | |
2476 | case _MEMSWAP: | |
2477 | counter = &memcg->memsw; | |
2478 | break; | |
2479 | case _KMEM: | |
2480 | counter = &memcg->kmem; | |
2481 | break; | |
2482 | case _TCP: | |
2483 | counter = &memcg->tcpmem; | |
2484 | break; | |
2485 | default: | |
2486 | BUG(); | |
2487 | } | |
2488 | ||
2489 | switch (MEMFILE_ATTR(of_cft(of)->private)) { | |
2490 | case RES_MAX_USAGE: | |
2491 | page_counter_reset_watermark(counter); | |
2492 | break; | |
2493 | case RES_FAILCNT: | |
2494 | counter->failcnt = 0; | |
2495 | break; | |
2496 | default: | |
2497 | BUG(); | |
2498 | } | |
2499 | ||
2500 | return nbytes; | |
2501 | } | |
2502 | ||
2503 | #ifdef CONFIG_NUMA | |
2504 | ||
2505 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) | |
2506 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) | |
2507 | #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) | |
2508 | ||
2509 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | |
2510 | int nid, unsigned int lru_mask, bool tree) | |
2511 | { | |
2512 | struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); | |
2513 | unsigned long nr = 0; | |
2514 | enum lru_list lru; | |
2515 | ||
2516 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | |
2517 | ||
2518 | for_each_lru(lru) { | |
2519 | if (!(BIT(lru) & lru_mask)) | |
2520 | continue; | |
2521 | if (tree) | |
2522 | nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); | |
2523 | else | |
2524 | nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); | |
2525 | } | |
2526 | return nr; | |
2527 | } | |
2528 | ||
2529 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |
2530 | unsigned int lru_mask, | |
2531 | bool tree) | |
2532 | { | |
2533 | unsigned long nr = 0; | |
2534 | enum lru_list lru; | |
2535 | ||
2536 | for_each_lru(lru) { | |
2537 | if (!(BIT(lru) & lru_mask)) | |
2538 | continue; | |
2539 | if (tree) | |
2540 | nr += memcg_page_state(memcg, NR_LRU_BASE + lru); | |
2541 | else | |
2542 | nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); | |
2543 | } | |
2544 | return nr; | |
2545 | } | |
2546 | ||
2547 | static int memcg_numa_stat_show(struct seq_file *m, void *v) | |
2548 | { | |
2549 | struct numa_stat { | |
2550 | const char *name; | |
2551 | unsigned int lru_mask; | |
2552 | }; | |
2553 | ||
2554 | static const struct numa_stat stats[] = { | |
2555 | { "total", LRU_ALL }, | |
2556 | { "file", LRU_ALL_FILE }, | |
2557 | { "anon", LRU_ALL_ANON }, | |
2558 | { "unevictable", BIT(LRU_UNEVICTABLE) }, | |
2559 | }; | |
2560 | const struct numa_stat *stat; | |
2561 | int nid; | |
2562 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); | |
2563 | ||
2564 | mem_cgroup_flush_stats(memcg); | |
2565 | ||
2566 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | |
2567 | seq_printf(m, "%s=%lu", stat->name, | |
2568 | mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, | |
2569 | false)); | |
2570 | for_each_node_state(nid, N_MEMORY) | |
2571 | seq_printf(m, " N%d=%lu", nid, | |
2572 | mem_cgroup_node_nr_lru_pages(memcg, nid, | |
2573 | stat->lru_mask, false)); | |
2574 | seq_putc(m, '\n'); | |
2575 | } | |
2576 | ||
2577 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | |
2578 | ||
2579 | seq_printf(m, "hierarchical_%s=%lu", stat->name, | |
2580 | mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, | |
2581 | true)); | |
2582 | for_each_node_state(nid, N_MEMORY) | |
2583 | seq_printf(m, " N%d=%lu", nid, | |
2584 | mem_cgroup_node_nr_lru_pages(memcg, nid, | |
2585 | stat->lru_mask, true)); | |
2586 | seq_putc(m, '\n'); | |
2587 | } | |
2588 | ||
2589 | return 0; | |
2590 | } | |
2591 | #endif /* CONFIG_NUMA */ | |
2592 | ||
2593 | static const unsigned int memcg1_stats[] = { | |
2594 | NR_FILE_PAGES, | |
2595 | NR_ANON_MAPPED, | |
2596 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | |
2597 | NR_ANON_THPS, | |
2598 | #endif | |
2599 | NR_SHMEM, | |
2600 | NR_FILE_MAPPED, | |
2601 | NR_FILE_DIRTY, | |
2602 | NR_WRITEBACK, | |
2603 | WORKINGSET_REFAULT_ANON, | |
2604 | WORKINGSET_REFAULT_FILE, | |
2605 | #ifdef CONFIG_SWAP | |
2606 | MEMCG_SWAP, | |
2607 | NR_SWAPCACHE, | |
2608 | #endif | |
2609 | }; | |
2610 | ||
2611 | static const char *const memcg1_stat_names[] = { | |
2612 | "cache", | |
2613 | "rss", | |
2614 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | |
2615 | "rss_huge", | |
2616 | #endif | |
2617 | "shmem", | |
2618 | "mapped_file", | |
2619 | "dirty", | |
2620 | "writeback", | |
2621 | "workingset_refault_anon", | |
2622 | "workingset_refault_file", | |
2623 | #ifdef CONFIG_SWAP | |
2624 | "swap", | |
2625 | "swapcached", | |
2626 | #endif | |
2627 | }; | |
2628 | ||
2629 | /* Universal VM events that cgroup1 shows, in the original sort order */ | |
2630 | static const unsigned int memcg1_events[] = { | |
2631 | PGPGIN, | |
2632 | PGPGOUT, | |
2633 | PGFAULT, | |
2634 | PGMAJFAULT, | |
2635 | }; | |
2636 | ||
2637 | void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) | |
2638 | { | |
2639 | unsigned long memory, memsw; | |
2640 | struct mem_cgroup *mi; | |
2641 | unsigned int i; | |
2642 | ||
2643 | BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); | |
2644 | ||
2645 | mem_cgroup_flush_stats(memcg); | |
2646 | ||
2647 | for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { | |
2648 | unsigned long nr; | |
2649 | ||
2650 | nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); | |
2651 | seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); | |
2652 | } | |
2653 | ||
2654 | for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) | |
2655 | seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), | |
2656 | memcg_events_local(memcg, memcg1_events[i])); | |
2657 | ||
2658 | for (i = 0; i < NR_LRU_LISTS; i++) | |
2659 | seq_buf_printf(s, "%s %lu\n", lru_list_name(i), | |
2660 | memcg_page_state_local(memcg, NR_LRU_BASE + i) * | |
2661 | PAGE_SIZE); | |
2662 | ||
2663 | /* Hierarchical information */ | |
2664 | memory = memsw = PAGE_COUNTER_MAX; | |
2665 | for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { | |
2666 | memory = min(memory, READ_ONCE(mi->memory.max)); | |
2667 | memsw = min(memsw, READ_ONCE(mi->memsw.max)); | |
2668 | } | |
2669 | seq_buf_printf(s, "hierarchical_memory_limit %llu\n", | |
2670 | (u64)memory * PAGE_SIZE); | |
2671 | seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", | |
2672 | (u64)memsw * PAGE_SIZE); | |
2673 | ||
2674 | for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { | |
2675 | unsigned long nr; | |
2676 | ||
2677 | nr = memcg_page_state_output(memcg, memcg1_stats[i]); | |
2678 | seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], | |
2679 | (u64)nr); | |
2680 | } | |
2681 | ||
2682 | for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) | |
2683 | seq_buf_printf(s, "total_%s %llu\n", | |
2684 | vm_event_name(memcg1_events[i]), | |
2685 | (u64)memcg_events(memcg, memcg1_events[i])); | |
2686 | ||
2687 | for (i = 0; i < NR_LRU_LISTS; i++) | |
2688 | seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), | |
2689 | (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * | |
2690 | PAGE_SIZE); | |
2691 | ||
2692 | #ifdef CONFIG_DEBUG_VM | |
2693 | { | |
2694 | pg_data_t *pgdat; | |
2695 | struct mem_cgroup_per_node *mz; | |
2696 | unsigned long anon_cost = 0; | |
2697 | unsigned long file_cost = 0; | |
2698 | ||
2699 | for_each_online_pgdat(pgdat) { | |
2700 | mz = memcg->nodeinfo[pgdat->node_id]; | |
2701 | ||
2702 | anon_cost += mz->lruvec.anon_cost; | |
2703 | file_cost += mz->lruvec.file_cost; | |
2704 | } | |
2705 | seq_buf_printf(s, "anon_cost %lu\n", anon_cost); | |
2706 | seq_buf_printf(s, "file_cost %lu\n", file_cost); | |
2707 | } | |
2708 | #endif | |
2709 | } | |
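memcg1_stat_format() produces memory.stat as flat "<name> <value>" lines: the local counters and events first, the per-LRU sizes, the hierarchical_* limits, and then the total_* hierarchical counters. A small userspace sketch that pulls one key back out; the cgroup path in the usage note is an illustrative assumption:

	#include <stdio.h>
	#include <string.h>

	/* Return the value of @key from @grp/memory.stat, or -1 if not present. */
	static long long memory_stat_value(const char *grp, const char *key)
	{
		char path[256], name[64];
		long long val, ret = -1;
		FILE *f;

		snprintf(path, sizeof(path), "%s/memory.stat", grp);
		f = fopen(path, "r");
		if (!f)
			return -1;

		while (fscanf(f, "%63s %lld", name, &val) == 2) {
			if (!strcmp(name, key)) {
				ret = val;
				break;
			}
		}
		fclose(f);
		return ret;
	}

For example, memory_stat_value("/sys/fs/cgroup/memory/demo", "total_rss") would return the hierarchical anonymous footprint in bytes, matching the total_* block emitted above.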
2710 | ||
2711 | static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, | |
2712 | struct cftype *cft) | |
2713 | { | |
2714 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | |
2715 | ||
2716 | return mem_cgroup_swappiness(memcg); | |
2717 | } | |
2718 | ||
2719 | static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | |
2720 | struct cftype *cft, u64 val) | |
2721 | { | |
2722 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | |
2723 | ||
410abb20 | 2724 | if (val > MAX_SWAPPINESS) |
ea1e8796 RG |
2725 | return -EINVAL; |
2726 | ||
2727 | if (!mem_cgroup_is_root(memcg)) | |
2728 | WRITE_ONCE(memcg->swappiness, val); | |
2729 | else | |
2730 | WRITE_ONCE(vm_swappiness, val); | |
2731 | ||
2732 | return 0; | |
2733 | } | |
2734 | ||
2735 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) | |
2736 | { | |
2737 | struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); | |
2738 | ||
2739 | seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); | |
2740 | seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); | |
2741 | seq_printf(sf, "oom_kill %lu\n", | |
2742 | atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); | |
2743 | return 0; | |
2744 | } | |
2745 | ||
2746 | static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, | |
2747 | struct cftype *cft, u64 val) | |
2748 | { | |
2749 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | |
2750 | ||
2751 | /* cannot be set on the root cgroup, and only 0 and 1 are allowed */ | |
2752 | if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) | |
2753 | return -EINVAL; | |
2754 | ||
2755 | WRITE_ONCE(memcg->oom_kill_disable, val); | |
2756 | if (!val) | |
2757 | memcg1_oom_recover(memcg); | |
2758 | ||
2759 | return 0; | |
2760 | } | |
2761 | ||
3a3b7fec | 2762 | #ifdef CONFIG_SLUB_DEBUG |
ea1e8796 RG |
2763 | static int mem_cgroup_slab_show(struct seq_file *m, void *p) |
2764 | { | |
2765 | /* | |
2766 | * Deprecated. | |
2767 | * Please take a look at tools/cgroup/memcg_slabinfo.py. | |
2768 | */ | |
2769 | return 0; | |
2770 | } | |
2771 | #endif | |
2772 | ||
2773 | struct cftype mem_cgroup_legacy_files[] = { | |
2774 | { | |
2775 | .name = "usage_in_bytes", | |
2776 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | |
2777 | .read_u64 = mem_cgroup_read_u64, | |
2778 | }, | |
2779 | { | |
2780 | .name = "max_usage_in_bytes", | |
2781 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | |
2782 | .write = mem_cgroup_reset, | |
2783 | .read_u64 = mem_cgroup_read_u64, | |
2784 | }, | |
2785 | { | |
2786 | .name = "limit_in_bytes", | |
2787 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | |
2788 | .write = mem_cgroup_write, | |
2789 | .read_u64 = mem_cgroup_read_u64, | |
2790 | }, | |
2791 | { | |
2792 | .name = "soft_limit_in_bytes", | |
2793 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | |
2794 | .write = mem_cgroup_write, | |
2795 | .read_u64 = mem_cgroup_read_u64, | |
2796 | }, | |
2797 | { | |
2798 | .name = "failcnt", | |
2799 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | |
2800 | .write = mem_cgroup_reset, | |
2801 | .read_u64 = mem_cgroup_read_u64, | |
2802 | }, | |
2803 | { | |
2804 | .name = "stat", | |
2805 | .seq_show = memory_stat_show, | |
2806 | }, | |
2807 | { | |
2808 | .name = "force_empty", | |
2809 | .write = mem_cgroup_force_empty_write, | |
2810 | }, | |
2811 | { | |
2812 | .name = "use_hierarchy", | |
2813 | .write_u64 = mem_cgroup_hierarchy_write, | |
2814 | .read_u64 = mem_cgroup_hierarchy_read, | |
2815 | }, | |
2816 | { | |
2817 | .name = "cgroup.event_control", /* XXX: for compat */ | |
2818 | .write = memcg_write_event_control, | |
2819 | .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, | |
2820 | }, | |
2821 | { | |
2822 | .name = "swappiness", | |
2823 | .read_u64 = mem_cgroup_swappiness_read, | |
2824 | .write_u64 = mem_cgroup_swappiness_write, | |
2825 | }, | |
2826 | { | |
2827 | .name = "move_charge_at_immigrate", | |
2828 | .read_u64 = mem_cgroup_move_charge_read, | |
2829 | .write_u64 = mem_cgroup_move_charge_write, | |
2830 | }, | |
2831 | { | |
2832 | .name = "oom_control", | |
2833 | .seq_show = mem_cgroup_oom_control_read, | |
2834 | .write_u64 = mem_cgroup_oom_control_write, | |
2835 | }, | |
2836 | { | |
2837 | .name = "pressure_level", | |
2838 | .seq_show = mem_cgroup_dummy_seq_show, | |
2839 | }, | |
2840 | #ifdef CONFIG_NUMA | |
2841 | { | |
2842 | .name = "numa_stat", | |
2843 | .seq_show = memcg_numa_stat_show, | |
2844 | }, | |
2845 | #endif | |
2846 | { | |
2847 | .name = "kmem.limit_in_bytes", | |
2848 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | |
2849 | .write = mem_cgroup_write, | |
2850 | .read_u64 = mem_cgroup_read_u64, | |
2851 | }, | |
2852 | { | |
2853 | .name = "kmem.usage_in_bytes", | |
2854 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | |
2855 | .read_u64 = mem_cgroup_read_u64, | |
2856 | }, | |
2857 | { | |
2858 | .name = "kmem.failcnt", | |
2859 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | |
2860 | .write = mem_cgroup_reset, | |
2861 | .read_u64 = mem_cgroup_read_u64, | |
2862 | }, | |
2863 | { | |
2864 | .name = "kmem.max_usage_in_bytes", | |
2865 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | |
2866 | .write = mem_cgroup_reset, | |
2867 | .read_u64 = mem_cgroup_read_u64, | |
2868 | }, | |
3a3b7fec | 2869 | #ifdef CONFIG_SLUB_DEBUG |
ea1e8796 RG |
2870 | { |
2871 | .name = "kmem.slabinfo", | |
2872 | .seq_show = mem_cgroup_slab_show, | |
2873 | }, | |
2874 | #endif | |
2875 | { | |
2876 | .name = "kmem.tcp.limit_in_bytes", | |
2877 | .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), | |
2878 | .write = mem_cgroup_write, | |
2879 | .read_u64 = mem_cgroup_read_u64, | |
2880 | }, | |
2881 | { | |
2882 | .name = "kmem.tcp.usage_in_bytes", | |
2883 | .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), | |
2884 | .read_u64 = mem_cgroup_read_u64, | |
2885 | }, | |
2886 | { | |
2887 | .name = "kmem.tcp.failcnt", | |
2888 | .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), | |
2889 | .write = mem_cgroup_reset, | |
2890 | .read_u64 = mem_cgroup_read_u64, | |
2891 | }, | |
2892 | { | |
2893 | .name = "kmem.tcp.max_usage_in_bytes", | |
2894 | .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), | |
2895 | .write = mem_cgroup_reset, | |
2896 | .read_u64 = mem_cgroup_read_u64, | |
2897 | }, | |
2898 | { }, /* terminate */ | |
2899 | }; | |
2900 | ||
2901 | struct cftype memsw_files[] = { | |
2902 | { | |
2903 | .name = "memsw.usage_in_bytes", | |
2904 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | |
2905 | .read_u64 = mem_cgroup_read_u64, | |
2906 | }, | |
2907 | { | |
2908 | .name = "memsw.max_usage_in_bytes", | |
2909 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | |
2910 | .write = mem_cgroup_reset, | |
2911 | .read_u64 = mem_cgroup_read_u64, | |
2912 | }, | |
2913 | { | |
2914 | .name = "memsw.limit_in_bytes", | |
2915 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | |
2916 | .write = mem_cgroup_write, | |
2917 | .read_u64 = mem_cgroup_read_u64, | |
2918 | }, | |
2919 | { | |
2920 | .name = "memsw.failcnt", | |
2921 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | |
2922 | .write = mem_cgroup_reset, | |
2923 | .read_u64 = mem_cgroup_read_u64, | |
2924 | }, | |
2925 | { }, /* terminate */ | |
2926 | }; | |
2927 | ||
04fbe921 RG |
2928 | void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) |
2929 | { | |
2930 | if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { | |
2931 | if (nr_pages > 0) | |
2932 | page_counter_charge(&memcg->kmem, nr_pages); | |
2933 | else | |
2934 | page_counter_uncharge(&memcg->kmem, -nr_pages); | |
2935 | } | |
2936 | } | |
04fbe921 | 2937 | |
773e9ae7 RG |
2938 | bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, |
2939 | gfp_t gfp_mask) | |
2940 | { | |
2941 | struct page_counter *fail; | |
2942 | ||
2943 | if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { | |
2944 | memcg->tcpmem_pressure = 0; | |
2945 | return true; | |
2946 | } | |
2947 | memcg->tcpmem_pressure = 1; | |
2948 | if (gfp_mask & __GFP_NOFAIL) { | |
2949 | page_counter_charge(&memcg->tcpmem, nr_pages); | |
2950 | return true; | |
2951 | } | |
2952 | return false; | |
2953 | } | |
2954 | ||
d12f6d22 RG |
2955 | static int __init memcg1_init(void) |
2956 | { | |
2957 | int node; | |
2958 | ||
2959 | for_each_node(node) { | |
2960 | struct mem_cgroup_tree_per_node *rtpn; | |
2961 | ||
2962 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); | |
2963 | ||
2964 | rtpn->rb_root = RB_ROOT; | |
2965 | rtpn->rb_rightmost = NULL; | |
2966 | spin_lock_init(&rtpn->lock); | |
2967 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | |
2968 | } | |
2969 | ||
2970 | return 0; | |
2971 | } | |
2972 | subsys_initcall(memcg1_init); |