mm/shrinker.c
mm: vmscan: move shrinker-related code into a separate file

// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <trace/events/vmscan.h>

#include "internal.h"

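/*
 * All registered shrinkers and the rwsem that protects them. Reclaim walks
 * the list under a read lock; registration, unregistration and
 * shrinker_info expansion hold it for write.
 */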
LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

/* The shrinker_info is expanded in batches of BITS_PER_LONG */
static inline int shrinker_map_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
}

static inline int shrinker_defer_size(int nr_items)
{
	return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

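/*
 * Each memcg/node pair has one shrinker_info, laid out as a single
 * allocation: the struct itself, then the nr_deferred counters, then the
 * shrinker bitmap. The pointer setup below and in expand_one_shrinker_info()
 * relies on that layout.
 */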
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, size, ret = 0;
	int map_size, defer_size = 0;

	down_write(&shrinker_rwsem);
	map_size = shrinker_map_size(shrinker_nr_max);
	defer_size = shrinker_defer_size(shrinker_nr_max);
	size = map_size + defer_size;
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
		if (!info) {
			free_shrinker_info(memcg);
			ret = -ENOMEM;
			break;
		}
		info->nr_deferred = (atomic_long_t *)(info + 1);
		info->map = (void *)info->nr_deferred + defer_size;
		info->map_nr_max = shrinker_nr_max;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	up_write(&shrinker_rwsem);

	return ret;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_rwsem));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg,
				    int map_size, int defer_size,
				    int old_map_size, int old_defer_size,
				    int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;
	int size = map_size + defer_size;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->nr_deferred = (atomic_long_t *)(new + 1);
		new->map = (void *)new->nr_deferred + defer_size;
		new->map_nr_max = new_nr_max;

		/* map: set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_map_size);
		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
		/* nr_deferred: copy old values, clear all new values */
		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
		memset((void *)new->nr_deferred + old_defer_size, 0,
		       defer_size - old_defer_size);

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

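/*
 * Grow every memcg's shrinker_info so that the new shrinker id fits.
 * Capacity only ever grows, in steps of BITS_PER_LONG bits.
 */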
static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
	int map_size, defer_size = 0;
	int old_map_size, old_defer_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_rwsem);

	map_size = shrinker_map_size(new_nr_max);
	defer_size = shrinker_defer_size(new_nr_max);
	old_map_size = shrinker_map_size(shrinker_nr_max);
	old_defer_size = shrinker_defer_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
					       old_map_size, old_defer_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp_mb__after_atomic() in shrink_slab_memcg() */
			smp_mb__before_atomic();
			set_bit(shrinker_id, info->map);
		}
		rcu_read_unlock();
	}
}

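/*
 * Maps a memcg-aware shrinker's id to the shrinker itself. The id is also
 * the bit index into each memcg's shrinker_info->map, so shrink_slab_memcg()
 * only visits shrinkers that may still have objects to free.
 */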
static DEFINE_IDR(shrinker_idr);

static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	down_write(&shrinker_rwsem);
	/*
	 * idr_alloc() may enter reclaim and run shrinkers; reclaim only
	 * trylocks shrinker_rwsem, so taking it for write here cannot deadlock.
	 */
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_rwsem);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int i, nid;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent concurrent shrinker_info expansion */
	down_read(&shrinker_rwsem);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (i = 0; i < child_info->map_nr_max; i++) {
			nr = atomic_long_read(&child_info->nr_deferred[i]);
			atomic_long_add(nr, &parent_info->nr_deferred[i]);
		}
	}
	up_read(&shrinker_rwsem);
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

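/*
 * Fetch and clear the deferred scan count for this shrinker in the context
 * described by @sc: the per-memcg counter for memcg-aware shrinkers running
 * against a memcg, otherwise the shrinker's own per-node array (node 0 if
 * the shrinker is not NUMA aware).
 */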
static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

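/*
 * Default number of objects handed to ->scan_objects() per call when a
 * shrinker does not set its own ->batch size.
 */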
#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}
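	/*
	 * For example, with freeable = 10000, priority = 12 (DEF_PRIORITY)
	 * and the default seeks of 2, the branch above yields
	 * delta = (10000 >> 12) * 4 / 2 = 4 objects of new work; as priority
	 * drops under increasing memory pressure the shift removes fewer
	 * bits and delta grows quickly.
	 */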

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
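/*
 * Walk only the shrinkers whose bit is set in this memcg's shrinker_info
 * map. A bit is cleared once a shrinker reports SHRINK_EMPTY and set again
 * from set_shrinker_bit() when new objects show up, so idle shrinkers are
 * skipped on later calls.
 */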
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int i;

	if (!mem_cgroup_online(memcg))
		return 0;

	if (!down_read_trylock(&shrinker_rwsem))
		return 0;

	info = shrinker_info_protected(memcg, nid);
	if (unlikely(!info))
		goto unlock;

	for_each_set_bit(i, info->map, info->map_nr_max) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};
		struct shrinker *shrinker;

		shrinker = idr_find(&shrinker_idr, i);
		if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
			if (!shrinker)
				clear_bit(i, info->map);
			continue;
		}

		/* Call non-slab shrinkers even though kmem is disabled */
		if (!memcg_kmem_online() &&
		    !(shrinker->flags & SHRINKER_NONSLAB))
			continue;

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY) {
			clear_bit(i, info->map);
			/*
			 * After the shrinker reported that it had no objects to
			 * free, but before we cleared the corresponding bit in
			 * the memcg shrinker map, a new object might have been
			 * added. To make sure we have the bit set in this
			 * case, we invoke the shrinker one more time and reset
			 * the bit if it reports that it is not empty anymore.
			 * The memory barrier here pairs with the barrier in
			 * set_shrinker_bit():
			 *
			 * list_lru_add()      shrink_slab_memcg()
			 *   list_add_tail()     clear_bit()
			 *   <MB>                <MB>
			 *   set_bit()           do_shrink_slab()
			 */
			smp_mb__after_atomic();
			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY)
				ret = 0;
			else
				set_shrinker_bit(memcg, nid, i);
		}
		freed += ret;

		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}
unlock:
	up_read(&shrinker_rwsem);
	return freed;
}
#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority: the number of freeable objects is shifted
 * right by @priority to obtain the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink. This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	if (!down_read_trylock(&shrinker_rwsem))
		goto out;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;
		/*
		 * Bail out if someone wants to register a new shrinker to
		 * prevent the registration from being stalled for long periods
		 * by parallel ongoing shrinking.
		 */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}

/*
 * Add a shrinker callback to be called from the vm: set up its deferred-work
 * state (a shrinker_idr id for memcg-aware shrinkers, a per-node nr_deferred
 * array otherwise) before registration.
 */
static int __prealloc_shrinker(struct shrinker *shrinker)
{
	unsigned int size;
	int err;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		err = prealloc_memcg_shrinker(shrinker);
		if (err != -ENOSYS)
			return err;

		shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
	}

	size = sizeof(*shrinker->nr_deferred);
	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	return 0;
}

#ifdef CONFIG_SHRINKER_DEBUG
int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!shrinker->name)
		return -ENOMEM;

	err = __prealloc_shrinker(shrinker);
	if (err) {
		kfree_const(shrinker->name);
		shrinker->name = NULL;
	}

	return err;
}
#else
int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	return __prealloc_shrinker(shrinker);
}
#endif

void free_prealloced_shrinker(struct shrinker *shrinker)
{
#ifdef CONFIG_SHRINKER_DEBUG
	kfree_const(shrinker->name);
	shrinker->name = NULL;
#endif
	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		down_write(&shrinker_rwsem);
		unregister_memcg_shrinker(shrinker);
		up_write(&shrinker_rwsem);
		return;
	}

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	up_write(&shrinker_rwsem);
}

static int __register_shrinker(struct shrinker *shrinker)
{
	int err = __prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}

#ifdef CONFIG_SHRINKER_DEBUG
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!shrinker->name)
		return -ENOMEM;

	err = __register_shrinker(shrinker);
	if (err) {
		kfree_const(shrinker->name);
		shrinker->name = NULL;
	}
	return err;
}
#else
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	return __register_shrinker(shrinker);
}
#endif
EXPORT_SYMBOL(register_shrinker);

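/*
 * Illustrative sketch (not part of the original file): a typical user embeds
 * a struct shrinker, provides count_objects()/scan_objects() callbacks and
 * pairs register_shrinker() with unregister_shrinker(). The "foo" names
 * below are hypothetical.
 *
 *	static unsigned long foo_count(struct shrinker *s,
 *				       struct shrink_control *sc)
 *	{
 *		unsigned long nr = READ_ONCE(foo_nr_cached_objects);
 *
 *		return nr ? nr : SHRINK_EMPTY;
 *	}
 *
 *	static unsigned long foo_scan(struct shrinker *s,
 *				      struct shrink_control *sc)
 *	{
 *		// return the number of objects freed, or SHRINK_STOP
 *		return foo_evict(sc->nr_to_scan);
 *	}
 *
 *	static struct shrinker foo_shrinker = {
 *		.count_objects	= foo_count,
 *		.scan_objects	= foo_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	err = register_shrinker(&foo_shrinker, "foo-cache");
 *	...
 *	unregister_shrinker(&foo_shrinker);
 */
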
/*
 * Remove a registered shrinker from the list and free its deferred state.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry;
	int debugfs_id;

	if (!(shrinker->flags & SHRINKER_REGISTERED))
		return;

	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	shrinker->flags &= ~SHRINKER_REGISTERED;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
	up_write(&shrinker_rwsem);

	shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);

/**
 * synchronize_shrinkers - Wait for all running shrinkers to complete.
 *
 * This is equivalent to calling unregister_shrinker() and register_shrinker(),
 * but atomically and with less overhead. This is useful to guarantee that all
 * shrinker invocations have seen an update, before freeing memory, similar to
 * rcu.
 */
void synchronize_shrinkers(void)
{
	down_write(&shrinker_rwsem);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);