1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/memcontrol.h>
3 #include <linux/rwsem.h>
4 #include <linux/shrinker.h>
5 #include <linux/rculist.h>
6 #include <trace/events/vmscan.h>
10 LIST_HEAD(shrinker_list);
11 DEFINE_MUTEX(shrinker_mutex);
14 static int shrinker_nr_max;
16 static inline int shrinker_unit_size(int nr_items)
18 return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
21 static inline void shrinker_unit_free(struct shrinker_info *info, int start)
23 struct shrinker_info_unit **unit;
30 nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
32 for (i = start; i < nr; i++) {
41 static inline int shrinker_unit_alloc(struct shrinker_info *new,
42 struct shrinker_info *old, int nid)
44 struct shrinker_info_unit *unit;
45 int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
46 int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
49 for (i = start; i < nr; i++) {
50 unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
52 shrinker_unit_free(new, start);
62 void free_shrinker_info(struct mem_cgroup *memcg)
64 struct mem_cgroup_per_node *pn;
65 struct shrinker_info *info;
69 pn = memcg->nodeinfo[nid];
70 info = rcu_dereference_protected(pn->shrinker_info, true);
71 shrinker_unit_free(info, 0);
73 rcu_assign_pointer(pn->shrinker_info, NULL);
77 int alloc_shrinker_info(struct mem_cgroup *memcg)
79 struct shrinker_info *info;
83 mutex_lock(&shrinker_mutex);
84 array_size = shrinker_unit_size(shrinker_nr_max);
86 info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
89 info->map_nr_max = shrinker_nr_max;
90 if (shrinker_unit_alloc(info, NULL, nid))
92 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
94 mutex_unlock(&shrinker_mutex);
99 mutex_unlock(&shrinker_mutex);
100 free_shrinker_info(memcg);
104 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
107 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
108 lockdep_is_held(&shrinker_mutex));
111 static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
112 int old_size, int new_nr_max)
114 struct shrinker_info *new, *old;
115 struct mem_cgroup_per_node *pn;
119 pn = memcg->nodeinfo[nid];
120 old = shrinker_info_protected(memcg, nid);
121 /* Not yet online memcg */
125 /* Already expanded this shrinker_info */
126 if (new_nr_max <= old->map_nr_max)
129 new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
133 new->map_nr_max = new_nr_max;
135 memcpy(new->unit, old->unit, old_size);
136 if (shrinker_unit_alloc(new, old, nid)) {
141 rcu_assign_pointer(pn->shrinker_info, new);
142 kvfree_rcu(old, rcu);
148 static int expand_shrinker_info(int new_id)
151 int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
152 int new_size, old_size = 0;
153 struct mem_cgroup *memcg;
155 if (!root_mem_cgroup)
158 lockdep_assert_held(&shrinker_mutex);
160 new_size = shrinker_unit_size(new_nr_max);
161 old_size = shrinker_unit_size(shrinker_nr_max);
163 memcg = mem_cgroup_iter(NULL, NULL, NULL);
165 ret = expand_one_shrinker_info(memcg, new_size, old_size,
168 mem_cgroup_iter_break(NULL, memcg);
171 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
174 shrinker_nr_max = new_nr_max;
179 static inline int shrinker_id_to_index(int shrinker_id)
181 return shrinker_id / SHRINKER_UNIT_BITS;
184 static inline int shrinker_id_to_offset(int shrinker_id)
186 return shrinker_id % SHRINKER_UNIT_BITS;
189 static inline int calc_shrinker_id(int index, int offset)
191 return index * SHRINKER_UNIT_BITS + offset;
194 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
196 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
197 struct shrinker_info *info;
198 struct shrinker_info_unit *unit;
201 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
202 unit = info->unit[shrinker_id_to_index(shrinker_id)];
203 if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
204 /* Pairs with smp mb in shrink_slab() */
205 smp_mb__before_atomic();
206 set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
/* Maps shrinker ids to struct shrinker * for memcg-aware shrinkers. */
static DEFINE_IDR(shrinker_idr);
214 static int shrinker_memcg_alloc(struct shrinker *shrinker)
216 int id, ret = -ENOMEM;
218 if (mem_cgroup_disabled())
221 mutex_lock(&shrinker_mutex);
222 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
226 if (id >= shrinker_nr_max) {
227 if (expand_shrinker_info(id)) {
228 idr_remove(&shrinker_idr, id);
235 mutex_unlock(&shrinker_mutex);
239 static void shrinker_memcg_remove(struct shrinker *shrinker)
241 int id = shrinker->id;
245 lockdep_assert_held(&shrinker_mutex);
247 idr_remove(&shrinker_idr, id);
250 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
251 struct mem_cgroup *memcg)
253 struct shrinker_info *info;
254 struct shrinker_info_unit *unit;
258 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
259 unit = info->unit[shrinker_id_to_index(shrinker->id)];
260 nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
266 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
267 struct mem_cgroup *memcg)
269 struct shrinker_info *info;
270 struct shrinker_info_unit *unit;
274 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
275 unit = info->unit[shrinker_id_to_index(shrinker->id)];
277 atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
283 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
285 int nid, index, offset;
287 struct mem_cgroup *parent;
288 struct shrinker_info *child_info, *parent_info;
289 struct shrinker_info_unit *child_unit, *parent_unit;
291 parent = parent_mem_cgroup(memcg);
293 parent = root_mem_cgroup;
295 /* Prevent from concurrent shrinker_info expand */
296 mutex_lock(&shrinker_mutex);
298 child_info = shrinker_info_protected(memcg, nid);
299 parent_info = shrinker_info_protected(parent, nid);
300 for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
301 child_unit = child_info->unit[index];
302 parent_unit = parent_info->unit[index];
303 for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
304 nr = atomic_long_read(&child_unit->nr_deferred[offset]);
305 atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
309 mutex_unlock(&shrinker_mutex);
312 static int shrinker_memcg_alloc(struct shrinker *shrinker)
317 static void shrinker_memcg_remove(struct shrinker *shrinker)
321 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
322 struct mem_cgroup *memcg)
327 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
328 struct mem_cgroup *memcg)
332 #endif /* CONFIG_MEMCG */
334 static long xchg_nr_deferred(struct shrinker *shrinker,
335 struct shrink_control *sc)
339 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
343 (shrinker->flags & SHRINKER_MEMCG_AWARE))
344 return xchg_nr_deferred_memcg(nid, shrinker,
347 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
351 static long add_nr_deferred(long nr, struct shrinker *shrinker,
352 struct shrink_control *sc)
356 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
360 (shrinker->flags & SHRINKER_MEMCG_AWARE))
361 return add_nr_deferred_memcg(nr, nid, shrinker,
364 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
367 #define SHRINK_BATCH 128
369 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
370 struct shrinker *shrinker, int priority)
372 unsigned long freed = 0;
373 unsigned long long delta;
378 long batch_size = shrinker->batch ? shrinker->batch
380 long scanned = 0, next_deferred;
382 freeable = shrinker->count_objects(shrinker, shrinkctl);
383 if (freeable == 0 || freeable == SHRINK_EMPTY)
387 * copy the current shrinker scan count into a local variable
388 * and zero it so that other concurrent shrinker invocations
389 * don't also do this scanning work.
391 nr = xchg_nr_deferred(shrinker, shrinkctl);
393 if (shrinker->seeks) {
394 delta = freeable >> priority;
396 do_div(delta, shrinker->seeks);
399 * These objects don't require any IO to create. Trim
400 * them aggressively under memory pressure to keep
401 * them from causing refetches in the IO caches.
403 delta = freeable / 2;
406 total_scan = nr >> priority;
408 total_scan = min(total_scan, (2 * freeable));
410 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
411 freeable, delta, total_scan, priority);
414 * Normally, we should not scan less than batch_size objects in one
415 * pass to avoid too frequent shrinker calls, but if the slab has less
416 * than batch_size objects in total and we are really tight on memory,
417 * we will try to reclaim all available objects, otherwise we can end
418 * up failing allocations although there are plenty of reclaimable
419 * objects spread over several slabs with usage less than the
422 * We detect the "tight on memory" situations by looking at the total
423 * number of objects we want to scan (total_scan). If it is greater
424 * than the total number of objects on slab (freeable), we must be
425 * scanning at high prio and therefore should try to reclaim as much as
428 while (total_scan >= batch_size ||
429 total_scan >= freeable) {
431 unsigned long nr_to_scan = min(batch_size, total_scan);
433 shrinkctl->nr_to_scan = nr_to_scan;
434 shrinkctl->nr_scanned = nr_to_scan;
435 ret = shrinker->scan_objects(shrinker, shrinkctl);
436 if (ret == SHRINK_STOP)
440 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
441 total_scan -= shrinkctl->nr_scanned;
442 scanned += shrinkctl->nr_scanned;
448 * The deferred work is increased by any new work (delta) that wasn't
449 * done, decreased by old deferred work that was done now.
451 * And it is capped to two times of the freeable items.
453 next_deferred = max_t(long, (nr + delta - scanned), 0);
454 next_deferred = min(next_deferred, (2 * freeable));
457 * move the unused scan count back into the shrinker in a
458 * manner that handles concurrent updates.
460 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
462 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
467 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
468 struct mem_cgroup *memcg, int priority)
470 struct shrinker_info *info;
471 unsigned long ret, freed = 0;
472 int offset, index = 0;
474 if (!mem_cgroup_online(memcg))
478 * lockless algorithm of memcg shrink.
480 * The shrinker_info may be freed asynchronously via RCU in the
481 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
482 * to ensure the existence of the shrinker_info.
484 * The shrinker_info_unit is never freed unless its corresponding memcg
485 * is destroyed. Here we already hold the refcount of memcg, so the
486 * memcg will not be destroyed, and of course shrinker_info_unit will
489 * So in the memcg shrink:
490 * step 1: use rcu_read_lock() to guarantee existence of the
492 * step 2: after getting shrinker_info_unit we can safely release the
494 * step 3: traverse the bitmap and calculate shrinker_id
495 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
496 * step 5: use shrinker_id to find the shrinker, then use
497 * shrinker_try_get() to guarantee existence of the shrinker,
498 * then we can release the RCU lock to do do_shrink_slab() that
500 * step 6: do shrinker_put() paired with step 5 to put the refcount,
501 * if the refcount reaches 0, then wake up the waiter in
502 * shrinker_free() by calling complete().
503 * Note: here is different from the global shrink, we don't
504 * need to acquire the RCU lock to guarantee existence of
505 * the shrinker, because we don't need to use this
506 * shrinker to traverse the next shrinker in the bitmap.
507 * step 7: we have already exited the read-side of rcu critical section
508 * before calling do_shrink_slab(), the shrinker_info may be
509 * released in expand_one_shrinker_info(), so go back to step 1
510 * to reacquire the shrinker_info.
514 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
518 if (index < shrinker_id_to_index(info->map_nr_max)) {
519 struct shrinker_info_unit *unit;
521 unit = info->unit[index];
525 for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
526 struct shrink_control sc = {
527 .gfp_mask = gfp_mask,
531 struct shrinker *shrinker;
532 int shrinker_id = calc_shrinker_id(index, offset);
535 shrinker = idr_find(&shrinker_idr, shrinker_id);
536 if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
537 clear_bit(offset, unit->map);
543 /* Call non-slab shrinkers even though kmem is disabled */
544 if (!memcg_kmem_online() &&
545 !(shrinker->flags & SHRINKER_NONSLAB))
548 ret = do_shrink_slab(&sc, shrinker, priority);
549 if (ret == SHRINK_EMPTY) {
550 clear_bit(offset, unit->map);
552 * After the shrinker reported that it had no objects to
553 * free, but before we cleared the corresponding bit in
554 * the memcg shrinker map, a new object might have been
555 * added. To make sure, we have the bit set in this
556 * case, we invoke the shrinker one more time and reset
557 * the bit if it reports that it is not empty anymore.
558 * The memory barrier here pairs with the barrier in
559 * set_shrinker_bit():
561 * list_lru_add() shrink_slab_memcg()
562 * list_add_tail() clear_bit()
564 * set_bit() do_shrink_slab()
566 smp_mb__after_atomic();
567 ret = do_shrink_slab(&sc, shrinker, priority);
568 if (ret == SHRINK_EMPTY)
571 set_shrinker_bit(memcg, nid, shrinker_id);
574 shrinker_put(shrinker);
584 #else /* !CONFIG_MEMCG */
585 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
586 struct mem_cgroup *memcg, int priority)
590 #endif /* CONFIG_MEMCG */
593 * shrink_slab - shrink slab caches
594 * @gfp_mask: allocation context
595 * @nid: node whose slab caches to target
596 * @memcg: memory cgroup whose slab caches to target
597 * @priority: the reclaim priority
599 * Call the shrink functions to age shrinkable caches.
601 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
602 * unaware shrinkers will receive a node id of 0 instead.
604 * @memcg specifies the memory cgroup to target. Unaware shrinkers
605 * are called only if it is the root cgroup.
607 * @priority is sc->priority, we take the number of objects and >> by priority
608 * in order to get the scan target.
610 * Returns the number of reclaimed slab objects.
612 unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
615 unsigned long ret, freed = 0;
616 struct shrinker *shrinker;
619 * The root memcg might be allocated even though memcg is disabled
620 * via "cgroup_disable=memory" boot parameter. This could make
621 * mem_cgroup_is_root() return false, then just run memcg slab
622 * shrink, but skip global shrink. This may result in premature
625 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
626 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
629 * lockless algorithm of global shrink.
631 * In the unregistration setp, the shrinker will be freed asynchronously
632 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
633 * shrinker_try_get() can be used to ensure the existence of the shrinker.
635 * So in the global shrink:
636 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
637 * and the validity of the shrinker_list walk.
638 * step 2: use shrinker_try_get() to try get the refcount, if successful,
639 * then the existence of the shrinker can also be guaranteed,
640 * so we can release the RCU lock to do do_shrink_slab() that
642 * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
643 * which ensures that neither this shrinker nor the next shrinker
644 * will be freed in the next traversal operation.
645 * step 4: do shrinker_put() paired with step 2 to put the refcount,
646 * if the refcount reaches 0, then wake up the waiter in
647 * shrinker_free() by calling complete().
650 list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
651 struct shrink_control sc = {
652 .gfp_mask = gfp_mask,
657 if (!shrinker_try_get(shrinker))
662 ret = do_shrink_slab(&sc, shrinker, priority);
663 if (ret == SHRINK_EMPTY)
668 shrinker_put(shrinker);
676 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
678 struct shrinker *shrinker;
683 shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
688 err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
693 shrinker->flags = flags | SHRINKER_ALLOCATED;
694 shrinker->seeks = DEFAULT_SEEKS;
696 if (flags & SHRINKER_MEMCG_AWARE) {
697 err = shrinker_memcg_alloc(shrinker);
698 if (err == -ENOSYS) {
699 /* Memcg is not supported, fallback to non-memcg-aware shrinker. */
700 shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
712 * The nr_deferred is available on per memcg level for memcg aware
713 * shrinkers, so only allocate nr_deferred in the following cases:
714 * - non-memcg-aware shrinkers
716 * - memcg is disabled by kernel command line
718 size = sizeof(*shrinker->nr_deferred);
719 if (flags & SHRINKER_NUMA_AWARE)
722 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
723 if (!shrinker->nr_deferred)
729 shrinker_debugfs_name_free(shrinker);
734 EXPORT_SYMBOL_GPL(shrinker_alloc);
736 void shrinker_register(struct shrinker *shrinker)
738 if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
739 pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
743 mutex_lock(&shrinker_mutex);
744 list_add_tail_rcu(&shrinker->list, &shrinker_list);
745 shrinker->flags |= SHRINKER_REGISTERED;
746 shrinker_debugfs_add(shrinker);
747 mutex_unlock(&shrinker_mutex);
749 init_completion(&shrinker->done);
751 * Now the shrinker is fully set up, take the first reference to it to
752 * indicate that lookup operations are now allowed to use it via
753 * shrinker_try_get().
755 refcount_set(&shrinker->refcount, 1);
757 EXPORT_SYMBOL_GPL(shrinker_register);
759 static void shrinker_free_rcu_cb(struct rcu_head *head)
761 struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
763 kfree(shrinker->nr_deferred);
767 void shrinker_free(struct shrinker *shrinker)
769 struct dentry *debugfs_entry = NULL;
775 if (shrinker->flags & SHRINKER_REGISTERED) {
776 /* drop the initial refcount */
777 shrinker_put(shrinker);
779 * Wait for all lookups of the shrinker to complete, after that,
780 * no shrinker is running or will run again, then we can safely
781 * free it asynchronously via RCU and safely free the structure
782 * where the shrinker is located, such as super_block etc.
784 wait_for_completion(&shrinker->done);
787 mutex_lock(&shrinker_mutex);
788 if (shrinker->flags & SHRINKER_REGISTERED) {
790 * Now we can safely remove it from the shrinker_list and then
793 list_del_rcu(&shrinker->list);
794 debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
795 shrinker->flags &= ~SHRINKER_REGISTERED;
798 shrinker_debugfs_name_free(shrinker);
800 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
801 shrinker_memcg_remove(shrinker);
802 mutex_unlock(&shrinker_mutex);
805 shrinker_debugfs_remove(debugfs_entry, debugfs_id);
807 call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
809 EXPORT_SYMBOL_GPL(shrinker_free);