// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

static inline int shrinker_unit_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}

static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	struct shrinker_info_unit **unit;
	int nr, i;

	if (!info)
		return;

	unit = info->unit;
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

	for (i = start; i < nr; i++) {
		if (!unit[i])
			break;

		kfree(unit[i]);
		unit[i] = NULL;
	}
}

static inline int shrinker_unit_alloc(struct shrinker_info *new,
				      struct shrinker_info *old, int nid)
{
	struct shrinker_info_unit *unit;
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
	int i;

	for (i = start; i < nr; i++) {
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
		if (!unit) {
			shrinker_unit_free(new, start);
			return -ENOMEM;
		}

		new->unit[i] = unit;
	}

	return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid))
			goto err;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	free_shrinker_info(memcg);
	return -ENOMEM;
}
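
/*
 * Editor's illustration (not part of the original file, assuming
 * SHRINKER_UNIT_BITS == 64 as on a typical 64-bit build): a memcg with
 * shrinker_nr_max == 128 gets, per NUMA node, one shrinker_info whose
 * unit[] array holds DIV_ROUND_UP(128, 64) == 2 pointers.  Each
 * shrinker_info_unit carries one bitmap word (unit->map) plus one
 * deferred-work counter per bit (unit->nr_deferred[]), as used by
 * set_shrinker_bit() and xchg_nr_deferred_memcg() below.
 */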

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}
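
/*
 * Editor's worked example (assumption: SHRINKER_UNIT_BITS == 64 on a 64-bit
 * build): while shrinker_nr_max is 64, allocating shrinker id 64 triggers
 * expand_shrinker_info(64), so new_nr_max = round_up(65, 64) = 128.  Every
 * online memcg's per-node shrinker_info is then reallocated with room for
 * two unit pointers instead of one, the old pointer array is copied over,
 * and the missing shrinker_info_unit is allocated by shrinker_unit_alloc().
 */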

static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}

static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}

static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}

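/*
 * Editor's worked example (assumption: SHRINKER_UNIT_BITS == 64): shrinker
 * id 130 lives in unit index 130 / 64 == 2 at bit offset 130 % 64 == 2, and
 * calc_shrinker_id(2, 2) == 2 * 64 + 2 == 130 recovers the id when walking
 * the per-unit bitmap in shrink_slab_memcg().
 */
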
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}

static DEFINE_IDR(shrinker_idr);

static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent concurrent shrinker_info expansion */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}
#else
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}


static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * Copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls. However, if the slab has
	 * less than batch_size objects in total and we are really tight on
	 * memory, we will try to reclaim all available objects; otherwise we
	 * can end up failing allocations even though there are plenty of
	 * reclaimable objects spread over several slabs, each holding fewer
	 * objects than the batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, and decreased by old deferred work that was done now.
	 *
	 * It is capped at twice the number of freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * Move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}
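
/*
 * Editor's worked example of the scan-target arithmetic above (illustrative
 * numbers only): with freeable == 1048576, priority == 12 and the default
 * seeks (DEFAULT_SEEKS == 2), delta = (1048576 >> 12) * 4 / 2 = 512.  With
 * no deferred work (nr == 0), total_scan = 512, so the loop issues four
 * batches of SHRINK_BATCH (128) objects; whatever the shrinker declines to
 * scan is pushed back via add_nr_deferred() for a later call.
 */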

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * Lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in
	 * expand_one_shrinker_info(), so rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold a refcount on the memcg, so the
	 * memcg will not be destroyed, and of course the shrinker_info_unit
	 * will not be freed.
	 *
	 * So in the memcg shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the
	 *	   shrinker_info.
	 * step 2: after getting the shrinker_info_unit we can safely release
	 *	   the RCU lock.
	 * step 3: traverse the bitmap and calculate the shrinker_id.
	 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 * step 5: use the shrinker_id to find the shrinker, then use
	 *	   shrinker_try_get() to guarantee existence of the shrinker,
	 *	   then we can release the RCU lock to call do_shrink_slab(),
	 *	   which may sleep.
	 * step 6: do shrinker_put() paired with step 5 to put the refcount;
	 *	   if the refcount reaches 0, then wake up the waiter in
	 *	   shrinker_free() by calling complete().
	 *	   Note: this differs from the global shrink, where we do not
	 *	   need to acquire the RCU lock to guarantee existence of
	 *	   the shrinker, because we don't need to use this
	 *	   shrinker to traverse the next shrinker in the bitmap.
	 * step 7: we have already exited the read-side RCU critical section
	 *	   before calling do_shrink_slab(), and the shrinker_info may
	 *	   be released in expand_one_shrinker_info(), so go back to
	 *	   step 1 to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB))
				continue;

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
 * unaware shrinkers receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority; we take the number of objects and shift it
 * right (>>) by priority in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via the "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink. This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * Lockless algorithm of global shrink.
	 *
	 * In the unregistration step, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *	   and the validity of the shrinker_list walk.
	 * step 2: use shrinker_try_get() to try to get the refcount; if
	 *	   successful, then the existence of the shrinker can also be
	 *	   guaranteed, so we can release the RCU lock to call
	 *	   do_shrink_slab(), which may sleep.
	 * step 3: we *MUST* reacquire the RCU lock before calling shrinker_put(),
	 *	   which ensures that neither this shrinker nor the next shrinker
	 *	   will be freed in the next traversal operation.
	 * step 4: do shrinker_put() paired with step 2 to put the refcount;
	 *	   if the refcount reaches 0, then wake up the waiter in
	 *	   shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}
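
/*
 * Editor's note: a hedged sketch of how the reclaim path is expected to
 * invoke the function above (the actual caller lives in mm/vmscan.c),
 * roughly:
 *
 *	freed = shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
 *
 * where sc is the reclaim-wide struct scan_control and memcg iterates over
 * the cgroups being reclaimed.
 */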

struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fall back to a non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		return shrinker;
	}

non_memcg:
	/*
	 * The nr_deferred is available at the per-memcg level for memcg-aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by the kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);
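
/*
 * Editor's usage sketch (hypothetical names, not part of the original file):
 * a subsystem is expected to allocate, set up and publish its shrinker
 * roughly like this, with my_count()/my_scan() standing in for its real
 * count_objects/scan_objects callbacks:
 *
 *	struct shrinker *s;
 *
 *	s = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
 *			   "my-cache");
 *	if (!s)
 *		return -ENOMEM;
 *	s->count_objects = my_count;
 *	s->scan_objects = my_scan;
 *	s->private_data = my_cache;
 *	shrinker_register(s);
 *
 * Teardown goes through shrinker_free() below, which waits for in-flight
 * lookups before the shrinker memory is reclaimed via RCU.
 */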

static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}

void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete. After that,
		 * no shrinker is running or will run again, so we can safely
		 * free it asynchronously via RCU and safely free the structure
		 * where the shrinker is embedded, such as a super_block.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);