// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	long delta;

	if (!c->parent)
		return;

	protected = min(usage, READ_ONCE(c->min));
	old_protected = atomic_long_read(&c->min_usage);
	if (protected != old_protected) {
		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	protected = min(usage, READ_ONCE(c->low));
	old_protected = atomic_long_read(&c->low_usage);
	if (protected != old_protected) {
		old_protected = atomic_long_xchg(&c->low_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_low_usage);
	}
}

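/*
 * Worked example for propagate_protected_usage() above (illustrative,
 * not part of the original file): with c->min == 100 pages, a usage
 * drop from 120 to 80 pages lowers min(usage, c->min) from 100 to 80,
 * so delta == -20 and the parent's children_min_usage shrinks by 20.
 * The parent therefore always sees the sum of min(usage, min) over its
 * children, which is exactly the siblings_protected input consumed by
 * effective_protection() further down in this file.
 */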
/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
	long new;

	new = atomic_long_sub_return(nr_pages, &counter->usage);
	/* More uncharges than charges? */
	if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
		      new, nr_pages)) {
		new = 0;
		atomic_long_set(&counter->usage, new);
	}
	propagate_protected_usage(counter, new);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;

		new = atomic_long_add_return(nr_pages, &c->usage);
		propagate_protected_usage(c, new);
		/*
		 * This is indeed racy, but we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > READ_ONCE(c->watermark))
			WRITE_ONCE(c->watermark, new);
	}
}

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points to the first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
			     unsigned long nr_pages,
			     struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;
		/*
		 * Charge speculatively to avoid an expensive CAS. If
		 * a bigger charge fails, it might falsely lock out a
		 * racing smaller charge and send it into reclaim
		 * early, but the error is limited to the difference
		 * between the two sizes, which is less than 2M/4M in
		 * case of a THP locking out a regular page charge.
		 *
		 * The atomic_long_add_return() implies a full memory
		 * barrier between incrementing the count and reading
		 * the limit. When racing with page_counter_set_max(),
		 * we either see the new limit or the setter sees the
		 * counter has changed and retries.
		 */
		new = atomic_long_add_return(nr_pages, &c->usage);
		if (new > c->max) {
			atomic_long_sub(nr_pages, &c->usage);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt which is only used
			 * to report stats.
			 */
			data_race(c->failcnt++);
			*fail = c;
			goto failed;
		}
		propagate_protected_usage(c, new);
		/*
		 * Just like with failcnt, we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > READ_ONCE(c->watermark))
			WRITE_ONCE(c->watermark, new);
	}
	return true;

failed:
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return false;
}

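/*
 * Illustrative sketch of the typical caller pattern around
 * page_counter_try_charge() (not part of the original file; the
 * example_charge() helper and the PAGE_COUNTER_EXAMPLES guard are
 * hypothetical).  On failure, @fail points to the counter (possibly an
 * ancestor) that hit its limit, and the partial charge has already
 * been rolled back, so the caller only decides whether to reclaim and
 * retry or to give up.  Contrast page_counter_charge(), which ignores
 * limits and therefore cannot fail.
 */
#ifdef PAGE_COUNTER_EXAMPLES
static int example_charge(struct page_counter *pc, unsigned long nr_pages)
{
	struct page_counter *fail;

	if (!page_counter_try_charge(pc, nr_pages, &fail)) {
		/* e.g. reclaim from the owner of 'fail' and retry */
		return -ENOMEM;
	}

	/* ... use the pages; on the matching free path: ... */
	page_counter_uncharge(pc, nr_pages);
	return 0;
}
#endif /* PAGE_COUNTER_EXAMPLES */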
/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent)
		page_counter_cancel(c, nr_pages);
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
	for (;;) {
		unsigned long old;
		long usage;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		usage = page_counter_read(counter);

		if (usage > nr_pages)
			return -EBUSY;

		old = xchg(&counter->max, nr_pages);

		if (page_counter_read(counter) <= usage || nr_pages >= old)
			return 0;

		counter->max = old;
		cond_resched();
	}
}

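/*
 * Worked interleaving for the retry logic above (illustrative, not
 * part of the original file).  Suppose usage is 90 pages and the limit
 * is being lowered from 200 to 100 while a 20-page charge races:
 *
 *   page_counter_set_max()              page_counter_try_charge()
 *   usage = read()          -> 90
 *                                       usage += 20     -> 110
 *   old = xchg(&max, 100)   -> 200
 *                                       read max        -> 100 or 200
 *   re-read usage: 110 > 90 and 100 < 200,
 *   so restore max = 200 and retry
 *
 * Either the charger sees the new limit and backs the charge out, or
 * the setter sees the grown counter, restores the old limit and
 * retries; on the retry it returns -EBUSY if the charge stuck.  The
 * two sides can never both succeed against a stale view of the other.
 */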
/**
 * page_counter_set_min - set the amount of hard-protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->min, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_set_low - set the amount of best-effort protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->low, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;
	u64 bytes;

	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}

	bytes = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);

	return 0;
}
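/*
 * Illustrative sketch of a control-file write path built on
 * page_counter_memparse() (not part of the original file; the
 * example_write_max() helper and the PAGE_COUNTER_EXAMPLES guard are
 * hypothetical).  A buffer of "max" selects PAGE_COUNTER_MAX; anything
 * else is parsed as bytes ("512M", "1G", ...) and rounded down to
 * pages before being applied as the limit.
 */
#ifdef PAGE_COUNTER_EXAMPLES
static int example_write_max(struct page_counter *pc, const char *buf)
{
	unsigned long nr_pages;
	int err;

	err = page_counter_memparse(buf, "max", &nr_pages);
	if (err)
		return err;

	/* -EBUSY here means current usage already exceeds the new limit */
	return page_counter_set_max(pc, nr_pages);
}
#endif /* PAGE_COUNTER_EXAMPLES */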

/*
 * This function calculates an individual page counter's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing.
 *
 *    This makes distribution proportional, but also work-conserving:
 *    if one counter claims much more protection than the memory it
 *    uses, the unused remainder is available to its siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A counter's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual counter's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each counter's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected,
					  bool recursive_protection)
{
	unsigned long protected;
	unsigned long ep;

	protected = min(usage, setting);
	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute
	 * shares in proportion to utilization.
	 *
	 * We are using actual utilization rather than the statically
	 * claimed protection in order to be work-conserving: claimed
	 * but unused protection is available to siblings that would
	 * otherwise get a smaller chunk than what they claimed.
	 */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/*
	 * Ok, utilized protection of all children is within what the
	 * parent affords them, so we know whatever this child claims
	 * and utilizes is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim
	 * will apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but we do return a smaller value for
	 * protection than what the group could enjoy in theory. This
	 * is okay. With the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
	ep = protected;

	/*
	 * If the children aren't claiming (all of) the protection
	 * afforded to them by the parent, distribute the remainder in
	 * proportion to the (unprotected) memory of each cgroup. That
	 * way, cgroups that aren't explicitly prioritized wrt each
	 * other compete freely over the allowance, but they are
	 * collectively protected from neighboring trees.
	 *
	 * We're using unprotected memory for the weight so that if
	 * some cgroups DO claim explicit protection, we don't protect
	 * the same bytes twice.
	 *
	 * Check both usage and parent_usage against the respective
	 * protected values. One should imply the other, but they
	 * aren't read atomically - make sure the division is sane.
	 */
	if (!recursive_protection)
		return ep;

	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected &&
	    usage > protected) {
		unsigned long unclaimed;

		unclaimed = parent_effective - siblings_protected;
		unclaimed *= usage - protected;
		unclaimed /= parent_usage - siblings_protected;

		ep += unclaimed;
	}

	return ep;
}
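/*
 * Worked example for the distribution rules documented above
 * effective_protection() (illustrative, not part of the original
 * file).  Numbers are pages; A and B are the only children.
 *
 * Overcommit (rule 3): parent_effective = 100, A has setting 80 and
 * usage 60, B has setting 60 and usage 60.  siblings_protected =
 * 60 + 60 = 120 > 100, so each child gets 60 * 100 / 120 = 50.
 *
 * Undercommit with recursive protection (rules 4/5): parent_effective
 * = 100, parent_usage = 80, A has setting 20 and usage 50, B has
 * setting 0 and usage 30.  siblings_protected = 20.  A keeps its
 * claimed 20, and the floating 100 - 20 = 80 is split by unprotected
 * usage: A gets 80 * (50 - 20) / (80 - 20) = 40 (ep = 60), B gets
 * 80 * 30 / 60 = 40 (ep = 40).  The subtree as a whole ends up with
 * the full parental budget of 100.
 */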

/**
 * page_counter_calculate_protection - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @counter: the page_counter to update
 * @recursive_protection: whether to use memory_recursiveprot behavior
 *
 * Calculates elow/emin thresholds for the given page_counter.
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
void page_counter_calculate_protection(struct page_counter *root,
				       struct page_counter *counter,
				       bool recursive_protection)
{
	unsigned long usage, parent_usage;
	struct page_counter *parent = counter->parent;

	/*
	 * Effective values of the reclaim targets are ignored so they
	 * can be stale. Have a look at mem_cgroup_protection for more
	 * details.
	 * TODO: calculation should be more robust so that we do not need
	 * that special casing.
	 */
	if (root == counter)
		return;

	usage = page_counter_read(counter);
	if (!usage)
		return;

	if (parent == root) {
		counter->emin = READ_ONCE(counter->min);
		counter->elow = READ_ONCE(counter->low);
		return;
	}

	parent_usage = page_counter_read(parent);

	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
			READ_ONCE(counter->min),
			READ_ONCE(parent->emin),
			atomic_long_read(&parent->children_min_usage),
			recursive_protection));

	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
			READ_ONCE(counter->low),
			READ_ONCE(parent->elow),
			atomic_long_read(&parent->children_low_usage),
			recursive_protection));
}
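/*
 * Illustrative sketch of the intended top-down use of
 * page_counter_calculate_protection() (not part of the original file;
 * the example_visit() helper and the PAGE_COUNTER_EXAMPLES guard are
 * hypothetical).  Because a child's emin/elow are derived from its
 * parent's already-computed values, counters must be visited parents
 * before children, e.g. in a pre-order walk of the reclaimed subtree.
 */
#ifdef PAGE_COUNTER_EXAMPLES
static void example_visit(struct page_counter *root,
			  struct page_counter *counter)
{
	unsigned long usage;

	/* recursive_protection as with cgroup2's memory_recursiveprot */
	page_counter_calculate_protection(root, counter, true);

	usage = page_counter_read(counter);
	if (!usage)
		return;

	if (usage <= READ_ONCE(counter->emin))
		return;	/* hard-protected: skip reclaim entirely */
	if (usage <= READ_ONCE(counter->elow))
		return;	/* best-effort protected: skip unless overriding memory.low */

	/* ... apply reclaim pressure to this counter's owner ... */
}
#endif /* PAGE_COUNTER_EXAMPLES */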