[linux.git] / mm / vmstat.c
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
f6ac2354
CL
2/*
3 * linux/mm/vmstat.c
4 *
5 * Manages VM statistics
6 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
2244b95a
CL
7 *
8 * zoned VM statistics
9 * Copyright (C) 2006 Silicon Graphics, Inc.,
10 * Christoph Lameter <[email protected]>
7cc36bbd 11 * Copyright (C) 2008-2014 Christoph Lameter
f6ac2354 12 */
8f32f7e5 13#include <linux/fs.h>
f6ac2354 14#include <linux/mm.h>
4e950f6f 15#include <linux/err.h>
2244b95a 16#include <linux/module.h>
5a0e3ad6 17#include <linux/slab.h>
df9ecaba 18#include <linux/cpu.h>
7cc36bbd 19#include <linux/cpumask.h>
c748e134 20#include <linux/vmstat.h>
3c486871
AM
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/debugfs.h>
e8edc6e0 24#include <linux/sched.h>
f1a5ab12 25#include <linux/math64.h>
79da826a 26#include <linux/writeback.h>
36deb0be 27#include <linux/compaction.h>
6e543d57 28#include <linux/mm_inline.h>
48c96a36
JK
29#include <linux/page_ext.h>
30#include <linux/page_owner.h>
6e543d57
LD
31
32#include "internal.h"
f6ac2354 33
1d90ca89
KW
34#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
35
4518085e
KW
36#ifdef CONFIG_NUMA
37int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
38
39/* zero numa counters within a zone */
40static void zero_zone_numa_counters(struct zone *zone)
41{
42 int item, cpu;
43
44 for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
45 atomic_long_set(&zone->vm_numa_stat[item], 0);
46 for_each_online_cpu(cpu)
47 per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
48 = 0;
49 }
50}
51
52/* zero numa counters of all the populated zones */
53static void zero_zones_numa_counters(void)
54{
55 struct zone *zone;
56
57 for_each_populated_zone(zone)
58 zero_zone_numa_counters(zone);
59}
60
61/* zero global numa counters */
62static void zero_global_numa_counters(void)
63{
64 int item;
65
66 for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
67 atomic_long_set(&vm_numa_stat[item], 0);
68}
69
70static void invalid_numa_statistics(void)
71{
72 zero_zones_numa_counters();
73 zero_global_numa_counters();
74}
75
76static DEFINE_MUTEX(vm_numa_stat_lock);
77
78int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
32927393 79 void *buffer, size_t *length, loff_t *ppos)
4518085e
KW
80{
81 int ret, oldval;
82
83 mutex_lock(&vm_numa_stat_lock);
84 if (write)
85 oldval = sysctl_vm_numa_stat;
86 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
87 if (ret || !write)
88 goto out;
89
90 if (oldval == sysctl_vm_numa_stat)
91 goto out;
92 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
93 static_branch_enable(&vm_numa_stat_key);
94 pr_info("enable numa statistics\n");
95 } else {
96 static_branch_disable(&vm_numa_stat_key);
97 invalid_numa_statistics();
98 pr_info("disable numa statistics, and clear numa counters\n");
99 }
100
101out:
102 mutex_unlock(&vm_numa_stat_lock);
103 return ret;
104}
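/*
 * Usage sketch (assuming the handler above is wired up as the
 * vm.numa_stat sysctl, i.e. /proc/sys/vm/numa_stat):
 *
 *	echo 0 > /proc/sys/vm/numa_stat		# disable and clear counters
 *	echo 1 > /proc/sys/vm/numa_stat		# re-enable collection
 *
 * Disabling flips the vm_numa_stat_key static branch off and zeroes
 * both the per-zone and the global NUMA counters via
 * invalid_numa_statistics().
 */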
105#endif
106
f8891e5e
CL
107#ifdef CONFIG_VM_EVENT_COUNTERS
108DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
109EXPORT_PER_CPU_SYMBOL(vm_event_states);
110
31f961a8 111static void sum_vm_events(unsigned long *ret)
f8891e5e 112{
9eccf2a8 113 int cpu;
f8891e5e
CL
114 int i;
115
116 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
117
31f961a8 118 for_each_online_cpu(cpu) {
f8891e5e
CL
119 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
120
f8891e5e
CL
121 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
122 ret[i] += this->event[i];
123 }
124}
125
126/*
127 * Accumulate the vm event counters across all CPUs.
128 * The result is unavoidably approximate - it can change
129 * during and after execution of this function.
130*/
131void all_vm_events(unsigned long *ret)
132{
b5be1132 133 get_online_cpus();
31f961a8 134 sum_vm_events(ret);
b5be1132 135 put_online_cpus();
f8891e5e 136}
32dd66fc 137EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 138
f8891e5e
CL
139/*
140 * Fold the foreign cpu events into our own.
141 *
142 * This is adding to the events on one processor
143 * but keeps the global counts constant.
144 */
145void vm_events_fold_cpu(int cpu)
146{
147 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
148 int i;
149
150 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
151 count_vm_events(i, fold_state->event[i]);
152 fold_state->event[i] = 0;
153 }
154}
f8891e5e
CL
155
156#endif /* CONFIG_VM_EVENT_COUNTERS */
157
2244b95a
CL
158/*
159 * Manage combined zone based / global counters
160 *
161 * vm_stat contains the global counters
162 */
75ef7184 163atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
3a321d2a 164atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
75ef7184
MG
165atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
166EXPORT_SYMBOL(vm_zone_stat);
3a321d2a 167EXPORT_SYMBOL(vm_numa_stat);
75ef7184 168EXPORT_SYMBOL(vm_node_stat);
2244b95a
CL
169
170#ifdef CONFIG_SMP
171
b44129b3 172int calculate_pressure_threshold(struct zone *zone)
88f5acf8
MG
173{
174 int threshold;
175 int watermark_distance;
176
177 /*
178 * As vmstats are not up to date, there is drift between the estimated
179 * and real values. For high thresholds and a high number of CPUs, it
180 * is possible for the min watermark to be breached while the estimated
181 * value looks fine. The pressure threshold is a reduced value such
182 * that even the maximum amount of drift will not accidentally breach
183 * the min watermark
184 */
185 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
186 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
187
188 /*
189 * Maximum threshold is 125
190 */
191 threshold = min(125, threshold);
192
193 return threshold;
194}
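/*
 * Worked example (illustrative numbers): with a low-to-min watermark
 * gap of 512 pages and 16 online CPUs, the pressure threshold is
 * max(1, 512 / 16) = 32.  The worst-case drift across all CPUs is then
 * 16 * 32 = 512 pages, which cannot exceed the low-to-min gap.
 */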
195
b44129b3 196int calculate_normal_threshold(struct zone *zone)
df9ecaba
CL
197{
198 int threshold;
199 int mem; /* memory in 128 MB units */
200
201 /*
202 * The threshold scales with the number of processors and the amount
203 * of memory per zone. More memory means that we can defer updates for
204 * longer, more processors could lead to more contention.
205 * fls() is used to have a cheap way of logarithmic scaling.
206 *
207 * Some sample thresholds:
208 *
209 * Threshold Processors (fls) Zonesize fls(mem+1)
210 * ------------------------------------------------------------------
211 * 8 1 1 0.9-1 GB 4
212 * 16 2 2 0.9-1 GB 4
213 * 20 2 2 1-2 GB 5
214 * 24 2 2 2-4 GB 6
215 * 28 2 2 4-8 GB 7
216 * 32 2 2 8-16 GB 8
217 * 4 2 2 <128M 1
218 * 30 4 3 2-4 GB 5
219 * 48 4 3 8-16 GB 8
220 * 32 8 4 1-2 GB 4
221 * 32 8 4 0.9-1GB 4
222 * 10 16 5 <128M 1
223 * 40 16 5 900M 4
224 * 70 64 7 2-4 GB 5
225 * 84 64 7 4-8 GB 6
226 * 108 512 9 4-8 GB 6
227 * 125 1024 10 8-16 GB 8
228 * 125 1024 10 16-32 GB 9
229 */
230
9705bea5 231 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
df9ecaba
CL
232
233 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
234
235 /*
236 * Maximum threshold is 125
237 */
238 threshold = min(125, threshold);
239
240 return threshold;
241}
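/*
 * Worked example (illustrative): a zone with ~1.5 GB of managed memory
 * is mem = 12 units of 128 MB, so fls(mem) = 4.  With 2 online CPUs,
 * fls(2) = 2 and threshold = 2 * 2 * (1 + 4) = 20, matching the
 * "1-2 GB, 2 processors" row of the table above.
 */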
2244b95a
CL
242
243/*
df9ecaba 244 * Refresh the thresholds for each zone.
2244b95a 245 */
a6cccdc3 246void refresh_zone_stat_thresholds(void)
2244b95a 247{
75ef7184 248 struct pglist_data *pgdat;
df9ecaba
CL
249 struct zone *zone;
250 int cpu;
251 int threshold;
252
75ef7184
MG
253 /* Zero current pgdat thresholds */
254 for_each_online_pgdat(pgdat) {
255 for_each_online_cpu(cpu) {
256 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
257 }
258 }
259
ee99c71c 260 for_each_populated_zone(zone) {
75ef7184 261 struct pglist_data *pgdat = zone->zone_pgdat;
aa454840
CL
262 unsigned long max_drift, tolerate_drift;
263
b44129b3 264 threshold = calculate_normal_threshold(zone);
df9ecaba 265
75ef7184
MG
266 for_each_online_cpu(cpu) {
267 int pgdat_threshold;
268
99dcc3e5
CL
269 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
270 = threshold;
1d90ca89 271
75ef7184
MG
272 /* Base nodestat threshold on the largest populated zone. */
273 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
274 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
275 = max(threshold, pgdat_threshold);
276 }
277
aa454840
CL
278 /*
279 * Only set percpu_drift_mark if there is a danger that
280 * NR_FREE_PAGES reports the low watermark is ok when in fact
281 * the min watermark could be breached by an allocation
282 */
283 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
284 max_drift = num_online_cpus() * threshold;
285 if (max_drift > tolerate_drift)
286 zone->percpu_drift_mark = high_wmark_pages(zone) +
287 max_drift;
df9ecaba 288 }
2244b95a
CL
289}
290
b44129b3
MG
291void set_pgdat_percpu_threshold(pg_data_t *pgdat,
292 int (*calculate_pressure)(struct zone *))
88f5acf8
MG
293{
294 struct zone *zone;
295 int cpu;
296 int threshold;
297 int i;
298
88f5acf8
MG
299 for (i = 0; i < pgdat->nr_zones; i++) {
300 zone = &pgdat->node_zones[i];
301 if (!zone->percpu_drift_mark)
302 continue;
303
b44129b3 304 threshold = (*calculate_pressure)(zone);
1d90ca89 305 for_each_online_cpu(cpu)
88f5acf8
MG
306 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
307 = threshold;
308 }
88f5acf8
MG
309}
310
2244b95a 311/*
bea04b07
JZ
312 * For use when we know that interrupts are disabled,
313 * or when we know that preemption is disabled and that
314 * particular counter cannot be updated from interrupt context.
2244b95a
CL
315 */
316void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 317 long delta)
2244b95a 318{
12938a92
CL
319 struct per_cpu_pageset __percpu *pcp = zone->pageset;
320 s8 __percpu *p = pcp->vm_stat_diff + item;
2244b95a 321 long x;
12938a92
CL
322 long t;
323
324 x = delta + __this_cpu_read(*p);
2244b95a 325
12938a92 326 t = __this_cpu_read(pcp->stat_threshold);
2244b95a 327
40610076 328 if (unlikely(abs(x) > t)) {
2244b95a
CL
329 zone_page_state_add(x, zone, item);
330 x = 0;
331 }
12938a92 332 __this_cpu_write(*p, x);
2244b95a
CL
333}
334EXPORT_SYMBOL(__mod_zone_page_state);
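/*
 * Batching example (illustrative): with a stat_threshold of 32,
 * repeated __mod_zone_page_state(zone, item, 1) calls only touch the
 * per-cpu vm_stat_diff[] byte.  When the running diff reaches 33
 * (abs(x) > t), the whole 33 is folded into the zone and global
 * atomics by zone_page_state_add() and the local diff restarts at 0,
 * so shared cachelines are written roughly once per threshold's worth
 * of updates.
 */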
335
75ef7184
MG
336void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
337 long delta)
338{
339 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
340 s8 __percpu *p = pcp->vm_node_stat_diff + item;
341 long x;
342 long t;
343
ea426c2a
RG
344 if (vmstat_item_in_bytes(item)) {
345 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
346 delta >>= PAGE_SHIFT;
347 }
348
75ef7184
MG
349 x = delta + __this_cpu_read(*p);
350
351 t = __this_cpu_read(pcp->stat_threshold);
352
40610076 353 if (unlikely(abs(x) > t)) {
75ef7184
MG
354 node_page_state_add(x, pgdat, item);
355 x = 0;
356 }
357 __this_cpu_write(*p, x);
358}
359EXPORT_SYMBOL(__mod_node_page_state);
360
2244b95a
CL
361/*
362 * Optimized increment and decrement functions.
363 *
364 * These are only for a single page and therefore can take a struct page *
365 * argument instead of struct zone *. This allows the inclusion of the code
366 * generated for page_zone(page) into the optimized functions.
367 *
368 * No overflow check is necessary and therefore the differential can be
369 * incremented or decremented in place which may allow the compilers to
370 * generate better code.
2244b95a
CL
371 * The increment or decrement is known and therefore one boundary check can
372 * be omitted.
373 *
df9ecaba
CL
374 * NOTE: These functions are very performance sensitive. Change only
375 * with care.
376 *
2244b95a
CL
377 * Some processors have inc/dec instructions that are atomic vs an interrupt.
378 * However, the code must first determine the differential location in a zone
379 * based on the processor number and then inc/dec the counter. There is no
380 * guarantee without disabling preemption that the processor will not change
381 * in between and therefore the atomicity vs. interrupt cannot be exploited
382 * in a useful way here.
383 */
c8785385 384void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 385{
12938a92
CL
386 struct per_cpu_pageset __percpu *pcp = zone->pageset;
387 s8 __percpu *p = pcp->vm_stat_diff + item;
388 s8 v, t;
2244b95a 389
908ee0f1 390 v = __this_cpu_inc_return(*p);
12938a92
CL
391 t = __this_cpu_read(pcp->stat_threshold);
392 if (unlikely(v > t)) {
393 s8 overstep = t >> 1;
df9ecaba 394
12938a92
CL
395 zone_page_state_add(v + overstep, zone, item);
396 __this_cpu_write(*p, -overstep);
2244b95a
CL
397 }
398}
ca889e6c 399
75ef7184
MG
400void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
401{
402 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
403 s8 __percpu *p = pcp->vm_node_stat_diff + item;
404 s8 v, t;
405
ea426c2a
RG
406 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
407
75ef7184
MG
408 v = __this_cpu_inc_return(*p);
409 t = __this_cpu_read(pcp->stat_threshold);
410 if (unlikely(v > t)) {
411 s8 overstep = t >> 1;
412
413 node_page_state_add(v + overstep, pgdat, item);
414 __this_cpu_write(*p, -overstep);
415 }
416}
417
ca889e6c
CL
418void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
419{
420 __inc_zone_state(page_zone(page), item);
421}
2244b95a
CL
422EXPORT_SYMBOL(__inc_zone_page_state);
423
75ef7184
MG
424void __inc_node_page_state(struct page *page, enum node_stat_item item)
425{
426 __inc_node_state(page_pgdat(page), item);
427}
428EXPORT_SYMBOL(__inc_node_page_state);
429
c8785385 430void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 431{
12938a92
CL
432 struct per_cpu_pageset __percpu *pcp = zone->pageset;
433 s8 __percpu *p = pcp->vm_stat_diff + item;
434 s8 v, t;
2244b95a 435
908ee0f1 436 v = __this_cpu_dec_return(*p);
12938a92
CL
437 t = __this_cpu_read(pcp->stat_threshold);
438 if (unlikely(v < -t)) {
439 s8 overstep = t >> 1;
2244b95a 440
12938a92
CL
441 zone_page_state_add(v - overstep, zone, item);
442 __this_cpu_write(*p, overstep);
2244b95a
CL
443 }
444}
c8785385 445
75ef7184
MG
446void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
447{
448 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
449 s8 __percpu *p = pcp->vm_node_stat_diff + item;
450 s8 v, t;
451
ea426c2a
RG
452 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
453
75ef7184
MG
454 v = __this_cpu_dec_return(*p);
455 t = __this_cpu_read(pcp->stat_threshold);
456 if (unlikely(v < -t)) {
457 s8 overstep = t >> 1;
458
459 node_page_state_add(v - overstep, pgdat, item);
460 __this_cpu_write(*p, overstep);
461 }
462}
463
c8785385
CL
464void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
465{
466 __dec_zone_state(page_zone(page), item);
467}
2244b95a
CL
468EXPORT_SYMBOL(__dec_zone_page_state);
469
75ef7184
MG
470void __dec_node_page_state(struct page *page, enum node_stat_item item)
471{
472 __dec_node_state(page_pgdat(page), item);
473}
474EXPORT_SYMBOL(__dec_node_page_state);
475
4156153c 476#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
7c839120
CL
477/*
478 * If we have cmpxchg_local support then we do not need to incur the overhead
479 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
480 *
481 * mod_state() modifies the zone counter state through atomic per cpu
482 * operations.
483 *
484 * Overstep mode specifies how overstep should handled:
485 * 0 No overstepping
486 * 1 Overstepping half of threshold
487 * -1 Overstepping minus half of threshold
488*/
75ef7184
MG
489static inline void mod_zone_state(struct zone *zone,
490 enum zone_stat_item item, long delta, int overstep_mode)
7c839120
CL
491{
492 struct per_cpu_pageset __percpu *pcp = zone->pageset;
493 s8 __percpu *p = pcp->vm_stat_diff + item;
494 long o, n, t, z;
495
496 do {
497 z = 0; /* overflow to zone counters */
498
499 /*
500 * The fetching of the stat_threshold is racy. We may apply
501 * a counter threshold to the wrong cpu if we get
d3bc2367
CL
502 * rescheduled while executing here. However, the next
503 * counter update will apply the threshold again and
504 * therefore bring the counter under the threshold again.
505 *
506 * Most of the time the thresholds are the same anyways
507 * for all cpus in a zone.
7c839120
CL
508 */
509 t = this_cpu_read(pcp->stat_threshold);
510
511 o = this_cpu_read(*p);
512 n = delta + o;
513
40610076 514 if (abs(n) > t) {
7c839120
CL
515 int os = overstep_mode * (t >> 1);
516
517 /* Overflow must be added to zone counters */
518 z = n + os;
519 n = -os;
520 }
521 } while (this_cpu_cmpxchg(*p, o, n) != o);
522
523 if (z)
524 zone_page_state_add(z, zone, item);
525}
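/*
 * Overstep example (illustrative): with t = 32, a local diff of 30 and
 * delta = +10, n = 40 exceeds the threshold.  For overstep mode 1,
 * os = 16, so z = 40 + 16 = 56 is added to the zone/global counters
 * and the per-cpu diff becomes -16.  The net effect is still +40, but
 * the counter deliberately oversteps by half a threshold so that the
 * next few increments remain purely local.
 */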
526
527void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 528 long delta)
7c839120 529{
75ef7184 530 mod_zone_state(zone, item, delta, 0);
7c839120
CL
531}
532EXPORT_SYMBOL(mod_zone_page_state);
533
7c839120
CL
534void inc_zone_page_state(struct page *page, enum zone_stat_item item)
535{
75ef7184 536 mod_zone_state(page_zone(page), item, 1, 1);
7c839120
CL
537}
538EXPORT_SYMBOL(inc_zone_page_state);
539
540void dec_zone_page_state(struct page *page, enum zone_stat_item item)
541{
75ef7184 542 mod_zone_state(page_zone(page), item, -1, -1);
7c839120
CL
543}
544EXPORT_SYMBOL(dec_zone_page_state);
75ef7184
MG
545
546static inline void mod_node_state(struct pglist_data *pgdat,
547 enum node_stat_item item, int delta, int overstep_mode)
548{
549 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
550 s8 __percpu *p = pcp->vm_node_stat_diff + item;
551 long o, n, t, z;
552
ea426c2a
RG
553 if (vmstat_item_in_bytes(item)) {
554 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
555 delta >>= PAGE_SHIFT;
556 }
557
75ef7184
MG
558 do {
559 z = 0; /* overflow to node counters */
560
561 /*
562 * The fetching of the stat_threshold is racy. We may apply
563 * a counter threshold to the wrong cpu if we get
564 * rescheduled while executing here. However, the next
565 * counter update will apply the threshold again and
566 * therefore bring the counter under the threshold again.
567 *
568 * Most of the time the thresholds are the same anyways
569 * for all cpus in a node.
570 */
571 t = this_cpu_read(pcp->stat_threshold);
572
573 o = this_cpu_read(*p);
574 n = delta + o;
575
40610076 576 if (abs(n) > t) {
75ef7184
MG
577 int os = overstep_mode * (t >> 1);
578
579 /* Overflow must be added to node counters */
580 z = n + os;
581 n = -os;
582 }
583 } while (this_cpu_cmpxchg(*p, o, n) != o);
584
585 if (z)
586 node_page_state_add(z, pgdat, item);
587}
588
589void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
590 long delta)
591{
592 mod_node_state(pgdat, item, delta, 0);
593}
594EXPORT_SYMBOL(mod_node_page_state);
595
596void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
597{
598 mod_node_state(pgdat, item, 1, 1);
599}
600
601void inc_node_page_state(struct page *page, enum node_stat_item item)
602{
603 mod_node_state(page_pgdat(page), item, 1, 1);
604}
605EXPORT_SYMBOL(inc_node_page_state);
606
607void dec_node_page_state(struct page *page, enum node_stat_item item)
608{
609 mod_node_state(page_pgdat(page), item, -1, -1);
610}
611EXPORT_SYMBOL(dec_node_page_state);
7c839120
CL
612#else
613/*
614 * Use interrupt disable to serialize counter updates
615 */
616void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 617 long delta)
7c839120
CL
618{
619 unsigned long flags;
620
621 local_irq_save(flags);
622 __mod_zone_page_state(zone, item, delta);
623 local_irq_restore(flags);
624}
625EXPORT_SYMBOL(mod_zone_page_state);
626
2244b95a
CL
627void inc_zone_page_state(struct page *page, enum zone_stat_item item)
628{
629 unsigned long flags;
630 struct zone *zone;
2244b95a
CL
631
632 zone = page_zone(page);
633 local_irq_save(flags);
ca889e6c 634 __inc_zone_state(zone, item);
2244b95a
CL
635 local_irq_restore(flags);
636}
637EXPORT_SYMBOL(inc_zone_page_state);
638
639void dec_zone_page_state(struct page *page, enum zone_stat_item item)
640{
641 unsigned long flags;
2244b95a 642
2244b95a 643 local_irq_save(flags);
a302eb4e 644 __dec_zone_page_state(page, item);
2244b95a
CL
645 local_irq_restore(flags);
646}
647EXPORT_SYMBOL(dec_zone_page_state);
648
75ef7184
MG
649void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
650{
651 unsigned long flags;
652
653 local_irq_save(flags);
654 __inc_node_state(pgdat, item);
655 local_irq_restore(flags);
656}
657EXPORT_SYMBOL(inc_node_state);
658
659void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
660 long delta)
661{
662 unsigned long flags;
663
664 local_irq_save(flags);
665 __mod_node_page_state(pgdat, item, delta);
666 local_irq_restore(flags);
667}
668EXPORT_SYMBOL(mod_node_page_state);
669
670void inc_node_page_state(struct page *page, enum node_stat_item item)
671{
672 unsigned long flags;
673 struct pglist_data *pgdat;
674
675 pgdat = page_pgdat(page);
676 local_irq_save(flags);
677 __inc_node_state(pgdat, item);
678 local_irq_restore(flags);
679}
680EXPORT_SYMBOL(inc_node_page_state);
681
682void dec_node_page_state(struct page *page, enum node_stat_item item)
683{
684 unsigned long flags;
685
686 local_irq_save(flags);
687 __dec_node_page_state(page, item);
688 local_irq_restore(flags);
689}
690EXPORT_SYMBOL(dec_node_page_state);
691#endif
7cc36bbd
CL
692
693/*
694 * Fold a differential into the global counters.
695 * Returns the number of counters updated.
696 */
3a321d2a
KW
697#ifdef CONFIG_NUMA
698static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
699{
700 int i;
701 int changes = 0;
702
703 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
704 if (zone_diff[i]) {
705 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
706 changes++;
707 }
708
709 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
710 if (numa_diff[i]) {
711 atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
712 changes++;
713 }
714
715 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
716 if (node_diff[i]) {
717 atomic_long_add(node_diff[i], &vm_node_stat[i]);
718 changes++;
719 }
720 return changes;
721}
722#else
75ef7184 723static int fold_diff(int *zone_diff, int *node_diff)
4edb0748
CL
724{
725 int i;
7cc36bbd 726 int changes = 0;
4edb0748
CL
727
728 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
75ef7184
MG
729 if (zone_diff[i]) {
730 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
731 changes++;
732 }
733
734 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
735 if (node_diff[i]) {
736 atomic_long_add(node_diff[i], &vm_node_stat[i]);
7cc36bbd
CL
737 changes++;
738 }
739 return changes;
4edb0748 740}
3a321d2a 741#endif /* CONFIG_NUMA */
4edb0748 742
2244b95a 743/*
2bb921e5 744 * Update the zone counters for the current cpu.
a7f75e25 745 *
4037d452
CL
746 * Note that refresh_cpu_vm_stats strives to only access
747 * node local memory. The per cpu pagesets on remote zones are placed
748 * in the memory local to the processor using that pageset. So the
749 * loop over all zones will access a series of cachelines local to
750 * the processor.
751 *
752 * The call to zone_page_state_add updates the cachelines with the
753 * statistics in the remote zone struct as well as the global cachelines
754 * with the global counters. These could cause remote node cache line
755 * bouncing and so should only be done when necessary.
7cc36bbd
CL
756 *
757 * The function returns the number of global counters updated.
2244b95a 758 */
0eb77e98 759static int refresh_cpu_vm_stats(bool do_pagesets)
2244b95a 760{
75ef7184 761 struct pglist_data *pgdat;
2244b95a
CL
762 struct zone *zone;
763 int i;
75ef7184 764 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
3a321d2a
KW
765#ifdef CONFIG_NUMA
766 int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
767#endif
75ef7184 768 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
7cc36bbd 769 int changes = 0;
2244b95a 770
ee99c71c 771 for_each_populated_zone(zone) {
fbc2edb0 772 struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a 773
fbc2edb0
CL
774 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
775 int v;
2244b95a 776
fbc2edb0
CL
777 v = this_cpu_xchg(p->vm_stat_diff[i], 0);
778 if (v) {
a7f75e25 779
a7f75e25 780 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 781 global_zone_diff[i] += v;
4037d452
CL
782#ifdef CONFIG_NUMA
783 /* 3 seconds idle till flush */
fbc2edb0 784 __this_cpu_write(p->expire, 3);
4037d452 785#endif
2244b95a 786 }
fbc2edb0 787 }
4037d452 788#ifdef CONFIG_NUMA
3a321d2a
KW
789 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
790 int v;
791
792 v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
793 if (v) {
794
795 atomic_long_add(v, &zone->vm_numa_stat[i]);
796 global_numa_diff[i] += v;
797 __this_cpu_write(p->expire, 3);
798 }
799 }
800
0eb77e98
CL
801 if (do_pagesets) {
802 cond_resched();
803 /*
804 * Deal with draining the remote pageset of this
805 * processor
806 *
807 * Check if there are pages remaining in this pageset;
808 * if not, then there is nothing to expire.
809 */
810 if (!__this_cpu_read(p->expire) ||
fbc2edb0 811 !__this_cpu_read(p->pcp.count))
0eb77e98 812 continue;
4037d452 813
0eb77e98
CL
814 /*
815 * We never drain zones local to this processor.
816 */
817 if (zone_to_nid(zone) == numa_node_id()) {
818 __this_cpu_write(p->expire, 0);
819 continue;
820 }
4037d452 821
0eb77e98
CL
822 if (__this_cpu_dec_return(p->expire))
823 continue;
4037d452 824
0eb77e98
CL
825 if (__this_cpu_read(p->pcp.count)) {
826 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
827 changes++;
828 }
7cc36bbd 829 }
4037d452 830#endif
2244b95a 831 }
75ef7184
MG
832
833 for_each_online_pgdat(pgdat) {
834 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
835
836 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
837 int v;
838
839 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
840 if (v) {
841 atomic_long_add(v, &pgdat->vm_stat[i]);
842 global_node_diff[i] += v;
843 }
844 }
845 }
846
3a321d2a
KW
847#ifdef CONFIG_NUMA
848 changes += fold_diff(global_zone_diff, global_numa_diff,
849 global_node_diff);
850#else
75ef7184 851 changes += fold_diff(global_zone_diff, global_node_diff);
3a321d2a 852#endif
7cc36bbd 853 return changes;
2244b95a
CL
854}
855
2bb921e5
CL
856/*
857 * Fold the data for an offline cpu into the global array.
858 * There cannot be any access by the offline cpu and therefore
859 * synchronization is simplified.
860 */
861void cpu_vm_stats_fold(int cpu)
862{
75ef7184 863 struct pglist_data *pgdat;
2bb921e5
CL
864 struct zone *zone;
865 int i;
75ef7184 866 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
3a321d2a
KW
867#ifdef CONFIG_NUMA
868 int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
869#endif
75ef7184 870 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
2bb921e5
CL
871
872 for_each_populated_zone(zone) {
873 struct per_cpu_pageset *p;
874
875 p = per_cpu_ptr(zone->pageset, cpu);
876
877 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
878 if (p->vm_stat_diff[i]) {
879 int v;
880
881 v = p->vm_stat_diff[i];
882 p->vm_stat_diff[i] = 0;
883 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 884 global_zone_diff[i] += v;
2bb921e5 885 }
3a321d2a
KW
886
887#ifdef CONFIG_NUMA
888 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
889 if (p->vm_numa_stat_diff[i]) {
890 int v;
891
892 v = p->vm_numa_stat_diff[i];
893 p->vm_numa_stat_diff[i] = 0;
894 atomic_long_add(v, &zone->vm_numa_stat[i]);
895 global_numa_diff[i] += v;
896 }
897#endif
2bb921e5
CL
898 }
899
75ef7184
MG
900 for_each_online_pgdat(pgdat) {
901 struct per_cpu_nodestat *p;
902
903 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
904
905 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
906 if (p->vm_node_stat_diff[i]) {
907 int v;
908
909 v = p->vm_node_stat_diff[i];
910 p->vm_node_stat_diff[i] = 0;
911 atomic_long_add(v, &pgdat->vm_stat[i]);
912 global_node_diff[i] += v;
913 }
914 }
915
3a321d2a
KW
916#ifdef CONFIG_NUMA
917 fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
918#else
75ef7184 919 fold_diff(global_zone_diff, global_node_diff);
3a321d2a 920#endif
2bb921e5
CL
921}
922
40f4b1ea
CS
923/*
924 * This is only called if !populated_zone(zone), which implies no other users of
925 * pset->vm_stat_diff[] exist.
926 */
5a883813
MK
927void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
928{
929 int i;
930
931 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
932 if (pset->vm_stat_diff[i]) {
933 int v = pset->vm_stat_diff[i];
934 pset->vm_stat_diff[i] = 0;
935 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 936 atomic_long_add(v, &vm_zone_stat[i]);
5a883813 937 }
3a321d2a
KW
938
939#ifdef CONFIG_NUMA
940 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
941 if (pset->vm_numa_stat_diff[i]) {
942 int v = pset->vm_numa_stat_diff[i];
943
944 pset->vm_numa_stat_diff[i] = 0;
945 atomic_long_add(v, &zone->vm_numa_stat[i]);
946 atomic_long_add(v, &vm_numa_stat[i]);
947 }
948#endif
5a883813 949}
2244b95a
CL
950#endif
951
ca889e6c 952#ifdef CONFIG_NUMA
3a321d2a
KW
953void __inc_numa_state(struct zone *zone,
954 enum numa_stat_item item)
955{
956 struct per_cpu_pageset __percpu *pcp = zone->pageset;
1d90ca89
KW
957 u16 __percpu *p = pcp->vm_numa_stat_diff + item;
958 u16 v;
3a321d2a
KW
959
960 v = __this_cpu_inc_return(*p);
3a321d2a 961
1d90ca89
KW
962 if (unlikely(v > NUMA_STATS_THRESHOLD)) {
963 zone_numa_state_add(v, zone, item);
964 __this_cpu_write(*p, 0);
3a321d2a
KW
965 }
966}
967
c2d42c16 968/*
75ef7184
MG
969 * Determine the per node value of a stat item. This function
970 * is called frequently in a NUMA machine, so try to be as
971 * frugal as possible.
c2d42c16 972 */
75ef7184
MG
973unsigned long sum_zone_node_page_state(int node,
974 enum zone_stat_item item)
c2d42c16
AM
975{
976 struct zone *zones = NODE_DATA(node)->node_zones;
e87d59f7
JK
977 int i;
978 unsigned long count = 0;
c2d42c16 979
e87d59f7
JK
980 for (i = 0; i < MAX_NR_ZONES; i++)
981 count += zone_page_state(zones + i, item);
982
983 return count;
c2d42c16
AM
984}
985
63803222
KW
986/*
987 * Determine the per node value of a numa stat item. To avoid deviation,
988 * the per cpu stat number in vm_numa_stat_diff[] is also included.
989 */
3a321d2a
KW
990unsigned long sum_zone_numa_state(int node,
991 enum numa_stat_item item)
992{
993 struct zone *zones = NODE_DATA(node)->node_zones;
994 int i;
995 unsigned long count = 0;
996
997 for (i = 0; i < MAX_NR_ZONES; i++)
63803222 998 count += zone_numa_state_snapshot(zones + i, item);
3a321d2a
KW
999
1000 return count;
1001}
1002
75ef7184
MG
1003/*
1004 * Determine the per node value of a stat item.
1005 */
ea426c2a
RG
1006unsigned long node_page_state_pages(struct pglist_data *pgdat,
1007 enum node_stat_item item)
75ef7184
MG
1008{
1009 long x = atomic_long_read(&pgdat->vm_stat[item]);
1010#ifdef CONFIG_SMP
1011 if (x < 0)
1012 x = 0;
1013#endif
1014 return x;
1015}
ea426c2a
RG
1016
1017unsigned long node_page_state(struct pglist_data *pgdat,
1018 enum node_stat_item item)
1019{
1020 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1021
1022 return node_page_state_pages(pgdat, item);
1023}
ca889e6c
CL
1024#endif
1025
d7a5752c 1026#ifdef CONFIG_COMPACTION
36deb0be 1027
d7a5752c
MG
1028struct contig_page_info {
1029 unsigned long free_pages;
1030 unsigned long free_blocks_total;
1031 unsigned long free_blocks_suitable;
1032};
1033
1034/*
1035 * Calculate the number of free pages in a zone, how many contiguous
1036 * pages are free and how many are large enough to satisfy an allocation of
1037 * the target size. Note that this function makes no attempt to estimate
1038 * how many suitable free blocks there *might* be if MOVABLE pages were
1039 * migrated. Calculating that is possible, but expensive and can be
1040 * figured out from userspace
1041 */
1042static void fill_contig_page_info(struct zone *zone,
1043 unsigned int suitable_order,
1044 struct contig_page_info *info)
1045{
1046 unsigned int order;
1047
1048 info->free_pages = 0;
1049 info->free_blocks_total = 0;
1050 info->free_blocks_suitable = 0;
1051
1052 for (order = 0; order < MAX_ORDER; order++) {
1053 unsigned long blocks;
1054
1055 /* Count number of free blocks */
1056 blocks = zone->free_area[order].nr_free;
1057 info->free_blocks_total += blocks;
1058
1059 /* Count free base pages */
1060 info->free_pages += blocks << order;
1061
1062 /* Count the suitable free blocks */
1063 if (order >= suitable_order)
1064 info->free_blocks_suitable += blocks <<
1065 (order - suitable_order);
1066 }
1067}
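/*
 * Example (illustrative): with suitable_order = 3, a free_area[4] list
 * holding nr_free = 2 blocks contributes 2 to free_blocks_total,
 * 2 << 4 = 32 to free_pages and 2 << (4 - 3) = 4 to
 * free_blocks_suitable, since each order-4 block can satisfy two
 * order-3 requests.
 */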
f1a5ab12
MG
1068
1069/*
1070 * A fragmentation index only makes sense if an allocation of a requested
1071 * size would fail. If that is true, the fragmentation index indicates
1072 * whether external fragmentation or a lack of memory was the problem.
1073 * The value can be used to determine if page reclaim or compaction
1074 * should be used
1075 */
56de7263 1076static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
f1a5ab12
MG
1077{
1078 unsigned long requested = 1UL << order;
1079
88d6ac40
WY
1080 if (WARN_ON_ONCE(order >= MAX_ORDER))
1081 return 0;
1082
f1a5ab12
MG
1083 if (!info->free_blocks_total)
1084 return 0;
1085
1086 /* Fragmentation index only makes sense when a request would fail */
1087 if (info->free_blocks_suitable)
1088 return -1000;
1089
1090 /*
1091 * Index is between 0 and 1 so return within 3 decimal places
1092 *
1093 * 0 => allocation would fail due to lack of memory
1094 * 1 => allocation would fail due to fragmentation
1095 */
1096 return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
1097}
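/*
 * Worked example (illustrative): an order-4 request (requested = 16)
 * against free_pages = 100, free_blocks_total = 50 and no suitable
 * block gives
 *
 *	1000 - (1000 + 100 * 1000 / 16) / 50 = 1000 - 145 = 855
 *
 * i.e. an index of 0.855, pointing at external fragmentation rather
 * than a simple lack of memory.
 */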
56de7263 1098
facdaa91
NG
1099/*
1100 * Calculates external fragmentation within a zone wrt the given order.
1101 * It is defined as the percentage of pages found in blocks of size
1102 * less than 1 << order. It returns values in range [0, 100].
1103 */
d34c0a75 1104unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
facdaa91
NG
1105{
1106 struct contig_page_info info;
1107
1108 fill_contig_page_info(zone, order, &info);
1109 if (info.free_pages == 0)
1110 return 0;
1111
1112 return div_u64((info.free_pages -
1113 (info.free_blocks_suitable << order)) * 100,
1114 info.free_pages);
1115}
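/*
 * Example (illustrative): with free_pages = 100 and
 * free_blocks_suitable = 3 at order 4, the pages held in large-enough
 * blocks are 3 << 4 = 48, so extfrag_for_order() returns
 * (100 - 48) * 100 / 100 = 52, i.e. 52% of the free pages sit in
 * blocks smaller than the requested order.
 */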
1116
56de7263
MG
1117/* Same as __fragmentation index but allocs contig_page_info on stack */
1118int fragmentation_index(struct zone *zone, unsigned int order)
1119{
1120 struct contig_page_info info;
1121
1122 fill_contig_page_info(zone, order, &info);
1123 return __fragmentation_index(order, &info);
1124}
d7a5752c
MG
1125#endif
1126
ebc5d83d
KK
1127#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1128 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
fa25c503
KM
1129#ifdef CONFIG_ZONE_DMA
1130#define TEXT_FOR_DMA(xx) xx "_dma",
1131#else
1132#define TEXT_FOR_DMA(xx)
1133#endif
1134
1135#ifdef CONFIG_ZONE_DMA32
1136#define TEXT_FOR_DMA32(xx) xx "_dma32",
1137#else
1138#define TEXT_FOR_DMA32(xx)
1139#endif
1140
1141#ifdef CONFIG_HIGHMEM
1142#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1143#else
1144#define TEXT_FOR_HIGHMEM(xx)
1145#endif
1146
1147#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1148 TEXT_FOR_HIGHMEM(xx) xx "_movable",
1149
1150const char * const vmstat_text[] = {
8d92890b 1151 /* enum zone_stat_item counters */
fa25c503 1152 "nr_free_pages",
71c799f4
MK
1153 "nr_zone_inactive_anon",
1154 "nr_zone_active_anon",
1155 "nr_zone_inactive_file",
1156 "nr_zone_active_file",
1157 "nr_zone_unevictable",
5a1c84b4 1158 "nr_zone_write_pending",
fa25c503 1159 "nr_mlock",
fa25c503 1160 "nr_bounce",
91537fee
MK
1161#if IS_ENABLED(CONFIG_ZSMALLOC)
1162 "nr_zspages",
1163#endif
3a321d2a
KW
1164 "nr_free_cma",
1165
1166 /* enum numa_stat_item counters */
fa25c503
KM
1167#ifdef CONFIG_NUMA
1168 "numa_hit",
1169 "numa_miss",
1170 "numa_foreign",
1171 "numa_interleave",
1172 "numa_local",
1173 "numa_other",
1174#endif
09316c09 1175
9d7ea9a2 1176 /* enum node_stat_item counters */
599d0c95
MG
1177 "nr_inactive_anon",
1178 "nr_active_anon",
1179 "nr_inactive_file",
1180 "nr_active_file",
1181 "nr_unevictable",
385386cf
JW
1182 "nr_slab_reclaimable",
1183 "nr_slab_unreclaimable",
599d0c95
MG
1184 "nr_isolated_anon",
1185 "nr_isolated_file",
68d48e6a 1186 "workingset_nodes",
170b04b7
JK
1187 "workingset_refault_anon",
1188 "workingset_refault_file",
1189 "workingset_activate_anon",
1190 "workingset_activate_file",
1191 "workingset_restore_anon",
1192 "workingset_restore_file",
1e6b1085 1193 "workingset_nodereclaim",
50658e2e
MG
1194 "nr_anon_pages",
1195 "nr_mapped",
11fb9989
MG
1196 "nr_file_pages",
1197 "nr_dirty",
1198 "nr_writeback",
1199 "nr_writeback_temp",
1200 "nr_shmem",
1201 "nr_shmem_hugepages",
1202 "nr_shmem_pmdmapped",
60fbf0ab
SL
1203 "nr_file_hugepages",
1204 "nr_file_pmdmapped",
11fb9989 1205 "nr_anon_transparent_hugepages",
c4a25635
MG
1206 "nr_vmscan_write",
1207 "nr_vmscan_immediate_reclaim",
1208 "nr_dirtied",
1209 "nr_written",
b29940c1 1210 "nr_kernel_misc_reclaimable",
1970dc6f
JH
1211 "nr_foll_pin_acquired",
1212 "nr_foll_pin_released",
991e7673
SB
1213 "nr_kernel_stack",
1214#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1215 "nr_shadow_call_stack",
1216#endif
f0c0c115 1217 "nr_page_table_pages",
b6038942
SB
1218#ifdef CONFIG_SWAP
1219 "nr_swapcached",
1220#endif
599d0c95 1221
09316c09 1222 /* enum writeback_stat_item counters */
fa25c503
KM
1223 "nr_dirty_threshold",
1224 "nr_dirty_background_threshold",
1225
ebc5d83d 1226#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1227 /* enum vm_event_item counters */
fa25c503
KM
1228 "pgpgin",
1229 "pgpgout",
1230 "pswpin",
1231 "pswpout",
1232
1233 TEXTS_FOR_ZONES("pgalloc")
7cc30fcf
MG
1234 TEXTS_FOR_ZONES("allocstall")
1235 TEXTS_FOR_ZONES("pgskip")
fa25c503
KM
1236
1237 "pgfree",
1238 "pgactivate",
1239 "pgdeactivate",
f7ad2a6c 1240 "pglazyfree",
fa25c503
KM
1241
1242 "pgfault",
1243 "pgmajfault",
854e9ed0 1244 "pglazyfreed",
fa25c503 1245
599d0c95 1246 "pgrefill",
798a6b87 1247 "pgreuse",
599d0c95
MG
1248 "pgsteal_kswapd",
1249 "pgsteal_direct",
1250 "pgscan_kswapd",
1251 "pgscan_direct",
68243e76 1252 "pgscan_direct_throttle",
497a6c1b
JW
1253 "pgscan_anon",
1254 "pgscan_file",
1255 "pgsteal_anon",
1256 "pgsteal_file",
fa25c503
KM
1257
1258#ifdef CONFIG_NUMA
1259 "zone_reclaim_failed",
1260#endif
1261 "pginodesteal",
1262 "slabs_scanned",
fa25c503
KM
1263 "kswapd_inodesteal",
1264 "kswapd_low_wmark_hit_quickly",
1265 "kswapd_high_wmark_hit_quickly",
fa25c503 1266 "pageoutrun",
fa25c503
KM
1267
1268 "pgrotated",
1269
5509a5d2
DH
1270 "drop_pagecache",
1271 "drop_slab",
8e675f7a 1272 "oom_kill",
5509a5d2 1273
03c5a6e1
MG
1274#ifdef CONFIG_NUMA_BALANCING
1275 "numa_pte_updates",
72403b4a 1276 "numa_huge_pte_updates",
03c5a6e1
MG
1277 "numa_hint_faults",
1278 "numa_hint_faults_local",
1279 "numa_pages_migrated",
1280#endif
5647bc29
MG
1281#ifdef CONFIG_MIGRATION
1282 "pgmigrate_success",
1283 "pgmigrate_fail",
1a5bae25
AK
1284 "thp_migration_success",
1285 "thp_migration_fail",
1286 "thp_migration_split",
5647bc29 1287#endif
fa25c503 1288#ifdef CONFIG_COMPACTION
397487db
MG
1289 "compact_migrate_scanned",
1290 "compact_free_scanned",
1291 "compact_isolated",
fa25c503
KM
1292 "compact_stall",
1293 "compact_fail",
1294 "compact_success",
698b1b30 1295 "compact_daemon_wake",
7f354a54
DR
1296 "compact_daemon_migrate_scanned",
1297 "compact_daemon_free_scanned",
fa25c503
KM
1298#endif
1299
1300#ifdef CONFIG_HUGETLB_PAGE
1301 "htlb_buddy_alloc_success",
1302 "htlb_buddy_alloc_fail",
1303#endif
1304 "unevictable_pgs_culled",
1305 "unevictable_pgs_scanned",
1306 "unevictable_pgs_rescued",
1307 "unevictable_pgs_mlocked",
1308 "unevictable_pgs_munlocked",
1309 "unevictable_pgs_cleared",
1310 "unevictable_pgs_stranded",
fa25c503
KM
1311
1312#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1313 "thp_fault_alloc",
1314 "thp_fault_fallback",
85b9f46e 1315 "thp_fault_fallback_charge",
fa25c503
KM
1316 "thp_collapse_alloc",
1317 "thp_collapse_alloc_failed",
95ecedcd 1318 "thp_file_alloc",
dcdf11ee 1319 "thp_file_fallback",
85b9f46e 1320 "thp_file_fallback_charge",
95ecedcd 1321 "thp_file_mapped",
122afea9
KS
1322 "thp_split_page",
1323 "thp_split_page_failed",
f9719a03 1324 "thp_deferred_split_page",
122afea9 1325 "thp_split_pmd",
ce9311cf
YX
1326#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1327 "thp_split_pud",
1328#endif
d8a8e1f0
KS
1329 "thp_zero_page_alloc",
1330 "thp_zero_page_alloc_failed",
225311a4 1331 "thp_swpout",
fe490cc0 1332 "thp_swpout_fallback",
fa25c503 1333#endif
09316c09
KK
1334#ifdef CONFIG_MEMORY_BALLOON
1335 "balloon_inflate",
1336 "balloon_deflate",
1337#ifdef CONFIG_BALLOON_COMPACTION
1338 "balloon_migrate",
1339#endif
1340#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1341#ifdef CONFIG_DEBUG_TLBFLUSH
9824cf97
DH
1342 "nr_tlb_remote_flush",
1343 "nr_tlb_remote_flush_received",
1344 "nr_tlb_local_flush_all",
1345 "nr_tlb_local_flush_one",
ec659934 1346#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1347
4f115147
DB
1348#ifdef CONFIG_DEBUG_VM_VMACACHE
1349 "vmacache_find_calls",
1350 "vmacache_find_hits",
1351#endif
cbc65df2
YH
1352#ifdef CONFIG_SWAP
1353 "swap_ra",
1354 "swap_ra_hit",
1355#endif
ebc5d83d 1356#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1357};
ebc5d83d 1358#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1359
3c486871
AM
1360#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1361 defined(CONFIG_PROC_FS)
1362static void *frag_start(struct seq_file *m, loff_t *pos)
1363{
1364 pg_data_t *pgdat;
1365 loff_t node = *pos;
1366
1367 for (pgdat = first_online_pgdat();
1368 pgdat && node;
1369 pgdat = next_online_pgdat(pgdat))
1370 --node;
1371
1372 return pgdat;
1373}
1374
1375static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1376{
1377 pg_data_t *pgdat = (pg_data_t *)arg;
1378
1379 (*pos)++;
1380 return next_online_pgdat(pgdat);
1381}
1382
1383static void frag_stop(struct seq_file *m, void *arg)
1384{
1385}
1386
b2bd8598
DR
1387/*
1388 * Walk zones in a node and print using a callback.
1389 * If @assert_populated is true, only use callback for zones that are populated.
1390 */
3c486871 1391static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1392 bool assert_populated, bool nolock,
3c486871
AM
1393 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1394{
1395 struct zone *zone;
1396 struct zone *node_zones = pgdat->node_zones;
1397 unsigned long flags;
1398
1399 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1400 if (assert_populated && !populated_zone(zone))
3c486871
AM
1401 continue;
1402
727c080f
VM
1403 if (!nolock)
1404 spin_lock_irqsave(&zone->lock, flags);
3c486871 1405 print(m, pgdat, zone);
727c080f
VM
1406 if (!nolock)
1407 spin_unlock_irqrestore(&zone->lock, flags);
3c486871
AM
1408 }
1409}
1410#endif
1411
d7a5752c 1412#ifdef CONFIG_PROC_FS
467c996c
MG
1413static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1414 struct zone *zone)
1415{
1416 int order;
1417
1418 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1419 for (order = 0; order < MAX_ORDER; ++order)
1420 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1421 seq_putc(m, '\n');
1422}
1423
1424/*
1425 * This walks the free areas for each zone.
1426 */
1427static int frag_show(struct seq_file *m, void *arg)
1428{
1429 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1430 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
467c996c
MG
1431 return 0;
1432}
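/*
 * frag_show() backs the buddy free-list listing (assumed to be exposed
 * as /proc/buddyinfo); a line looks roughly like, with made-up counts:
 *
 *	Node 0, zone   Normal    216     55    189    101     84 ...
 *
 * one column of free block counts per order from 0 to MAX_ORDER - 1.
 */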
1433
1434static void pagetypeinfo_showfree_print(struct seq_file *m,
1435 pg_data_t *pgdat, struct zone *zone)
1436{
1437 int order, mtype;
1438
1439 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1440 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1441 pgdat->node_id,
1442 zone->name,
1443 migratetype_names[mtype]);
1444 for (order = 0; order < MAX_ORDER; ++order) {
1445 unsigned long freecount = 0;
1446 struct free_area *area;
1447 struct list_head *curr;
93b3a674 1448 bool overflow = false;
467c996c
MG
1449
1450 area = &(zone->free_area[order]);
1451
93b3a674
MH
1452 list_for_each(curr, &area->free_list[mtype]) {
1453 /*
1454 * Cap the free_list iteration because it might
1455 * be really large and we are under a spinlock
1456 * so a long time spent here could trigger a
1457 * hard lockup detector. Anyway this is a
1458 * debugging tool so knowing there is a handful
1459 * of pages of this order should be more than
1460 * sufficient.
1461 */
1462 if (++freecount >= 100000) {
1463 overflow = true;
1464 break;
1465 }
1466 }
1467 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1468 spin_unlock_irq(&zone->lock);
1469 cond_resched();
1470 spin_lock_irq(&zone->lock);
467c996c 1471 }
f6ac2354
CL
1472 seq_putc(m, '\n');
1473 }
467c996c
MG
1474}
1475
1476/* Print out the free pages at each order for each migratetype */
1477static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
1478{
1479 int order;
1480 pg_data_t *pgdat = (pg_data_t *)arg;
1481
1482 /* Print header */
1483 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1484 for (order = 0; order < MAX_ORDER; ++order)
1485 seq_printf(m, "%6d ", order);
1486 seq_putc(m, '\n');
1487
727c080f 1488 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1489
1490 return 0;
1491}
1492
1493static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1494 pg_data_t *pgdat, struct zone *zone)
1495{
1496 int mtype;
1497 unsigned long pfn;
1498 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1499 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1500 unsigned long count[MIGRATE_TYPES] = { 0, };
1501
1502 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1503 struct page *page;
1504
d336e94e
MH
1505 page = pfn_to_online_page(pfn);
1506 if (!page)
467c996c
MG
1507 continue;
1508
a91c43c7
JK
1509 if (page_zone(page) != zone)
1510 continue;
1511
467c996c
MG
1512 mtype = get_pageblock_migratetype(page);
1513
e80d6a24
MG
1514 if (mtype < MIGRATE_TYPES)
1515 count[mtype]++;
467c996c
MG
1516 }
1517
1518 /* Print counts */
1519 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1520 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1521 seq_printf(m, "%12lu ", count[mtype]);
1522 seq_putc(m, '\n');
1523}
1524
f113e641 1525/* Print out the number of pageblocks for each migratetype */
467c996c
MG
1526static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1527{
1528 int mtype;
1529 pg_data_t *pgdat = (pg_data_t *)arg;
1530
1531 seq_printf(m, "\n%-23s", "Number of blocks type ");
1532 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1533 seq_printf(m, "%12s ", migratetype_names[mtype]);
1534 seq_putc(m, '\n');
727c080f
VM
1535 walk_zones_in_node(m, pgdat, true, false,
1536 pagetypeinfo_showblockcount_print);
467c996c
MG
1537
1538 return 0;
1539}
1540
48c96a36
JK
1541/*
1542 * Print out the number of pageblocks for each migratetype that contain pages
1543 * of other types. This gives an indication of how well fallbacks are being
1544 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1545 * to determine what is going on
1546 */
1547static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1548{
1549#ifdef CONFIG_PAGE_OWNER
1550 int mtype;
1551
7dd80b8a 1552 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1553 return;
1554
1555 drain_all_pages(NULL);
1556
1557 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1558 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1559 seq_printf(m, "%12s ", migratetype_names[mtype]);
1560 seq_putc(m, '\n');
1561
727c080f
VM
1562 walk_zones_in_node(m, pgdat, true, true,
1563 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1564#endif /* CONFIG_PAGE_OWNER */
1565}
1566
467c996c
MG
1567/*
1568 * This prints out statistics in relation to grouping pages by mobility.
1569 * It is expensive to collect so do not constantly read the file.
1570 */
1571static int pagetypeinfo_show(struct seq_file *m, void *arg)
1572{
1573 pg_data_t *pgdat = (pg_data_t *)arg;
1574
41b25a37 1575 /* check memoryless node */
a47b53c5 1576 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1577 return 0;
1578
467c996c
MG
1579 seq_printf(m, "Page block order: %d\n", pageblock_order);
1580 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1581 seq_putc(m, '\n');
1582 pagetypeinfo_showfree(m, pgdat);
1583 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1584 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1585
f6ac2354
CL
1586 return 0;
1587}
1588
8f32f7e5 1589static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1590 .start = frag_start,
1591 .next = frag_next,
1592 .stop = frag_stop,
1593 .show = frag_show,
1594};
1595
74e2e8e8 1596static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1597 .start = frag_start,
1598 .next = frag_next,
1599 .stop = frag_stop,
1600 .show = pagetypeinfo_show,
1601};
1602
e2ecc8a7
MG
1603static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1604{
1605 int zid;
1606
1607 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1608 struct zone *compare = &pgdat->node_zones[zid];
1609
1610 if (populated_zone(compare))
1611 return zone == compare;
1612 }
1613
e2ecc8a7
MG
1614 return false;
1615}
1616
467c996c
MG
1617static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1618 struct zone *zone)
f6ac2354 1619{
467c996c
MG
1620 int i;
1621 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1622 if (is_zone_first_populated(pgdat, zone)) {
1623 seq_printf(m, "\n per-node stats");
1624 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
69473e5d
MS
1625 unsigned long pages = node_page_state_pages(pgdat, i);
1626
1627 if (vmstat_item_print_in_thp(i))
1628 pages /= HPAGE_PMD_NR;
9d7ea9a2 1629 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1630 pages);
e2ecc8a7
MG
1631 }
1632 }
467c996c
MG
1633 seq_printf(m,
1634 "\n pages free %lu"
1635 "\n min %lu"
1636 "\n low %lu"
1637 "\n high %lu"
467c996c 1638 "\n spanned %lu"
9feedc9d 1639 "\n present %lu"
3c381db1
DH
1640 "\n managed %lu"
1641 "\n cma %lu",
88f5acf8 1642 zone_page_state(zone, NR_FREE_PAGES),
41858966
MG
1643 min_wmark_pages(zone),
1644 low_wmark_pages(zone),
1645 high_wmark_pages(zone),
467c996c 1646 zone->spanned_pages,
9feedc9d 1647 zone->present_pages,
3c381db1
DH
1648 zone_managed_pages(zone),
1649 zone_cma_pages(zone));
467c996c 1650
467c996c 1651 seq_printf(m,
3484b2de 1652 "\n protection: (%ld",
467c996c
MG
1653 zone->lowmem_reserve[0]);
1654 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1655 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1656 seq_putc(m, ')');
1657
a8a4b7ae
BH
1658 /* If unpopulated, no other information is useful */
1659 if (!populated_zone(zone)) {
1660 seq_putc(m, '\n');
1661 return;
1662 }
1663
7dfb8bf3 1664 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1665 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1666 zone_page_state(zone, i));
7dfb8bf3 1667
3a321d2a
KW
1668#ifdef CONFIG_NUMA
1669 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
9d7ea9a2
KK
1670 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
1671 zone_numa_state_snapshot(zone, i));
3a321d2a
KW
1672#endif
1673
7dfb8bf3 1674 seq_printf(m, "\n pagesets");
467c996c
MG
1675 for_each_online_cpu(i) {
1676 struct per_cpu_pageset *pageset;
467c996c 1677
99dcc3e5 1678 pageset = per_cpu_ptr(zone->pageset, i);
3dfa5721
CL
1679 seq_printf(m,
1680 "\n cpu: %i"
1681 "\n count: %i"
1682 "\n high: %i"
1683 "\n batch: %i",
1684 i,
1685 pageset->pcp.count,
1686 pageset->pcp.high,
1687 pageset->pcp.batch);
df9ecaba 1688#ifdef CONFIG_SMP
467c996c
MG
1689 seq_printf(m, "\n vm stats threshold: %d",
1690 pageset->stat_threshold);
df9ecaba 1691#endif
f6ac2354 1692 }
467c996c 1693 seq_printf(m,
599d0c95 1694 "\n node_unreclaimable: %u"
3a50d14d 1695 "\n start_pfn: %lu",
c73322d0 1696 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1697 zone->zone_start_pfn);
467c996c
MG
1698 seq_putc(m, '\n');
1699}
1700
1701/*
b2bd8598
DR
1702 * Output information about zones in @pgdat. All zones are printed regardless
1703 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1704 * set of all zones and userspace would not be aware of such zones if they are
1705 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1706 */
1707static int zoneinfo_show(struct seq_file *m, void *arg)
1708{
1709 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1710 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1711 return 0;
1712}
1713
5c9fe628 1714static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1715 .start = frag_start, /* iterate over all zones. The same as in
1716 * fragmentation. */
1717 .next = frag_next,
1718 .stop = frag_stop,
1719 .show = zoneinfo_show,
1720};
1721
9d7ea9a2
KK
1722#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1723 NR_VM_NUMA_STAT_ITEMS + \
1724 NR_VM_NODE_STAT_ITEMS + \
1725 NR_VM_WRITEBACK_STAT_ITEMS + \
1726 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1727 NR_VM_EVENT_ITEMS : 0))
79da826a 1728
f6ac2354
CL
1729static void *vmstat_start(struct seq_file *m, loff_t *pos)
1730{
2244b95a 1731 unsigned long *v;
9d7ea9a2 1732 int i;
f6ac2354 1733
9d7ea9a2 1734 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1735 return NULL;
79da826a 1736
9d7ea9a2
KK
1737 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1738 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1739 m->private = v;
1740 if (!v)
f6ac2354 1741 return ERR_PTR(-ENOMEM);
2244b95a 1742 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1743 v[i] = global_zone_page_state(i);
79da826a
MR
1744 v += NR_VM_ZONE_STAT_ITEMS;
1745
3a321d2a
KW
1746#ifdef CONFIG_NUMA
1747 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
1748 v[i] = global_numa_state(i);
1749 v += NR_VM_NUMA_STAT_ITEMS;
1750#endif
1751
69473e5d 1752 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1753 v[i] = global_node_page_state_pages(i);
69473e5d
MS
1754 if (vmstat_item_print_in_thp(i))
1755 v[i] /= HPAGE_PMD_NR;
1756 }
75ef7184
MG
1757 v += NR_VM_NODE_STAT_ITEMS;
1758
79da826a
MR
1759 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1760 v + NR_DIRTY_THRESHOLD);
1761 v += NR_VM_WRITEBACK_STAT_ITEMS;
1762
f8891e5e 1763#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1764 all_vm_events(v);
1765 v[PGPGIN] /= 2; /* sectors -> kbytes */
1766 v[PGPGOUT] /= 2;
f8891e5e 1767#endif
ff8b16d7 1768 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1769}
1770
1771static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1772{
1773 (*pos)++;
9d7ea9a2 1774 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1775 return NULL;
1776 return (unsigned long *)m->private + *pos;
1777}
1778
1779static int vmstat_show(struct seq_file *m, void *arg)
1780{
1781 unsigned long *l = arg;
1782 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1783
1784 seq_puts(m, vmstat_text[off]);
75ba1d07 1785 seq_put_decimal_ull(m, " ", *l);
68ba0326 1786 seq_putc(m, '\n');
8d92890b
N
1787
1788 if (off == NR_VMSTAT_ITEMS - 1) {
1789 /*
1790 * We've come to the end - add any deprecated counters to avoid
1791 * breaking userspace which might depend on them being present.
1792 */
1793 seq_puts(m, "nr_unstable 0\n");
1794 }
f6ac2354
CL
1795 return 0;
1796}
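/*
 * vmstat_show() emits one "name value" pair per /proc/vmstat line,
 * e.g. (made-up values):
 *
 *	nr_free_pages 214621
 *	nr_zone_inactive_anon 52103
 *	pgfault 18493273
 *
 * with the compatibility "nr_unstable 0" line appended at the end.
 */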
1797
1798static void vmstat_stop(struct seq_file *m, void *arg)
1799{
1800 kfree(m->private);
1801 m->private = NULL;
1802}
1803
b6aa44ab 1804static const struct seq_operations vmstat_op = {
f6ac2354
CL
1805 .start = vmstat_start,
1806 .next = vmstat_next,
1807 .stop = vmstat_stop,
1808 .show = vmstat_show,
1809};
f6ac2354
CL
1810#endif /* CONFIG_PROC_FS */
1811
df9ecaba 1812#ifdef CONFIG_SMP
d1187ed2 1813static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1814int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1815
52b6f46b
HD
1816#ifdef CONFIG_PROC_FS
1817static void refresh_vm_stats(struct work_struct *work)
1818{
1819 refresh_cpu_vm_stats(true);
1820}
1821
1822int vmstat_refresh(struct ctl_table *table, int write,
32927393 1823 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1824{
1825 long val;
1826 int err;
1827 int i;
1828
1829 /*
1830 * The regular update, every sysctl_stat_interval, may come later
1831 * than expected: leaving a significant amount in per_cpu buckets.
1832 * This is particularly misleading when checking a quantity of HUGE
1833 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1834 * which can equally be echo'ed to or cat'ted from (by root),
1835 * can be used to update the stats just before reading them.
1836 *
c41f012a 1837 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1838 * transiently negative values, report an error here if any of
1839 * the stats is negative, so we know to go looking for imbalance.
1840 */
1841 err = schedule_on_each_cpu(refresh_vm_stats);
1842 if (err)
1843 return err;
1844 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75ef7184 1845 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1846 if (val < 0) {
c822f622 1847 pr_warn("%s: %s %ld\n",
9d7ea9a2 1848 __func__, zone_stat_name(i), val);
c822f622 1849 err = -EINVAL;
52b6f46b
HD
1850 }
1851 }
3a321d2a
KW
1852#ifdef CONFIG_NUMA
1853 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
1854 val = atomic_long_read(&vm_numa_stat[i]);
1855 if (val < 0) {
1856 pr_warn("%s: %s %ld\n",
9d7ea9a2 1857 __func__, numa_stat_name(i), val);
3a321d2a
KW
1858 err = -EINVAL;
1859 }
1860 }
1861#endif
52b6f46b
HD
1862 if (err)
1863 return err;
1864 if (write)
1865 *ppos += *lenp;
1866 else
1867 *lenp = 0;
1868 return 0;
1869}
1870#endif /* CONFIG_PROC_FS */
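
As the comment in vmstat_refresh() explains, root can poke /proc/sys/vm/stat_refresh to fold the per-CPU diffs into the global counters just before sampling them. The sketch below is illustrative only; either a write (the value is ignored) or a read of the file triggers the refresh.

/* Illustrative: force a vmstat refresh by writing to stat_refresh (needs root). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/sys/vm/stat_refresh");
		return 1;
	}
	/* The written bytes are not interpreted; the write itself does the work. */
	if (write(fd, "1\n", 2) < 0)
		perror("write");
	close(fd);
	return 0;
}
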
1871
d1187ed2
CL
1872static void vmstat_update(struct work_struct *w)
1873{
0eb77e98 1874 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1875 /*
1876 * Counters were updated so we expect more updates
1877 * to occur in the future. Keep on running the
1878 * update worker thread.
1879 */
ce612879 1880 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1881 this_cpu_ptr(&vmstat_work),
1882 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1883 }
1884}
1885
7cc36bbd
CL
1891/*
1892 * Check if the diffs for a certain cpu indicate that
1893 * an update is needed.
1894 */
1895static bool need_update(int cpu)
1896{
1897 struct zone *zone;
1898
1899 for_each_populated_zone(zone) {
1900 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1901
1902 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
3a321d2a 1903#ifdef CONFIG_NUMA
1d90ca89 1904 BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
3a321d2a 1905#endif
63803222 1906
7cc36bbd
CL
1907 /*
1908 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1909 */
13c9aaf7
JH
1910 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
1911 sizeof(p->vm_stat_diff[0])))
7cc36bbd 1912 return true;
3a321d2a 1913#ifdef CONFIG_NUMA
13c9aaf7
JH
1914 if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
1915 sizeof(p->vm_numa_stat_diff[0])))
3a321d2a
KW
1916 return true;
1917#endif
7cc36bbd
CL
1918 }
1919 return false;
1920}
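
The "fast way" above relies on memchr_inv(), which returns NULL only when every byte of the diff array equals the given value, so a single scan answers "does this CPU have any pending diffs?". Below is a byte-wise user-space sketch of the same idea; the kernel helper is word-at-a-time optimised, and the function name and sample data here are made up for illustration.

/* Byte-wise sketch of a memchr_inv()-style "any byte != c?" check. */
#include <stddef.h>
#include <stdio.h>

static const void *memchr_inv_sketch(const void *s, int c, size_t n)
{
	const unsigned char *p = s;

	while (n--) {
		if (*p != (unsigned char)c)
			return p;	/* first byte that differs from c */
		p++;
	}
	return NULL;			/* all n bytes equal c */
}

int main(void)
{
	signed char vm_stat_diff[8] = { 0, 0, 0, -3, 0, 0, 0, 0 };

	if (memchr_inv_sketch(vm_stat_diff, 0, sizeof(vm_stat_diff)))
		puts("update needed");
	else
		puts("all diffs zero");
	return 0;
}
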
1921
7b8da4c7
CL
1922/*
1923 * Switch off vmstat processing and then fold all the remaining differentials
1924 * until the diffs stay at zero. The function is used by NOHZ and can only be
1925 * invoked when tick processing is not active.
1926 */
f01f17d3
MH
1927void quiet_vmstat(void)
1928{
1929 if (system_state != SYSTEM_RUNNING)
1930 return;
1931
7b8da4c7 1932 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1933 return;
1934
1935 if (!need_update(smp_processor_id()))
1936 return;
1937
1938 /*
 1939	 * Just refresh the counters and do not care about the pending delayed
 1940	 * vmstat_update. It does not fire often enough to matter, and cancelling
 1941	 * it would be too expensive from this path.
 1942	 * vmstat_shepherd will take care of that for us.
1943 */
1944 refresh_cpu_vm_stats(false);
1945}
1946
7cc36bbd
CL
1947/*
 1948	 * Shepherd worker that checks the differentials of
 1949	 * processors whose vmstat update workers have been
 1950	 * disabled because of inactivity, and restarts them
 1951	 * when an update turns out to be needed.
1952 */
1953static void vmstat_shepherd(struct work_struct *w);
1954
0eb77e98 1955static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
1956
1957static void vmstat_shepherd(struct work_struct *w)
1958{
1959 int cpu;
1960
1961 get_online_cpus();
1962 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 1963 for_each_online_cpu(cpu) {
f01f17d3 1964 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 1965
7b8da4c7 1966 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 1967 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
f01f17d3 1968 }
7cc36bbd
CL
1969 put_online_cpus();
1970
1971 schedule_delayed_work(&shepherd,
98f4ebb2 1972 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1973}
1974
7cc36bbd 1975static void __init start_shepherd_timer(void)
d1187ed2 1976{
7cc36bbd
CL
1977 int cpu;
1978
1979 for_each_possible_cpu(cpu)
ccde8bd4 1980 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
1981 vmstat_update);
1982
7cc36bbd
CL
1983 schedule_delayed_work(&shepherd,
1984 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1985}
1986
03e86dba
TC
1987static void __init init_cpu_node_state(void)
1988{
4c501327 1989 int node;
03e86dba 1990
4c501327
SAS
1991 for_each_online_node(node) {
1992 if (cpumask_weight(cpumask_of_node(node)) > 0)
1993 node_set_state(node, N_CPU);
1994 }
03e86dba
TC
1995}
1996
5438da97
SAS
1997static int vmstat_cpu_online(unsigned int cpu)
1998{
1999 refresh_zone_stat_thresholds();
2000 node_set_state(cpu_to_node(cpu), N_CPU);
2001 return 0;
2002}
2003
2004static int vmstat_cpu_down_prep(unsigned int cpu)
2005{
2006 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2007 return 0;
2008}
2009
2010static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2011{
4c501327 2012 const struct cpumask *node_cpus;
5438da97 2013 int node;
807a1bd2 2014
5438da97
SAS
2015 node = cpu_to_node(cpu);
2016
2017 refresh_zone_stat_thresholds();
4c501327
SAS
2018 node_cpus = cpumask_of_node(node);
2019 if (cpumask_weight(node_cpus) > 0)
5438da97 2020 return 0;
807a1bd2
TK
2021
2022 node_clear_state(node, N_CPU);
5438da97 2023 return 0;
807a1bd2
TK
2024}
2025
8f32f7e5 2026#endif
df9ecaba 2027
ce612879
MH
2028struct workqueue_struct *mm_percpu_wq;
2029
597b7305 2030void __init init_mm_internals(void)
df9ecaba 2031{
ce612879 2032 int ret __maybe_unused;
5438da97 2033
80d136e1 2034 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2035
2036#ifdef CONFIG_SMP
5438da97
SAS
2037 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2038 NULL, vmstat_cpu_dead);
2039 if (ret < 0)
2040 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2041
2042 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2043 vmstat_cpu_online,
2044 vmstat_cpu_down_prep);
2045 if (ret < 0)
2046 pr_err("vmstat: failed to register 'online' hotplug state\n");
2047
2048 get_online_cpus();
03e86dba 2049 init_cpu_node_state();
5438da97 2050 put_online_cpus();
d1187ed2 2051
7cc36bbd 2052 start_shepherd_timer();
8f32f7e5
AD
2053#endif
2054#ifdef CONFIG_PROC_FS
fddda2b7 2055 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2056 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2057 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2058 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2059#endif
df9ecaba 2060}
d7a5752c
MG
2061
2062#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2063
2064/*
2065 * Return an index indicating how much of the available free memory is
2066 * unusable for an allocation of the requested size.
2067 */
2068static int unusable_free_index(unsigned int order,
2069 struct contig_page_info *info)
2070{
2071 /* No free memory is interpreted as all free memory is unusable */
2072 if (info->free_pages == 0)
2073 return 1000;
2074
2075 /*
2076 * Index should be a value between 0 and 1. Return a value to 3
2077 * decimal places.
2078 *
2079 * 0 => no fragmentation
2080 * 1 => high fragmentation
2081 */
2082 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2083
2084}
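
A worked example with made-up numbers: for an order-2 request in a zone with free_pages = 1000 and free_blocks_suitable = 200, the usable pages are 200 << 2 = 800, so the index is (1000 - 800) * 1000 / 1000 = 200, printed as 0.200 (20% of the free memory is unusable at that order). A stand-alone sketch of the same arithmetic, with hypothetical helper names and inputs:

/* Stand-alone sketch of the unusable free space index arithmetic. */
#include <stdio.h>

static int unusable_index_sketch(unsigned int order,
				 unsigned long free_pages,
				 unsigned long free_blocks_suitable)
{
	if (free_pages == 0)
		return 1000;	/* no free memory: treat all of it as unusable */

	return (int)(((free_pages - (free_blocks_suitable << order)) * 1000ULL) /
		     free_pages);
}

int main(void)
{
	/* Hypothetical zone: 1000 free pages, 200 blocks big enough for order 2. */
	int index = unusable_index_sketch(2, 1000, 200);

	printf("%d.%03d\n", index / 1000, index % 1000);	/* prints 0.200 */
	return 0;
}
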
2085
2086static void unusable_show_print(struct seq_file *m,
2087 pg_data_t *pgdat, struct zone *zone)
2088{
2089 unsigned int order;
2090 int index;
2091 struct contig_page_info info;
2092
2093 seq_printf(m, "Node %d, zone %8s ",
2094 pgdat->node_id,
2095 zone->name);
2096 for (order = 0; order < MAX_ORDER; ++order) {
2097 fill_contig_page_info(zone, order, &info);
2098 index = unusable_free_index(order, &info);
2099 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2100 }
2101
2102 seq_putc(m, '\n');
2103}
2104
2105/*
2106 * Display unusable free space index
2107 *
2108 * The unusable free space index measures how much of the available free
2109 * memory cannot be used to satisfy an allocation of a given size and is a
2110 * value between 0 and 1. The higher the value, the more of free memory is
 2111 * value between 0 and 1. The higher the value, the more of the free memory
 2112 * is unusable and, by implication, the worse the external fragmentation. This
2113 */
2114static int unusable_show(struct seq_file *m, void *arg)
2115{
2116 pg_data_t *pgdat = (pg_data_t *)arg;
2117
2118 /* check memoryless node */
a47b53c5 2119 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2120 return 0;
2121
727c080f 2122 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2123
2124 return 0;
2125}
2126
01a99560 2127static const struct seq_operations unusable_sops = {
d7a5752c
MG
2128 .start = frag_start,
2129 .next = frag_next,
2130 .stop = frag_stop,
2131 .show = unusable_show,
2132};
2133
01a99560 2134DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2135
f1a5ab12
MG
2136static void extfrag_show_print(struct seq_file *m,
2137 pg_data_t *pgdat, struct zone *zone)
2138{
2139 unsigned int order;
2140 int index;
2141
2142 /* Alloc on stack as interrupts are disabled for zone walk */
2143 struct contig_page_info info;
2144
2145 seq_printf(m, "Node %d, zone %8s ",
2146 pgdat->node_id,
2147 zone->name);
2148 for (order = 0; order < MAX_ORDER; ++order) {
2149 fill_contig_page_info(zone, order, &info);
56de7263 2150 index = __fragmentation_index(order, &info);
f1a5ab12
MG
2151 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2152 }
2153
2154 seq_putc(m, '\n');
2155}
2156
2157/*
2158 * Display fragmentation index for orders that allocations would fail for
2159 */
2160static int extfrag_show(struct seq_file *m, void *arg)
2161{
2162 pg_data_t *pgdat = (pg_data_t *)arg;
2163
727c080f 2164 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2165
2166 return 0;
2167}
2168
01a99560 2169static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2170 .start = frag_start,
2171 .next = frag_next,
2172 .stop = frag_stop,
2173 .show = extfrag_show,
2174};
2175
01a99560 2176DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2177
d7a5752c
MG
2178static int __init extfrag_debug_init(void)
2179{
bde8bd8a
S
2180 struct dentry *extfrag_debug_root;
2181
d7a5752c 2182 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2183
d9f7979c 2184 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2185 &unusable_fops);
d7a5752c 2186
d9f7979c 2187 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2188 &extfrag_fops);
f1a5ab12 2189
d7a5752c
MG
2190 return 0;
2191}
2192
2193module_init(extfrag_debug_init);
2194#endif
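
Assuming debugfs is mounted at the conventional /sys/kernel/debug (a mount-point assumption, not something this file controls), the two files registered above can be dumped from user space. The helper below is hypothetical and only reads the files verbatim.

/* Dump the extfrag debugfs files; assumes debugfs at /sys/kernel/debug. */
#include <stdio.h>

static void dump_file(const char *path)
{
	char line[256];
	FILE *fp = fopen(path, "r");

	if (!fp) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);
	fclose(fp);
}

int main(void)
{
	dump_file("/sys/kernel/debug/extfrag/unusable_index");
	dump_file("/sys/kernel/debug/extfrag/extfrag_index");
	return 0;
}
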