]> Git Repo - linux.git/blame - mm/vmstat.c
Merge tag 'firewire-updates-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux.git] / mm / vmstat.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
f6ac2354
CL
2/*
3 * linux/mm/vmstat.c
4 *
5 * Manages VM statistics
6 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
2244b95a
CL
7 *
8 * zoned VM statistics
9 * Copyright (C) 2006 Silicon Graphics, Inc.,
10 * Christoph Lameter <[email protected]>
7cc36bbd 11 * Copyright (C) 2008-2014 Christoph Lameter
f6ac2354 12 */
8f32f7e5 13#include <linux/fs.h>
f6ac2354 14#include <linux/mm.h>
4e950f6f 15#include <linux/err.h>
2244b95a 16#include <linux/module.h>
5a0e3ad6 17#include <linux/slab.h>
df9ecaba 18#include <linux/cpu.h>
7cc36bbd 19#include <linux/cpumask.h>
c748e134 20#include <linux/vmstat.h>
3c486871
AM
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/debugfs.h>
e8edc6e0 24#include <linux/sched.h>
f1a5ab12 25#include <linux/math64.h>
79da826a 26#include <linux/writeback.h>
36deb0be 27#include <linux/compaction.h>
6e543d57 28#include <linux/mm_inline.h>
48c96a36 29#include <linux/page_owner.h>
be5e015d 30#include <linux/sched/isolation.h>
6e543d57
LD
31
32#include "internal.h"
f6ac2354 33
4518085e
KW
34#ifdef CONFIG_NUMA
35int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
36
37/* zero numa counters within a zone */
38static void zero_zone_numa_counters(struct zone *zone)
39{
40 int item, cpu;
41
f19298b9
MG
42 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
43 atomic_long_set(&zone->vm_numa_event[item], 0);
44 for_each_online_cpu(cpu) {
45 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
4518085e 46 = 0;
f19298b9 47 }
4518085e
KW
48 }
49}
50
51/* zero numa counters of all the populated zones */
52static void zero_zones_numa_counters(void)
53{
54 struct zone *zone;
55
56 for_each_populated_zone(zone)
57 zero_zone_numa_counters(zone);
58}
59
60/* zero global numa counters */
61static void zero_global_numa_counters(void)
62{
63 int item;
64
f19298b9
MG
65 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
66 atomic_long_set(&vm_numa_event[item], 0);
4518085e
KW
67}
68
69static void invalid_numa_statistics(void)
70{
71 zero_zones_numa_counters();
72 zero_global_numa_counters();
73}
74
75static DEFINE_MUTEX(vm_numa_stat_lock);
76
78eb4ea2 77int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
32927393 78 void *buffer, size_t *length, loff_t *ppos)
4518085e
KW
79{
80 int ret, oldval;
81
82 mutex_lock(&vm_numa_stat_lock);
83 if (write)
84 oldval = sysctl_vm_numa_stat;
85 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
86 if (ret || !write)
87 goto out;
88
89 if (oldval == sysctl_vm_numa_stat)
90 goto out;
91 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
92 static_branch_enable(&vm_numa_stat_key);
93 pr_info("enable numa statistics\n");
94 } else {
95 static_branch_disable(&vm_numa_stat_key);
96 invalid_numa_statistics();
97 pr_info("disable numa statistics, and clear numa counters\n");
98 }
99
100out:
101 mutex_unlock(&vm_numa_stat_lock);
102 return ret;
103}
104#endif
105
f8891e5e
CL
106#ifdef CONFIG_VM_EVENT_COUNTERS
107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
108EXPORT_PER_CPU_SYMBOL(vm_event_states);
109
31f961a8 110static void sum_vm_events(unsigned long *ret)
f8891e5e 111{
9eccf2a8 112 int cpu;
f8891e5e
CL
113 int i;
114
115 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
116
31f961a8 117 for_each_online_cpu(cpu) {
f8891e5e
CL
118 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
119
f8891e5e
CL
120 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
121 ret[i] += this->event[i];
122 }
123}
124
125/*
126 * Accumulate the vm event counters across all CPUs.
127 * The result is unavoidably approximate - it can change
128 * during and after execution of this function.
129*/
130void all_vm_events(unsigned long *ret)
131{
7625eccd 132 cpus_read_lock();
31f961a8 133 sum_vm_events(ret);
7625eccd 134 cpus_read_unlock();
f8891e5e 135}
32dd66fc 136EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 137
f8891e5e
CL
138/*
139 * Fold the foreign cpu events into our own.
140 *
141 * This is adding to the events on one processor
142 * but keeps the global counts constant.
143 */
144void vm_events_fold_cpu(int cpu)
145{
146 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
147 int i;
148
149 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
150 count_vm_events(i, fold_state->event[i]);
151 fold_state->event[i] = 0;
152 }
153}
f8891e5e
CL
154
155#endif /* CONFIG_VM_EVENT_COUNTERS */
156
2244b95a
CL
157/*
158 * Manage combined zone based / global counters
159 *
160 * vm_stat contains the global counters
161 */
75ef7184
MG
162atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
163atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
f19298b9 164atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
75ef7184
MG
165EXPORT_SYMBOL(vm_zone_stat);
166EXPORT_SYMBOL(vm_node_stat);
2244b95a 167
ebeac3ea
GU
168#ifdef CONFIG_NUMA
169static void fold_vm_zone_numa_events(struct zone *zone)
170{
171 unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
172 int cpu;
173 enum numa_stat_item item;
174
175 for_each_online_cpu(cpu) {
176 struct per_cpu_zonestat *pzstats;
177
178 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
179 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
180 zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
181 }
182
183 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
184 zone_numa_event_add(zone_numa_events[item], zone, item);
185}
186
187void fold_vm_numa_events(void)
188{
189 struct zone *zone;
190
191 for_each_populated_zone(zone)
192 fold_vm_zone_numa_events(zone);
193}
194#endif
195
2244b95a
CL
196#ifdef CONFIG_SMP
197
b44129b3 198int calculate_pressure_threshold(struct zone *zone)
88f5acf8
MG
199{
200 int threshold;
201 int watermark_distance;
202
203 /*
204 * As vmstats are not up to date, there is drift between the estimated
205 * and real values. For high thresholds and a high number of CPUs, it
206 * is possible for the min watermark to be breached while the estimated
207 * value looks fine. The pressure threshold is a reduced value such
208 * that even the maximum amount of drift will not accidentally breach
209 * the min watermark
210 */
211 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
212 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
213
214 /*
215 * Maximum threshold is 125
216 */
217 threshold = min(125, threshold);
218
219 return threshold;
220}
221
b44129b3 222int calculate_normal_threshold(struct zone *zone)
df9ecaba
CL
223{
224 int threshold;
225 int mem; /* memory in 128 MB units */
226
227 /*
228 * The threshold scales with the number of processors and the amount
229 * of memory per zone. More memory means that we can defer updates for
230 * longer, more processors could lead to more contention.
231 * fls() is used to have a cheap way of logarithmic scaling.
232 *
233 * Some sample thresholds:
234 *
ea15ba17 235 * Threshold Processors (fls) Zonesize fls(mem)+1
df9ecaba
CL
236 * ------------------------------------------------------------------
237 * 8 1 1 0.9-1 GB 4
238 * 16 2 2 0.9-1 GB 4
239 * 20 2 2 1-2 GB 5
240 * 24 2 2 2-4 GB 6
241 * 28 2 2 4-8 GB 7
242 * 32 2 2 8-16 GB 8
243 * 4 2 2 <128M 1
244 * 30 4 3 2-4 GB 5
245 * 48 4 3 8-16 GB 8
246 * 32 8 4 1-2 GB 4
247 * 32 8 4 0.9-1GB 4
248 * 10 16 5 <128M 1
249 * 40 16 5 900M 4
250 * 70 64 7 2-4 GB 5
251 * 84 64 7 4-8 GB 6
252 * 108 512 9 4-8 GB 6
253 * 125 1024 10 8-16 GB 8
254 * 125 1024 10 16-32 GB 9
255 */
256
9705bea5 257 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
df9ecaba
CL
258
259 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
260
261 /*
262 * Maximum threshold is 125
263 */
264 threshold = min(125, threshold);
265
266 return threshold;
267}
2244b95a
CL
268
269/*
df9ecaba 270 * Refresh the thresholds for each zone.
2244b95a 271 */
a6cccdc3 272void refresh_zone_stat_thresholds(void)
2244b95a 273{
75ef7184 274 struct pglist_data *pgdat;
df9ecaba
CL
275 struct zone *zone;
276 int cpu;
277 int threshold;
278
75ef7184
MG
279 /* Zero current pgdat thresholds */
280 for_each_online_pgdat(pgdat) {
281 for_each_online_cpu(cpu) {
282 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
283 }
284 }
285
ee99c71c 286 for_each_populated_zone(zone) {
75ef7184 287 struct pglist_data *pgdat = zone->zone_pgdat;
aa454840
CL
288 unsigned long max_drift, tolerate_drift;
289
b44129b3 290 threshold = calculate_normal_threshold(zone);
df9ecaba 291
75ef7184
MG
292 for_each_online_cpu(cpu) {
293 int pgdat_threshold;
294
28f836b6 295 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
99dcc3e5 296 = threshold;
1d90ca89 297
75ef7184
MG
298 /* Base nodestat threshold on the largest populated zone. */
299 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
300 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
301 = max(threshold, pgdat_threshold);
302 }
303
aa454840
CL
304 /*
305 * Only set percpu_drift_mark if there is a danger that
306 * NR_FREE_PAGES reports the low watermark is ok when in fact
307 * the min watermark could be breached by an allocation
308 */
309 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
310 max_drift = num_online_cpus() * threshold;
311 if (max_drift > tolerate_drift)
312 zone->percpu_drift_mark = high_wmark_pages(zone) +
313 max_drift;
df9ecaba 314 }
2244b95a
CL
315}
316
b44129b3
MG
317void set_pgdat_percpu_threshold(pg_data_t *pgdat,
318 int (*calculate_pressure)(struct zone *))
88f5acf8
MG
319{
320 struct zone *zone;
321 int cpu;
322 int threshold;
323 int i;
324
88f5acf8
MG
325 for (i = 0; i < pgdat->nr_zones; i++) {
326 zone = &pgdat->node_zones[i];
327 if (!zone->percpu_drift_mark)
328 continue;
329
b44129b3 330 threshold = (*calculate_pressure)(zone);
1d90ca89 331 for_each_online_cpu(cpu)
28f836b6 332 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
88f5acf8
MG
333 = threshold;
334 }
88f5acf8
MG
335}
336
2244b95a 337/*
bea04b07
JZ
338 * For use when we know that interrupts are disabled,
339 * or when we know that preemption is disabled and that
340 * particular counter cannot be updated from interrupt context.
2244b95a
CL
341 */
342void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 343 long delta)
2244b95a 344{
28f836b6 345 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92 346 s8 __percpu *p = pcp->vm_stat_diff + item;
2244b95a 347 long x;
12938a92
CL
348 long t;
349
c68ed794
IM
350 /*
351 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
352 * atomicity is provided by IRQs being disabled -- either explicitly
353 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
354 * CPU migrations and preemption potentially corrupts a counter so
355 * disable preemption.
356 */
7a025e91 357 preempt_disable_nested();
c68ed794 358
12938a92 359 x = delta + __this_cpu_read(*p);
2244b95a 360
12938a92 361 t = __this_cpu_read(pcp->stat_threshold);
2244b95a 362
40610076 363 if (unlikely(abs(x) > t)) {
2244b95a
CL
364 zone_page_state_add(x, zone, item);
365 x = 0;
366 }
12938a92 367 __this_cpu_write(*p, x);
c68ed794 368
7a025e91 369 preempt_enable_nested();
2244b95a
CL
370}
371EXPORT_SYMBOL(__mod_zone_page_state);
372
75ef7184
MG
373void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
374 long delta)
375{
376 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
377 s8 __percpu *p = pcp->vm_node_stat_diff + item;
378 long x;
379 long t;
380
ea426c2a 381 if (vmstat_item_in_bytes(item)) {
629484ae
JW
382 /*
383 * Only cgroups use subpage accounting right now; at
384 * the global level, these items still change in
385 * multiples of whole pages. Store them as pages
386 * internally to keep the per-cpu counters compact.
387 */
ea426c2a
RG
388 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
389 delta >>= PAGE_SHIFT;
390 }
391
c68ed794 392 /* See __mod_node_page_state */
7a025e91 393 preempt_disable_nested();
c68ed794 394
75ef7184
MG
395 x = delta + __this_cpu_read(*p);
396
397 t = __this_cpu_read(pcp->stat_threshold);
398
40610076 399 if (unlikely(abs(x) > t)) {
75ef7184
MG
400 node_page_state_add(x, pgdat, item);
401 x = 0;
402 }
403 __this_cpu_write(*p, x);
c68ed794 404
7a025e91 405 preempt_enable_nested();
75ef7184
MG
406}
407EXPORT_SYMBOL(__mod_node_page_state);
408
2244b95a
CL
409/*
410 * Optimized increment and decrement functions.
411 *
412 * These are only for a single page and therefore can take a struct page *
413 * argument instead of struct zone *. This allows the inclusion of the code
414 * generated for page_zone(page) into the optimized functions.
415 *
416 * No overflow check is necessary and therefore the differential can be
417 * incremented or decremented in place which may allow the compilers to
418 * generate better code.
2244b95a
CL
419 * The increment or decrement is known and therefore one boundary check can
420 * be omitted.
421 *
df9ecaba
CL
422 * NOTE: These functions are very performance sensitive. Change only
423 * with care.
424 *
2244b95a
CL
425 * Some processors have inc/dec instructions that are atomic vs an interrupt.
426 * However, the code must first determine the differential location in a zone
427 * based on the processor number and then inc/dec the counter. There is no
428 * guarantee without disabling preemption that the processor will not change
429 * in between and therefore the atomicity vs. interrupt cannot be exploited
430 * in a useful way here.
431 */
c8785385 432void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 433{
28f836b6 434 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
435 s8 __percpu *p = pcp->vm_stat_diff + item;
436 s8 v, t;
2244b95a 437
c68ed794 438 /* See __mod_node_page_state */
7a025e91 439 preempt_disable_nested();
c68ed794 440
908ee0f1 441 v = __this_cpu_inc_return(*p);
12938a92
CL
442 t = __this_cpu_read(pcp->stat_threshold);
443 if (unlikely(v > t)) {
444 s8 overstep = t >> 1;
df9ecaba 445
12938a92
CL
446 zone_page_state_add(v + overstep, zone, item);
447 __this_cpu_write(*p, -overstep);
2244b95a 448 }
c68ed794 449
7a025e91 450 preempt_enable_nested();
2244b95a 451}
ca889e6c 452
75ef7184
MG
453void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
454{
455 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
456 s8 __percpu *p = pcp->vm_node_stat_diff + item;
457 s8 v, t;
458
ea426c2a
RG
459 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
460
c68ed794 461 /* See __mod_node_page_state */
7a025e91 462 preempt_disable_nested();
c68ed794 463
75ef7184
MG
464 v = __this_cpu_inc_return(*p);
465 t = __this_cpu_read(pcp->stat_threshold);
466 if (unlikely(v > t)) {
467 s8 overstep = t >> 1;
468
469 node_page_state_add(v + overstep, pgdat, item);
470 __this_cpu_write(*p, -overstep);
471 }
c68ed794 472
7a025e91 473 preempt_enable_nested();
75ef7184
MG
474}
475
ca889e6c
CL
476void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
477{
478 __inc_zone_state(page_zone(page), item);
479}
2244b95a
CL
480EXPORT_SYMBOL(__inc_zone_page_state);
481
75ef7184
MG
482void __inc_node_page_state(struct page *page, enum node_stat_item item)
483{
484 __inc_node_state(page_pgdat(page), item);
485}
486EXPORT_SYMBOL(__inc_node_page_state);
487
c8785385 488void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 489{
28f836b6 490 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
491 s8 __percpu *p = pcp->vm_stat_diff + item;
492 s8 v, t;
2244b95a 493
c68ed794 494 /* See __mod_node_page_state */
7a025e91 495 preempt_disable_nested();
c68ed794 496
908ee0f1 497 v = __this_cpu_dec_return(*p);
12938a92
CL
498 t = __this_cpu_read(pcp->stat_threshold);
499 if (unlikely(v < - t)) {
500 s8 overstep = t >> 1;
2244b95a 501
12938a92
CL
502 zone_page_state_add(v - overstep, zone, item);
503 __this_cpu_write(*p, overstep);
2244b95a 504 }
c68ed794 505
7a025e91 506 preempt_enable_nested();
2244b95a 507}
c8785385 508
75ef7184
MG
509void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
510{
511 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
512 s8 __percpu *p = pcp->vm_node_stat_diff + item;
513 s8 v, t;
514
ea426c2a
RG
515 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
516
c68ed794 517 /* See __mod_node_page_state */
7a025e91 518 preempt_disable_nested();
c68ed794 519
75ef7184
MG
520 v = __this_cpu_dec_return(*p);
521 t = __this_cpu_read(pcp->stat_threshold);
522 if (unlikely(v < - t)) {
523 s8 overstep = t >> 1;
524
525 node_page_state_add(v - overstep, pgdat, item);
526 __this_cpu_write(*p, overstep);
527 }
c68ed794 528
7a025e91 529 preempt_enable_nested();
75ef7184
MG
530}
531
c8785385
CL
532void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
533{
534 __dec_zone_state(page_zone(page), item);
535}
2244b95a
CL
536EXPORT_SYMBOL(__dec_zone_page_state);
537
75ef7184
MG
538void __dec_node_page_state(struct page *page, enum node_stat_item item)
539{
540 __dec_node_state(page_pgdat(page), item);
541}
542EXPORT_SYMBOL(__dec_node_page_state);
543
4156153c 544#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
7c839120
CL
545/*
546 * If we have cmpxchg_local support then we do not need to incur the overhead
547 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
548 *
549 * mod_state() modifies the zone counter state through atomic per cpu
550 * operations.
551 *
552 * Overstep mode specifies how overstep should handled:
553 * 0 No overstepping
554 * 1 Overstepping half of threshold
555 * -1 Overstepping minus half of threshold
556*/
75ef7184
MG
557static inline void mod_zone_state(struct zone *zone,
558 enum zone_stat_item item, long delta, int overstep_mode)
7c839120 559{
28f836b6 560 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
7c839120 561 s8 __percpu *p = pcp->vm_stat_diff + item;
77cd8148
UB
562 long n, t, z;
563 s8 o;
7c839120 564
77cd8148 565 o = this_cpu_read(*p);
7c839120
CL
566 do {
567 z = 0; /* overflow to zone counters */
568
569 /*
570 * The fetching of the stat_threshold is racy. We may apply
571 * a counter threshold to the wrong the cpu if we get
d3bc2367
CL
572 * rescheduled while executing here. However, the next
573 * counter update will apply the threshold again and
574 * therefore bring the counter under the threshold again.
575 *
576 * Most of the time the thresholds are the same anyways
577 * for all cpus in a zone.
7c839120
CL
578 */
579 t = this_cpu_read(pcp->stat_threshold);
580
77cd8148 581 n = delta + (long)o;
7c839120 582
40610076 583 if (abs(n) > t) {
7c839120
CL
584 int os = overstep_mode * (t >> 1) ;
585
586 /* Overflow must be added to zone counters */
587 z = n + os;
588 n = -os;
589 }
77cd8148 590 } while (!this_cpu_try_cmpxchg(*p, &o, n));
7c839120
CL
591
592 if (z)
593 zone_page_state_add(z, zone, item);
594}
595
596void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 597 long delta)
7c839120 598{
75ef7184 599 mod_zone_state(zone, item, delta, 0);
7c839120
CL
600}
601EXPORT_SYMBOL(mod_zone_page_state);
602
7c839120
CL
603void inc_zone_page_state(struct page *page, enum zone_stat_item item)
604{
75ef7184 605 mod_zone_state(page_zone(page), item, 1, 1);
7c839120
CL
606}
607EXPORT_SYMBOL(inc_zone_page_state);
608
609void dec_zone_page_state(struct page *page, enum zone_stat_item item)
610{
75ef7184 611 mod_zone_state(page_zone(page), item, -1, -1);
7c839120
CL
612}
613EXPORT_SYMBOL(dec_zone_page_state);
75ef7184
MG
614
615static inline void mod_node_state(struct pglist_data *pgdat,
616 enum node_stat_item item, int delta, int overstep_mode)
617{
618 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
619 s8 __percpu *p = pcp->vm_node_stat_diff + item;
77cd8148
UB
620 long n, t, z;
621 s8 o;
75ef7184 622
ea426c2a 623 if (vmstat_item_in_bytes(item)) {
629484ae
JW
624 /*
625 * Only cgroups use subpage accounting right now; at
626 * the global level, these items still change in
627 * multiples of whole pages. Store them as pages
628 * internally to keep the per-cpu counters compact.
629 */
ea426c2a
RG
630 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
631 delta >>= PAGE_SHIFT;
632 }
633
77cd8148 634 o = this_cpu_read(*p);
75ef7184
MG
635 do {
636 z = 0; /* overflow to node counters */
637
638 /*
639 * The fetching of the stat_threshold is racy. We may apply
640 * a counter threshold to the wrong the cpu if we get
641 * rescheduled while executing here. However, the next
642 * counter update will apply the threshold again and
643 * therefore bring the counter under the threshold again.
644 *
645 * Most of the time the thresholds are the same anyways
646 * for all cpus in a node.
647 */
648 t = this_cpu_read(pcp->stat_threshold);
649
77cd8148 650 n = delta + (long)o;
75ef7184 651
40610076 652 if (abs(n) > t) {
75ef7184
MG
653 int os = overstep_mode * (t >> 1) ;
654
655 /* Overflow must be added to node counters */
656 z = n + os;
657 n = -os;
658 }
77cd8148 659 } while (!this_cpu_try_cmpxchg(*p, &o, n));
75ef7184
MG
660
661 if (z)
662 node_page_state_add(z, pgdat, item);
663}
664
665void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
666 long delta)
667{
668 mod_node_state(pgdat, item, delta, 0);
669}
670EXPORT_SYMBOL(mod_node_page_state);
671
672void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
673{
674 mod_node_state(pgdat, item, 1, 1);
675}
676
677void inc_node_page_state(struct page *page, enum node_stat_item item)
678{
679 mod_node_state(page_pgdat(page), item, 1, 1);
680}
681EXPORT_SYMBOL(inc_node_page_state);
682
683void dec_node_page_state(struct page *page, enum node_stat_item item)
684{
685 mod_node_state(page_pgdat(page), item, -1, -1);
686}
687EXPORT_SYMBOL(dec_node_page_state);
7c839120
CL
688#else
689/*
690 * Use interrupt disable to serialize counter updates
691 */
692void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 693 long delta)
7c839120
CL
694{
695 unsigned long flags;
696
697 local_irq_save(flags);
698 __mod_zone_page_state(zone, item, delta);
699 local_irq_restore(flags);
700}
701EXPORT_SYMBOL(mod_zone_page_state);
702
2244b95a
CL
703void inc_zone_page_state(struct page *page, enum zone_stat_item item)
704{
705 unsigned long flags;
706 struct zone *zone;
2244b95a
CL
707
708 zone = page_zone(page);
709 local_irq_save(flags);
ca889e6c 710 __inc_zone_state(zone, item);
2244b95a
CL
711 local_irq_restore(flags);
712}
713EXPORT_SYMBOL(inc_zone_page_state);
714
715void dec_zone_page_state(struct page *page, enum zone_stat_item item)
716{
717 unsigned long flags;
2244b95a 718
2244b95a 719 local_irq_save(flags);
a302eb4e 720 __dec_zone_page_state(page, item);
2244b95a
CL
721 local_irq_restore(flags);
722}
723EXPORT_SYMBOL(dec_zone_page_state);
724
75ef7184
MG
725void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
726{
727 unsigned long flags;
728
729 local_irq_save(flags);
730 __inc_node_state(pgdat, item);
731 local_irq_restore(flags);
732}
733EXPORT_SYMBOL(inc_node_state);
734
735void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
736 long delta)
737{
738 unsigned long flags;
739
740 local_irq_save(flags);
741 __mod_node_page_state(pgdat, item, delta);
742 local_irq_restore(flags);
743}
744EXPORT_SYMBOL(mod_node_page_state);
745
746void inc_node_page_state(struct page *page, enum node_stat_item item)
747{
748 unsigned long flags;
749 struct pglist_data *pgdat;
750
751 pgdat = page_pgdat(page);
752 local_irq_save(flags);
753 __inc_node_state(pgdat, item);
754 local_irq_restore(flags);
755}
756EXPORT_SYMBOL(inc_node_page_state);
757
758void dec_node_page_state(struct page *page, enum node_stat_item item)
759{
760 unsigned long flags;
761
762 local_irq_save(flags);
763 __dec_node_page_state(page, item);
764 local_irq_restore(flags);
765}
766EXPORT_SYMBOL(dec_node_page_state);
767#endif
7cc36bbd
CL
768
769/*
770 * Fold a differential into the global counters.
771 * Returns the number of counters updated.
772 */
f19298b9 773static int fold_diff(int *zone_diff, int *node_diff)
3a321d2a
KW
774{
775 int i;
776 int changes = 0;
777
778 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
779 if (zone_diff[i]) {
780 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
781 changes++;
782 }
783
3a321d2a
KW
784 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
785 if (node_diff[i]) {
786 atomic_long_add(node_diff[i], &vm_node_stat[i]);
787 changes++;
788 }
789 return changes;
790}
f19298b9 791
2244b95a 792/*
2bb921e5 793 * Update the zone counters for the current cpu.
a7f75e25 794 *
4037d452
CL
795 * Note that refresh_cpu_vm_stats strives to only access
796 * node local memory. The per cpu pagesets on remote zones are placed
797 * in the memory local to the processor using that pageset. So the
798 * loop over all zones will access a series of cachelines local to
799 * the processor.
800 *
801 * The call to zone_page_state_add updates the cachelines with the
802 * statistics in the remote zone struct as well as the global cachelines
803 * with the global counters. These could cause remote node cache line
804 * bouncing and will have to be only done when necessary.
7cc36bbd
CL
805 *
806 * The function returns the number of global counters updated.
2244b95a 807 */
0eb77e98 808static int refresh_cpu_vm_stats(bool do_pagesets)
2244b95a 809{
75ef7184 810 struct pglist_data *pgdat;
2244b95a
CL
811 struct zone *zone;
812 int i;
75ef7184
MG
813 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
814 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
7cc36bbd 815 int changes = 0;
2244b95a 816
ee99c71c 817 for_each_populated_zone(zone) {
28f836b6 818 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
28f836b6 819 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
2244b95a 820
fbc2edb0
CL
821 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
822 int v;
2244b95a 823
28f836b6 824 v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
fbc2edb0 825 if (v) {
a7f75e25 826
a7f75e25 827 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 828 global_zone_diff[i] += v;
4037d452
CL
829#ifdef CONFIG_NUMA
830 /* 3 seconds idle till flush */
28f836b6 831 __this_cpu_write(pcp->expire, 3);
4037d452 832#endif
2244b95a 833 }
fbc2edb0 834 }
3a321d2a 835
0eb77e98
CL
836 if (do_pagesets) {
837 cond_resched();
51a755c5
YH
838
839 changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
840#ifdef CONFIG_NUMA
0eb77e98
CL
841 /*
842 * Deal with draining the remote pageset of this
843 * processor
844 *
845 * Check if there are pages remaining in this pageset
846 * if not then there is nothing to expire.
847 */
28f836b6
MG
848 if (!__this_cpu_read(pcp->expire) ||
849 !__this_cpu_read(pcp->count))
0eb77e98 850 continue;
4037d452 851
0eb77e98
CL
852 /*
853 * We never drain zones local to this processor.
854 */
855 if (zone_to_nid(zone) == numa_node_id()) {
28f836b6 856 __this_cpu_write(pcp->expire, 0);
0eb77e98
CL
857 continue;
858 }
4037d452 859
fa8c4f9a
YH
860 if (__this_cpu_dec_return(pcp->expire)) {
861 changes++;
0eb77e98 862 continue;
fa8c4f9a 863 }
4037d452 864
28f836b6
MG
865 if (__this_cpu_read(pcp->count)) {
866 drain_zone_pages(zone, this_cpu_ptr(pcp));
0eb77e98
CL
867 changes++;
868 }
4037d452 869#endif
51a755c5 870 }
2244b95a 871 }
75ef7184
MG
872
873 for_each_online_pgdat(pgdat) {
874 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
875
876 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
877 int v;
878
879 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
880 if (v) {
881 atomic_long_add(v, &pgdat->vm_stat[i]);
882 global_node_diff[i] += v;
883 }
884 }
885 }
886
887 changes += fold_diff(global_zone_diff, global_node_diff);
7cc36bbd 888 return changes;
2244b95a
CL
889}
890
2bb921e5
CL
891/*
892 * Fold the data for an offline cpu into the global array.
893 * There cannot be any access by the offline cpu and therefore
894 * synchronization is simplified.
895 */
896void cpu_vm_stats_fold(int cpu)
897{
75ef7184 898 struct pglist_data *pgdat;
2bb921e5
CL
899 struct zone *zone;
900 int i;
75ef7184
MG
901 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
902 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
2bb921e5
CL
903
904 for_each_populated_zone(zone) {
28f836b6 905 struct per_cpu_zonestat *pzstats;
2bb921e5 906
28f836b6 907 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bb921e5 908
f19298b9 909 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 910 if (pzstats->vm_stat_diff[i]) {
2bb921e5
CL
911 int v;
912
28f836b6
MG
913 v = pzstats->vm_stat_diff[i];
914 pzstats->vm_stat_diff[i] = 0;
2bb921e5 915 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 916 global_zone_diff[i] += v;
2bb921e5 917 }
f19298b9 918 }
3a321d2a 919#ifdef CONFIG_NUMA
f19298b9
MG
920 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
921 if (pzstats->vm_numa_event[i]) {
922 unsigned long v;
3a321d2a 923
f19298b9
MG
924 v = pzstats->vm_numa_event[i];
925 pzstats->vm_numa_event[i] = 0;
926 zone_numa_event_add(v, zone, i);
3a321d2a 927 }
f19298b9 928 }
3a321d2a 929#endif
2bb921e5
CL
930 }
931
75ef7184
MG
932 for_each_online_pgdat(pgdat) {
933 struct per_cpu_nodestat *p;
934
935 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
936
937 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
938 if (p->vm_node_stat_diff[i]) {
939 int v;
940
941 v = p->vm_node_stat_diff[i];
942 p->vm_node_stat_diff[i] = 0;
943 atomic_long_add(v, &pgdat->vm_stat[i]);
944 global_node_diff[i] += v;
945 }
946 }
947
948 fold_diff(global_zone_diff, global_node_diff);
2bb921e5
CL
949}
950
40f4b1ea
CS
951/*
952 * this is only called if !populated_zone(zone), which implies no other users of
f0953a1b 953 * pset->vm_stat_diff[] exist.
40f4b1ea 954 */
28f836b6 955void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
5a883813 956{
f19298b9 957 unsigned long v;
5a883813
MK
958 int i;
959
f19298b9 960 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 961 if (pzstats->vm_stat_diff[i]) {
f19298b9 962 v = pzstats->vm_stat_diff[i];
28f836b6 963 pzstats->vm_stat_diff[i] = 0;
f19298b9 964 zone_page_state_add(v, zone, i);
5a883813 965 }
f19298b9 966 }
3a321d2a
KW
967
968#ifdef CONFIG_NUMA
f19298b9
MG
969 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
970 if (pzstats->vm_numa_event[i]) {
971 v = pzstats->vm_numa_event[i];
972 pzstats->vm_numa_event[i] = 0;
973 zone_numa_event_add(v, zone, i);
3a321d2a 974 }
f19298b9 975 }
3a321d2a 976#endif
5a883813 977}
2244b95a
CL
978#endif
979
ca889e6c 980#ifdef CONFIG_NUMA
c2d42c16 981/*
75ef7184
MG
982 * Determine the per node value of a stat item. This function
983 * is called frequently in a NUMA machine, so try to be as
984 * frugal as possible.
c2d42c16 985 */
75ef7184
MG
986unsigned long sum_zone_node_page_state(int node,
987 enum zone_stat_item item)
c2d42c16
AM
988{
989 struct zone *zones = NODE_DATA(node)->node_zones;
e87d59f7
JK
990 int i;
991 unsigned long count = 0;
c2d42c16 992
e87d59f7
JK
993 for (i = 0; i < MAX_NR_ZONES; i++)
994 count += zone_page_state(zones + i, item);
995
996 return count;
c2d42c16
AM
997}
998
f19298b9
MG
999/* Determine the per node value of a numa stat item. */
1000unsigned long sum_zone_numa_event_state(int node,
3a321d2a
KW
1001 enum numa_stat_item item)
1002{
1003 struct zone *zones = NODE_DATA(node)->node_zones;
3a321d2a 1004 unsigned long count = 0;
f19298b9 1005 int i;
3a321d2a
KW
1006
1007 for (i = 0; i < MAX_NR_ZONES; i++)
f19298b9 1008 count += zone_numa_event_state(zones + i, item);
3a321d2a
KW
1009
1010 return count;
1011}
1012
75ef7184
MG
1013/*
1014 * Determine the per node value of a stat item.
1015 */
ea426c2a
RG
1016unsigned long node_page_state_pages(struct pglist_data *pgdat,
1017 enum node_stat_item item)
75ef7184
MG
1018{
1019 long x = atomic_long_read(&pgdat->vm_stat[item]);
1020#ifdef CONFIG_SMP
1021 if (x < 0)
1022 x = 0;
1023#endif
1024 return x;
1025}
ea426c2a
RG
1026
1027unsigned long node_page_state(struct pglist_data *pgdat,
1028 enum node_stat_item item)
1029{
1030 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1031
1032 return node_page_state_pages(pgdat, item);
1033}
ca889e6c
CL
1034#endif
1035
9d857311
PT
1036/*
1037 * Count number of pages "struct page" and "struct page_ext" consume.
1038 * nr_memmap_boot_pages: # of pages allocated by boot allocator
1039 * nr_memmap_pages: # of pages that were allocated by buddy allocator
1040 */
1041static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0);
1042static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0);
1043
1044void memmap_boot_pages_add(long delta)
1045{
1046 atomic_long_add(delta, &nr_memmap_boot_pages);
1047}
1048
1049void memmap_pages_add(long delta)
1050{
1051 atomic_long_add(delta, &nr_memmap_pages);
1052}
1053
d7a5752c 1054#ifdef CONFIG_COMPACTION
36deb0be 1055
d7a5752c
MG
1056struct contig_page_info {
1057 unsigned long free_pages;
1058 unsigned long free_blocks_total;
1059 unsigned long free_blocks_suitable;
1060};
1061
1062/*
1063 * Calculate the number of free pages in a zone, how many contiguous
1064 * pages are free and how many are large enough to satisfy an allocation of
1065 * the target size. Note that this function makes no attempt to estimate
1066 * how many suitable free blocks there *might* be if MOVABLE pages were
1067 * migrated. Calculating that is possible, but expensive and can be
1068 * figured out from userspace
1069 */
1070static void fill_contig_page_info(struct zone *zone,
1071 unsigned int suitable_order,
1072 struct contig_page_info *info)
1073{
1074 unsigned int order;
1075
1076 info->free_pages = 0;
1077 info->free_blocks_total = 0;
1078 info->free_blocks_suitable = 0;
1079
fd377218 1080 for (order = 0; order < NR_PAGE_ORDERS; order++) {
d7a5752c
MG
1081 unsigned long blocks;
1082
af1c31ac
LS
1083 /*
1084 * Count number of free blocks.
1085 *
1086 * Access to nr_free is lockless as nr_free is used only for
1087 * diagnostic purposes. Use data_race to avoid KCSAN warning.
1088 */
1089 blocks = data_race(zone->free_area[order].nr_free);
d7a5752c
MG
1090 info->free_blocks_total += blocks;
1091
1092 /* Count free base pages */
1093 info->free_pages += blocks << order;
1094
1095 /* Count the suitable free blocks */
1096 if (order >= suitable_order)
1097 info->free_blocks_suitable += blocks <<
1098 (order - suitable_order);
1099 }
1100}
f1a5ab12
MG
1101
1102/*
1103 * A fragmentation index only makes sense if an allocation of a requested
1104 * size would fail. If that is true, the fragmentation index indicates
1105 * whether external fragmentation or a lack of memory was the problem.
1106 * The value can be used to determine if page reclaim or compaction
1107 * should be used
1108 */
56de7263 1109static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
f1a5ab12
MG
1110{
1111 unsigned long requested = 1UL << order;
1112
5e0a760b 1113 if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
88d6ac40
WY
1114 return 0;
1115
f1a5ab12
MG
1116 if (!info->free_blocks_total)
1117 return 0;
1118
1119 /* Fragmentation index only makes sense when a request would fail */
1120 if (info->free_blocks_suitable)
1121 return -1000;
1122
1123 /*
1124 * Index is between 0 and 1 so return within 3 decimal places
1125 *
1126 * 0 => allocation would fail due to lack of memory
1127 * 1 => allocation would fail due to fragmentation
1128 */
1129 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1130}
56de7263 1131
facdaa91
NG
1132/*
1133 * Calculates external fragmentation within a zone wrt the given order.
1134 * It is defined as the percentage of pages found in blocks of size
1135 * less than 1 << order. It returns values in range [0, 100].
1136 */
d34c0a75 1137unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
facdaa91
NG
1138{
1139 struct contig_page_info info;
1140
1141 fill_contig_page_info(zone, order, &info);
1142 if (info.free_pages == 0)
1143 return 0;
1144
1145 return div_u64((info.free_pages -
1146 (info.free_blocks_suitable << order)) * 100,
1147 info.free_pages);
1148}
1149
56de7263
MG
1150/* Same as __fragmentation index but allocs contig_page_info on stack */
1151int fragmentation_index(struct zone *zone, unsigned int order)
1152{
1153 struct contig_page_info info;
1154
1155 fill_contig_page_info(zone, order, &info);
1156 return __fragmentation_index(order, &info);
1157}
d7a5752c
MG
1158#endif
1159
ebc5d83d
KK
1160#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1161 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
fa25c503
KM
1162#ifdef CONFIG_ZONE_DMA
1163#define TEXT_FOR_DMA(xx) xx "_dma",
1164#else
1165#define TEXT_FOR_DMA(xx)
1166#endif
1167
1168#ifdef CONFIG_ZONE_DMA32
1169#define TEXT_FOR_DMA32(xx) xx "_dma32",
1170#else
1171#define TEXT_FOR_DMA32(xx)
1172#endif
1173
1174#ifdef CONFIG_HIGHMEM
1175#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1176#else
1177#define TEXT_FOR_HIGHMEM(xx)
1178#endif
1179
a39c5d3c
HL
1180#ifdef CONFIG_ZONE_DEVICE
1181#define TEXT_FOR_DEVICE(xx) xx "_device",
1182#else
1183#define TEXT_FOR_DEVICE(xx)
1184#endif
1185
fa25c503 1186#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
a39c5d3c
HL
1187 TEXT_FOR_HIGHMEM(xx) xx "_movable", \
1188 TEXT_FOR_DEVICE(xx)
fa25c503
KM
1189
1190const char * const vmstat_text[] = {
8d92890b 1191 /* enum zone_stat_item counters */
fa25c503 1192 "nr_free_pages",
71c799f4
MK
1193 "nr_zone_inactive_anon",
1194 "nr_zone_active_anon",
1195 "nr_zone_inactive_file",
1196 "nr_zone_active_file",
1197 "nr_zone_unevictable",
5a1c84b4 1198 "nr_zone_write_pending",
fa25c503 1199 "nr_mlock",
fa25c503 1200 "nr_bounce",
91537fee
MK
1201#if IS_ENABLED(CONFIG_ZSMALLOC)
1202 "nr_zspages",
1203#endif
3a321d2a 1204 "nr_free_cma",
dcdfdd40
KS
1205#ifdef CONFIG_UNACCEPTED_MEMORY
1206 "nr_unaccepted",
1207#endif
3a321d2a
KW
1208
1209 /* enum numa_stat_item counters */
fa25c503
KM
1210#ifdef CONFIG_NUMA
1211 "numa_hit",
1212 "numa_miss",
1213 "numa_foreign",
1214 "numa_interleave",
1215 "numa_local",
1216 "numa_other",
1217#endif
09316c09 1218
9d7ea9a2 1219 /* enum node_stat_item counters */
599d0c95
MG
1220 "nr_inactive_anon",
1221 "nr_active_anon",
1222 "nr_inactive_file",
1223 "nr_active_file",
1224 "nr_unevictable",
385386cf
JW
1225 "nr_slab_reclaimable",
1226 "nr_slab_unreclaimable",
599d0c95
MG
1227 "nr_isolated_anon",
1228 "nr_isolated_file",
68d48e6a 1229 "workingset_nodes",
170b04b7
JK
1230 "workingset_refault_anon",
1231 "workingset_refault_file",
1232 "workingset_activate_anon",
1233 "workingset_activate_file",
1234 "workingset_restore_anon",
1235 "workingset_restore_file",
1e6b1085 1236 "workingset_nodereclaim",
50658e2e
MG
1237 "nr_anon_pages",
1238 "nr_mapped",
11fb9989
MG
1239 "nr_file_pages",
1240 "nr_dirty",
1241 "nr_writeback",
1242 "nr_writeback_temp",
1243 "nr_shmem",
1244 "nr_shmem_hugepages",
1245 "nr_shmem_pmdmapped",
60fbf0ab
SL
1246 "nr_file_hugepages",
1247 "nr_file_pmdmapped",
11fb9989 1248 "nr_anon_transparent_hugepages",
c4a25635
MG
1249 "nr_vmscan_write",
1250 "nr_vmscan_immediate_reclaim",
1251 "nr_dirtied",
1252 "nr_written",
8cd7c588 1253 "nr_throttled_written",
b29940c1 1254 "nr_kernel_misc_reclaimable",
1970dc6f
JH
1255 "nr_foll_pin_acquired",
1256 "nr_foll_pin_released",
991e7673
SB
1257 "nr_kernel_stack",
1258#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1259 "nr_shadow_call_stack",
1260#endif
f0c0c115 1261 "nr_page_table_pages",
ebc97a52 1262 "nr_sec_page_table_pages",
bd3520a9
PT
1263#ifdef CONFIG_IOMMU_SUPPORT
1264 "nr_iommu_pages",
1265#endif
b6038942
SB
1266#ifdef CONFIG_SWAP
1267 "nr_swapcached",
1268#endif
e39bb6be
YH
1269#ifdef CONFIG_NUMA_BALANCING
1270 "pgpromote_success",
c6833e10 1271 "pgpromote_candidate",
b805ab3c 1272#endif
23e9f013
LZ
1273 "pgdemote_kswapd",
1274 "pgdemote_direct",
1275 "pgdemote_khugepaged",
f4cb78af 1276 /* system-wide enum vm_stat_item counters */
fa25c503
KM
1277 "nr_dirty_threshold",
1278 "nr_dirty_background_threshold",
9d857311
PT
1279 "nr_memmap_pages",
1280 "nr_memmap_boot_pages",
fa25c503 1281
ebc5d83d 1282#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1283 /* enum vm_event_item counters */
fa25c503
KM
1284 "pgpgin",
1285 "pgpgout",
1286 "pswpin",
1287 "pswpout",
1288
1289 TEXTS_FOR_ZONES("pgalloc")
7cc30fcf
MG
1290 TEXTS_FOR_ZONES("allocstall")
1291 TEXTS_FOR_ZONES("pgskip")
fa25c503
KM
1292
1293 "pgfree",
1294 "pgactivate",
1295 "pgdeactivate",
f7ad2a6c 1296 "pglazyfree",
fa25c503
KM
1297
1298 "pgfault",
1299 "pgmajfault",
854e9ed0 1300 "pglazyfreed",
fa25c503 1301
599d0c95 1302 "pgrefill",
798a6b87 1303 "pgreuse",
599d0c95
MG
1304 "pgsteal_kswapd",
1305 "pgsteal_direct",
57e9cc50 1306 "pgsteal_khugepaged",
599d0c95
MG
1307 "pgscan_kswapd",
1308 "pgscan_direct",
57e9cc50 1309 "pgscan_khugepaged",
68243e76 1310 "pgscan_direct_throttle",
497a6c1b
JW
1311 "pgscan_anon",
1312 "pgscan_file",
1313 "pgsteal_anon",
1314 "pgsteal_file",
fa25c503
KM
1315
1316#ifdef CONFIG_NUMA
5fe690a5 1317 "zone_reclaim_success",
fa25c503
KM
1318 "zone_reclaim_failed",
1319#endif
1320 "pginodesteal",
1321 "slabs_scanned",
fa25c503
KM
1322 "kswapd_inodesteal",
1323 "kswapd_low_wmark_hit_quickly",
1324 "kswapd_high_wmark_hit_quickly",
fa25c503 1325 "pageoutrun",
fa25c503
KM
1326
1327 "pgrotated",
1328
5509a5d2
DH
1329 "drop_pagecache",
1330 "drop_slab",
8e675f7a 1331 "oom_kill",
5509a5d2 1332
03c5a6e1
MG
1333#ifdef CONFIG_NUMA_BALANCING
1334 "numa_pte_updates",
72403b4a 1335 "numa_huge_pte_updates",
03c5a6e1
MG
1336 "numa_hint_faults",
1337 "numa_hint_faults_local",
1338 "numa_pages_migrated",
1339#endif
5647bc29
MG
1340#ifdef CONFIG_MIGRATION
1341 "pgmigrate_success",
1342 "pgmigrate_fail",
1a5bae25
AK
1343 "thp_migration_success",
1344 "thp_migration_fail",
1345 "thp_migration_split",
5647bc29 1346#endif
fa25c503 1347#ifdef CONFIG_COMPACTION
397487db
MG
1348 "compact_migrate_scanned",
1349 "compact_free_scanned",
1350 "compact_isolated",
fa25c503
KM
1351 "compact_stall",
1352 "compact_fail",
1353 "compact_success",
698b1b30 1354 "compact_daemon_wake",
7f354a54
DR
1355 "compact_daemon_migrate_scanned",
1356 "compact_daemon_free_scanned",
fa25c503
KM
1357#endif
1358
1359#ifdef CONFIG_HUGETLB_PAGE
1360 "htlb_buddy_alloc_success",
1361 "htlb_buddy_alloc_fail",
bbb26920
MK
1362#endif
1363#ifdef CONFIG_CMA
1364 "cma_alloc_success",
1365 "cma_alloc_fail",
fa25c503
KM
1366#endif
1367 "unevictable_pgs_culled",
1368 "unevictable_pgs_scanned",
1369 "unevictable_pgs_rescued",
1370 "unevictable_pgs_mlocked",
1371 "unevictable_pgs_munlocked",
1372 "unevictable_pgs_cleared",
1373 "unevictable_pgs_stranded",
fa25c503
KM
1374
1375#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1376 "thp_fault_alloc",
1377 "thp_fault_fallback",
85b9f46e 1378 "thp_fault_fallback_charge",
fa25c503
KM
1379 "thp_collapse_alloc",
1380 "thp_collapse_alloc_failed",
95ecedcd 1381 "thp_file_alloc",
dcdf11ee 1382 "thp_file_fallback",
85b9f46e 1383 "thp_file_fallback_charge",
95ecedcd 1384 "thp_file_mapped",
122afea9
KS
1385 "thp_split_page",
1386 "thp_split_page_failed",
f9719a03 1387 "thp_deferred_split_page",
dafff3f4 1388 "thp_underused_split_page",
122afea9 1389 "thp_split_pmd",
e9ea874a
YY
1390 "thp_scan_exceed_none_pte",
1391 "thp_scan_exceed_swap_pte",
1392 "thp_scan_exceed_share_pte",
ce9311cf
YX
1393#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1394 "thp_split_pud",
1395#endif
d8a8e1f0
KS
1396 "thp_zero_page_alloc",
1397 "thp_zero_page_alloc_failed",
225311a4 1398 "thp_swpout",
fe490cc0 1399 "thp_swpout_fallback",
fa25c503 1400#endif
09316c09
KK
1401#ifdef CONFIG_MEMORY_BALLOON
1402 "balloon_inflate",
1403 "balloon_deflate",
1404#ifdef CONFIG_BALLOON_COMPACTION
1405 "balloon_migrate",
1406#endif
1407#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1408#ifdef CONFIG_DEBUG_TLBFLUSH
9824cf97
DH
1409 "nr_tlb_remote_flush",
1410 "nr_tlb_remote_flush_received",
1411 "nr_tlb_local_flush_all",
1412 "nr_tlb_local_flush_one",
ec659934 1413#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1414
cbc65df2
YH
1415#ifdef CONFIG_SWAP
1416 "swap_ra",
1417 "swap_ra_hit",
4d45c3af
YY
1418#ifdef CONFIG_KSM
1419 "ksm_swpin_copy",
1420#endif
cbc65df2 1421#endif
94bfe85b
YY
1422#ifdef CONFIG_KSM
1423 "cow_ksm",
1424#endif
f6498b77
JW
1425#ifdef CONFIG_ZSWAP
1426 "zswpin",
1427 "zswpout",
7108cc3f 1428 "zswpwb",
f6498b77 1429#endif
575299ea
S
1430#ifdef CONFIG_X86
1431 "direct_map_level2_splits",
1432 "direct_map_level3_splits",
1433#endif
52f23865
SB
1434#ifdef CONFIG_PER_VMA_LOCK_STATS
1435 "vma_lock_success",
1436 "vma_lock_abort",
1437 "vma_lock_retry",
1438 "vma_lock_miss",
1439#endif
c4a6fce8
PT
1440#ifdef CONFIG_DEBUG_STACK_USAGE
1441 "kstack_1k",
1442#if THREAD_SIZE > 1024
1443 "kstack_2k",
1444#endif
1445#if THREAD_SIZE > 2048
1446 "kstack_4k",
1447#endif
1448#if THREAD_SIZE > 4096
1449 "kstack_8k",
1450#endif
1451#if THREAD_SIZE > 8192
1452 "kstack_16k",
1453#endif
1454#if THREAD_SIZE > 16384
1455 "kstack_32k",
1456#endif
1457#if THREAD_SIZE > 32768
1458 "kstack_64k",
1459#endif
1460#if THREAD_SIZE > 65536
1461 "kstack_rest",
1462#endif
1463#endif
ebc5d83d 1464#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1465};
ebc5d83d 1466#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1467
3c486871
AM
1468#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1469 defined(CONFIG_PROC_FS)
1470static void *frag_start(struct seq_file *m, loff_t *pos)
1471{
1472 pg_data_t *pgdat;
1473 loff_t node = *pos;
1474
1475 for (pgdat = first_online_pgdat();
1476 pgdat && node;
1477 pgdat = next_online_pgdat(pgdat))
1478 --node;
1479
1480 return pgdat;
1481}
1482
1483static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1484{
1485 pg_data_t *pgdat = (pg_data_t *)arg;
1486
1487 (*pos)++;
1488 return next_online_pgdat(pgdat);
1489}
1490
1491static void frag_stop(struct seq_file *m, void *arg)
1492{
1493}
1494
b2bd8598
DR
1495/*
1496 * Walk zones in a node and print using a callback.
1497 * If @assert_populated is true, only use callback for zones that are populated.
1498 */
3c486871 1499static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1500 bool assert_populated, bool nolock,
3c486871
AM
1501 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1502{
1503 struct zone *zone;
1504 struct zone *node_zones = pgdat->node_zones;
1505 unsigned long flags;
1506
1507 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1508 if (assert_populated && !populated_zone(zone))
3c486871
AM
1509 continue;
1510
727c080f
VM
1511 if (!nolock)
1512 spin_lock_irqsave(&zone->lock, flags);
3c486871 1513 print(m, pgdat, zone);
727c080f
VM
1514 if (!nolock)
1515 spin_unlock_irqrestore(&zone->lock, flags);
3c486871
AM
1516 }
1517}
1518#endif
1519
d7a5752c 1520#ifdef CONFIG_PROC_FS
467c996c
MG
1521static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1522 struct zone *zone)
1523{
1524 int order;
1525
1526 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
fd377218 1527 for (order = 0; order < NR_PAGE_ORDERS; ++order)
af1c31ac
LS
1528 /*
1529 * Access to nr_free is lockless as nr_free is used only for
1530 * printing purposes. Use data_race to avoid KCSAN warning.
1531 */
1532 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
467c996c
MG
1533 seq_putc(m, '\n');
1534}
1535
1536/*
1537 * This walks the free areas for each zone.
1538 */
1539static int frag_show(struct seq_file *m, void *arg)
1540{
1541 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1542 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
467c996c
MG
1543 return 0;
1544}
1545
1546static void pagetypeinfo_showfree_print(struct seq_file *m,
1547 pg_data_t *pgdat, struct zone *zone)
1548{
1549 int order, mtype;
1550
1551 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1552 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1553 pgdat->node_id,
1554 zone->name,
1555 migratetype_names[mtype]);
fd377218 1556 for (order = 0; order < NR_PAGE_ORDERS; ++order) {
467c996c
MG
1557 unsigned long freecount = 0;
1558 struct free_area *area;
1559 struct list_head *curr;
93b3a674 1560 bool overflow = false;
467c996c
MG
1561
1562 area = &(zone->free_area[order]);
1563
93b3a674
MH
1564 list_for_each(curr, &area->free_list[mtype]) {
1565 /*
1566 * Cap the free_list iteration because it might
1567 * be really large and we are under a spinlock
1568 * so a long time spent here could trigger a
1569 * hard lockup detector. Anyway this is a
1570 * debugging tool so knowing there is a handful
1571 * of pages of this order should be more than
1572 * sufficient.
1573 */
1574 if (++freecount >= 100000) {
1575 overflow = true;
1576 break;
1577 }
1578 }
1579 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1580 spin_unlock_irq(&zone->lock);
1581 cond_resched();
1582 spin_lock_irq(&zone->lock);
467c996c 1583 }
f6ac2354
CL
1584 seq_putc(m, '\n');
1585 }
467c996c
MG
1586}
1587
1588/* Print out the free pages at each order for each migatetype */
33090af9 1589static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
467c996c
MG
1590{
1591 int order;
1592 pg_data_t *pgdat = (pg_data_t *)arg;
1593
1594 /* Print header */
1595 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
fd377218 1596 for (order = 0; order < NR_PAGE_ORDERS; ++order)
467c996c
MG
1597 seq_printf(m, "%6d ", order);
1598 seq_putc(m, '\n');
1599
727c080f 1600 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1601}
1602
1603static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1604 pg_data_t *pgdat, struct zone *zone)
1605{
1606 int mtype;
1607 unsigned long pfn;
1608 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1609 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1610 unsigned long count[MIGRATE_TYPES] = { 0, };
1611
1612 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1613 struct page *page;
1614
d336e94e
MH
1615 page = pfn_to_online_page(pfn);
1616 if (!page)
467c996c
MG
1617 continue;
1618
a91c43c7
JK
1619 if (page_zone(page) != zone)
1620 continue;
1621
467c996c
MG
1622 mtype = get_pageblock_migratetype(page);
1623
e80d6a24
MG
1624 if (mtype < MIGRATE_TYPES)
1625 count[mtype]++;
467c996c
MG
1626 }
1627
1628 /* Print counts */
1629 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1630 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1631 seq_printf(m, "%12lu ", count[mtype]);
1632 seq_putc(m, '\n');
1633}
1634
f113e641 1635/* Print out the number of pageblocks for each migratetype */
33090af9 1636static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
467c996c
MG
1637{
1638 int mtype;
1639 pg_data_t *pgdat = (pg_data_t *)arg;
1640
1641 seq_printf(m, "\n%-23s", "Number of blocks type ");
1642 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1643 seq_printf(m, "%12s ", migratetype_names[mtype]);
1644 seq_putc(m, '\n');
727c080f
VM
1645 walk_zones_in_node(m, pgdat, true, false,
1646 pagetypeinfo_showblockcount_print);
467c996c
MG
1647}
1648
48c96a36
JK
1649/*
1650 * Print out the number of pageblocks for each migratetype that contain pages
1651 * of other types. This gives an indication of how well fallbacks are being
1652 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1653 * to determine what is going on
1654 */
1655static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1656{
1657#ifdef CONFIG_PAGE_OWNER
1658 int mtype;
1659
7dd80b8a 1660 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1661 return;
1662
1663 drain_all_pages(NULL);
1664
1665 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1666 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1667 seq_printf(m, "%12s ", migratetype_names[mtype]);
1668 seq_putc(m, '\n');
1669
727c080f
VM
1670 walk_zones_in_node(m, pgdat, true, true,
1671 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1672#endif /* CONFIG_PAGE_OWNER */
1673}
1674
467c996c
MG
1675/*
1676 * This prints out statistics in relation to grouping pages by mobility.
1677 * It is expensive to collect so do not constantly read the file.
1678 */
1679static int pagetypeinfo_show(struct seq_file *m, void *arg)
1680{
1681 pg_data_t *pgdat = (pg_data_t *)arg;
1682
41b25a37 1683 /* check memoryless node */
a47b53c5 1684 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1685 return 0;
1686
467c996c
MG
1687 seq_printf(m, "Page block order: %d\n", pageblock_order);
1688 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1689 seq_putc(m, '\n');
1690 pagetypeinfo_showfree(m, pgdat);
1691 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1692 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1693
f6ac2354
CL
1694 return 0;
1695}
1696
8f32f7e5 1697static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1698 .start = frag_start,
1699 .next = frag_next,
1700 .stop = frag_stop,
1701 .show = frag_show,
1702};
1703
74e2e8e8 1704static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1705 .start = frag_start,
1706 .next = frag_next,
1707 .stop = frag_stop,
1708 .show = pagetypeinfo_show,
1709};
1710
e2ecc8a7
MG
1711static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1712{
1713 int zid;
1714
1715 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1716 struct zone *compare = &pgdat->node_zones[zid];
1717
1718 if (populated_zone(compare))
1719 return zone == compare;
1720 }
1721
e2ecc8a7
MG
1722 return false;
1723}
1724
467c996c
MG
1725static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1726 struct zone *zone)
f6ac2354 1727{
467c996c
MG
1728 int i;
1729 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1730 if (is_zone_first_populated(pgdat, zone)) {
1731 seq_printf(m, "\n per-node stats");
1732 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
69473e5d
MS
1733 unsigned long pages = node_page_state_pages(pgdat, i);
1734
1735 if (vmstat_item_print_in_thp(i))
1736 pages /= HPAGE_PMD_NR;
9d7ea9a2 1737 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1738 pages);
e2ecc8a7
MG
1739 }
1740 }
467c996c
MG
1741 seq_printf(m,
1742 "\n pages free %lu"
a6ea8b5b 1743 "\n boost %lu"
467c996c
MG
1744 "\n min %lu"
1745 "\n low %lu"
1746 "\n high %lu"
528afe6b 1747 "\n promo %lu"
467c996c 1748 "\n spanned %lu"
9feedc9d 1749 "\n present %lu"
3c381db1
DH
1750 "\n managed %lu"
1751 "\n cma %lu",
88f5acf8 1752 zone_page_state(zone, NR_FREE_PAGES),
a6ea8b5b 1753 zone->watermark_boost,
41858966
MG
1754 min_wmark_pages(zone),
1755 low_wmark_pages(zone),
1756 high_wmark_pages(zone),
528afe6b 1757 promo_wmark_pages(zone),
467c996c 1758 zone->spanned_pages,
9feedc9d 1759 zone->present_pages,
3c381db1
DH
1760 zone_managed_pages(zone),
1761 zone_cma_pages(zone));
467c996c 1762
467c996c 1763 seq_printf(m,
3484b2de 1764 "\n protection: (%ld",
467c996c
MG
1765 zone->lowmem_reserve[0]);
1766 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1767 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1768 seq_putc(m, ')');
1769
a8a4b7ae
BH
1770 /* If unpopulated, no other information is useful */
1771 if (!populated_zone(zone)) {
1772 seq_putc(m, '\n');
1773 return;
1774 }
1775
7dfb8bf3 1776 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1777 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1778 zone_page_state(zone, i));
7dfb8bf3 1779
3a321d2a 1780#ifdef CONFIG_NUMA
f19298b9 1781 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
9d7ea9a2 1782 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
f19298b9 1783 zone_numa_event_state(zone, i));
3a321d2a
KW
1784#endif
1785
7dfb8bf3 1786 seq_printf(m, "\n pagesets");
467c996c 1787 for_each_online_cpu(i) {
28f836b6
MG
1788 struct per_cpu_pages *pcp;
1789 struct per_cpu_zonestat __maybe_unused *pzstats;
467c996c 1790
28f836b6 1791 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
3dfa5721
CL
1792 seq_printf(m,
1793 "\n cpu: %i"
1794 "\n count: %i"
1795 "\n high: %i"
1796 "\n batch: %i",
1797 i,
28f836b6
MG
1798 pcp->count,
1799 pcp->high,
1800 pcp->batch);
df9ecaba 1801#ifdef CONFIG_SMP
28f836b6 1802 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
467c996c 1803 seq_printf(m, "\n vm stats threshold: %d",
28f836b6 1804 pzstats->stat_threshold);
df9ecaba 1805#endif
f6ac2354 1806 }
467c996c 1807 seq_printf(m,
599d0c95 1808 "\n node_unreclaimable: %u"
3a50d14d 1809 "\n start_pfn: %lu",
c73322d0 1810 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1811 zone->zone_start_pfn);
467c996c
MG
1812 seq_putc(m, '\n');
1813}
1814
1815/*
b2bd8598
DR
1816 * Output information about zones in @pgdat. All zones are printed regardless
1817 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1818 * set of all zones and userspace would not be aware of such zones if they are
1819 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1820 */
1821static int zoneinfo_show(struct seq_file *m, void *arg)
1822{
1823 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1824 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1825 return 0;
1826}
1827
5c9fe628 1828static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1829 .start = frag_start, /* iterate over all zones. The same as in
1830 * fragmentation. */
1831 .next = frag_next,
1832 .stop = frag_stop,
1833 .show = zoneinfo_show,
1834};
1835
9d7ea9a2 1836#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
f19298b9 1837 NR_VM_NUMA_EVENT_ITEMS + \
9d7ea9a2 1838 NR_VM_NODE_STAT_ITEMS + \
f4cb78af 1839 NR_VM_STAT_ITEMS + \
9d7ea9a2
KK
1840 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1841 NR_VM_EVENT_ITEMS : 0))
79da826a 1842
f6ac2354
CL
1843static void *vmstat_start(struct seq_file *m, loff_t *pos)
1844{
2244b95a 1845 unsigned long *v;
9d7ea9a2 1846 int i;
f6ac2354 1847
9d7ea9a2 1848 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1849 return NULL;
79da826a 1850
9d7ea9a2 1851 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
f19298b9 1852 fold_vm_numa_events();
9d7ea9a2 1853 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1854 m->private = v;
1855 if (!v)
f6ac2354 1856 return ERR_PTR(-ENOMEM);
2244b95a 1857 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1858 v[i] = global_zone_page_state(i);
79da826a
MR
1859 v += NR_VM_ZONE_STAT_ITEMS;
1860
3a321d2a 1861#ifdef CONFIG_NUMA
f19298b9
MG
1862 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1863 v[i] = global_numa_event_state(i);
1864 v += NR_VM_NUMA_EVENT_ITEMS;
3a321d2a
KW
1865#endif
1866
69473e5d 1867 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1868 v[i] = global_node_page_state_pages(i);
69473e5d
MS
1869 if (vmstat_item_print_in_thp(i))
1870 v[i] /= HPAGE_PMD_NR;
1871 }
75ef7184
MG
1872 v += NR_VM_NODE_STAT_ITEMS;
1873
79da826a
MR
1874 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1875 v + NR_DIRTY_THRESHOLD);
9d857311
PT
1876 v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages);
1877 v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages);
f4cb78af 1878 v += NR_VM_STAT_ITEMS;
79da826a 1879
f8891e5e 1880#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1881 all_vm_events(v);
1882 v[PGPGIN] /= 2; /* sectors -> kbytes */
1883 v[PGPGOUT] /= 2;
f8891e5e 1884#endif
ff8b16d7 1885 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1886}
1887
1888static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1889{
1890 (*pos)++;
9d7ea9a2 1891 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1892 return NULL;
1893 return (unsigned long *)m->private + *pos;
1894}
1895
1896static int vmstat_show(struct seq_file *m, void *arg)
1897{
1898 unsigned long *l = arg;
1899 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1900
1901 seq_puts(m, vmstat_text[off]);
75ba1d07 1902 seq_put_decimal_ull(m, " ", *l);
68ba0326 1903 seq_putc(m, '\n');
8d92890b
N
1904
1905 if (off == NR_VMSTAT_ITEMS - 1) {
1906 /*
1907 * We've come to the end - add any deprecated counters to avoid
1908 * breaking userspace which might depend on them being present.
1909 */
1910 seq_puts(m, "nr_unstable 0\n");
1911 }
f6ac2354
CL
1912 return 0;
1913}
1914
1915static void vmstat_stop(struct seq_file *m, void *arg)
1916{
1917 kfree(m->private);
1918 m->private = NULL;
1919}
1920
b6aa44ab 1921static const struct seq_operations vmstat_op = {
f6ac2354
CL
1922 .start = vmstat_start,
1923 .next = vmstat_next,
1924 .stop = vmstat_stop,
1925 .show = vmstat_show,
1926};
f6ac2354
CL
1927#endif /* CONFIG_PROC_FS */
1928
df9ecaba 1929#ifdef CONFIG_SMP
d1187ed2 1930static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1931int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1932
52b6f46b
HD
1933#ifdef CONFIG_PROC_FS
1934static void refresh_vm_stats(struct work_struct *work)
1935{
1936 refresh_cpu_vm_stats(true);
1937}
1938
78eb4ea2 1939int vmstat_refresh(const struct ctl_table *table, int write,
32927393 1940 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1941{
1942 long val;
1943 int err;
1944 int i;
1945
1946 /*
1947 * The regular update, every sysctl_stat_interval, may come later
1948 * than expected: leaving a significant amount in per_cpu buckets.
1949 * This is particularly misleading when checking a quantity of HUGE
1950 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1951 * which can equally be echo'ed to or cat'ted from (by root),
1952 * can be used to update the stats just before reading them.
1953 *
c41f012a 1954 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1955 * transiently negative values, report an error here if any of
1956 * the stats is negative, so we know to go looking for imbalance.
1957 */
1958 err = schedule_on_each_cpu(refresh_vm_stats);
1959 if (err)
1960 return err;
1961 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75083aae
HD
1962 /*
1963 * Skip checking stats known to go negative occasionally.
1964 */
1965 switch (i) {
1966 case NR_ZONE_WRITE_PENDING:
1967 case NR_FREE_CMA_PAGES:
1968 continue;
1969 }
75ef7184 1970 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1971 if (val < 0) {
c822f622 1972 pr_warn("%s: %s %ld\n",
9d7ea9a2 1973 __func__, zone_stat_name(i), val);
52b6f46b
HD
1974 }
1975 }
76d8cc3c 1976 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
75083aae
HD
1977 /*
1978 * Skip checking stats known to go negative occasionally.
1979 */
1980 switch (i) {
1981 case NR_WRITEBACK:
1982 continue;
1983 }
76d8cc3c
HD
1984 val = atomic_long_read(&vm_node_stat[i]);
1985 if (val < 0) {
1986 pr_warn("%s: %s %ld\n",
1987 __func__, node_stat_name(i), val);
76d8cc3c
HD
1988 }
1989 }
52b6f46b
HD
1990 if (write)
1991 *ppos += *lenp;
1992 else
1993 *lenp = 0;
1994 return 0;
1995}
1996#endif /* CONFIG_PROC_FS */
1997
d1187ed2
CL
1998static void vmstat_update(struct work_struct *w)
1999{
0eb77e98 2000 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
2001 /*
2002 * Counters were updated so we expect more updates
2003 * to occur in the future. Keep on running the
2004 * update worker thread.
2005 */
ce612879 2006 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
2007 this_cpu_ptr(&vmstat_work),
2008 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
2009 }
2010}
2011
2012/*
2013 * Check if the diffs for a certain cpu indicate that
2014 * an update is needed.
2015 */
2016static bool need_update(int cpu)
2017{
2bbd00ae 2018 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
2019 struct zone *zone;
2020
2021 for_each_populated_zone(zone) {
28f836b6 2022 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bbd00ae 2023 struct per_cpu_nodestat *n;
28f836b6 2024
7cc36bbd
CL
2025 /*
2026 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 2027 */
64632fd3 2028 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
7cc36bbd 2029 return true;
f19298b9 2030
2bbd00ae
JW
2031 if (last_pgdat == zone->zone_pgdat)
2032 continue;
2033 last_pgdat = zone->zone_pgdat;
2034 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
64632fd3
ML
2035 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
2036 return true;
7cc36bbd
CL
2037 }
2038 return false;
2039}
2040
7b8da4c7
CL
2041/*
2042 * Switch off vmstat processing and then fold all the remaining differentials
2043 * until the diffs stay at zero. The function is used by NOHZ and can only be
2044 * invoked when tick processing is not active.
2045 */
f01f17d3
MH
2046void quiet_vmstat(void)
2047{
2048 if (system_state != SYSTEM_RUNNING)
2049 return;
2050
7b8da4c7 2051 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
2052 return;
2053
2054 if (!need_update(smp_processor_id()))
2055 return;
2056
2057 /*
2058 * Just refresh counters and do not care about the pending delayed
2059 * vmstat_update. It doesn't fire that often to matter and canceling
2060 * it would be too expensive from this path.
2061 * vmstat_shepherd will take care about that for us.
2062 */
2063 refresh_cpu_vm_stats(false);
2064}
2065
7cc36bbd
CL
2066/*
2067 * Shepherd worker thread that checks the
2068 * differentials of processors that have their worker
2069 * threads for vm statistics updates disabled because of
2070 * inactivity.
2071 */
2072static void vmstat_shepherd(struct work_struct *w);
2073
0eb77e98 2074static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
2075
2076static void vmstat_shepherd(struct work_struct *w)
2077{
2078 int cpu;
2079
7625eccd 2080 cpus_read_lock();
7cc36bbd 2081 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 2082 for_each_online_cpu(cpu) {
f01f17d3 2083 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 2084
be5e015d
MT
2085 /*
2086 * In kernel users of vmstat counters either require the precise value and
2087 * they are using zone_page_state_snapshot interface or they can live with
2088 * an imprecision as the regular flushing can happen at arbitrary time and
2089 * cumulative error can grow (see calculate_normal_threshold).
2090 *
2091 * From that POV the regular flushing can be postponed for CPUs that have
2092 * been isolated from the kernel interference without critical
2093 * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
2094 * for all isolated CPUs to avoid interference with the isolated workload.
2095 */
2096 if (cpu_is_isolated(cpu))
2097 continue;
2098
7b8da4c7 2099 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 2100 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
2101
2102 cond_resched();
f01f17d3 2103 }
7625eccd 2104 cpus_read_unlock();
7cc36bbd
CL
2105
2106 schedule_delayed_work(&shepherd,
98f4ebb2 2107 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2108}
2109
7cc36bbd 2110static void __init start_shepherd_timer(void)
d1187ed2 2111{
7cc36bbd
CL
2112 int cpu;
2113
2114 for_each_possible_cpu(cpu)
ccde8bd4 2115 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
2116 vmstat_update);
2117
7cc36bbd
CL
2118 schedule_delayed_work(&shepherd,
2119 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2120}
2121
03e86dba
TC
2122static void __init init_cpu_node_state(void)
2123{
4c501327 2124 int node;
03e86dba 2125
4c501327 2126 for_each_online_node(node) {
b55032f1 2127 if (!cpumask_empty(cpumask_of_node(node)))
4c501327
SAS
2128 node_set_state(node, N_CPU);
2129 }
03e86dba
TC
2130}
2131
5438da97
SAS
2132static int vmstat_cpu_online(unsigned int cpu)
2133{
2134 refresh_zone_stat_thresholds();
734c1570
OS
2135
2136 if (!node_state(cpu_to_node(cpu), N_CPU)) {
2137 node_set_state(cpu_to_node(cpu), N_CPU);
734c1570
OS
2138 }
2139
5438da97
SAS
2140 return 0;
2141}
2142
2143static int vmstat_cpu_down_prep(unsigned int cpu)
2144{
2145 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2146 return 0;
2147}
2148
2149static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2150{
4c501327 2151 const struct cpumask *node_cpus;
5438da97 2152 int node;
807a1bd2 2153
5438da97
SAS
2154 node = cpu_to_node(cpu);
2155
2156 refresh_zone_stat_thresholds();
4c501327 2157 node_cpus = cpumask_of_node(node);
b55032f1 2158 if (!cpumask_empty(node_cpus))
5438da97 2159 return 0;
807a1bd2
TK
2160
2161 node_clear_state(node, N_CPU);
734c1570 2162
5438da97 2163 return 0;
807a1bd2
TK
2164}
2165
8f32f7e5 2166#endif
df9ecaba 2167
ce612879
MH
2168struct workqueue_struct *mm_percpu_wq;
2169
597b7305 2170void __init init_mm_internals(void)
df9ecaba 2171{
ce612879 2172 int ret __maybe_unused;
5438da97 2173
80d136e1 2174 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2175
2176#ifdef CONFIG_SMP
5438da97
SAS
2177 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2178 NULL, vmstat_cpu_dead);
2179 if (ret < 0)
2180 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2181
2182 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2183 vmstat_cpu_online,
2184 vmstat_cpu_down_prep);
2185 if (ret < 0)
2186 pr_err("vmstat: failed to register 'online' hotplug state\n");
2187
7625eccd 2188 cpus_read_lock();
03e86dba 2189 init_cpu_node_state();
7625eccd 2190 cpus_read_unlock();
d1187ed2 2191
7cc36bbd 2192 start_shepherd_timer();
8f32f7e5
AD
2193#endif
2194#ifdef CONFIG_PROC_FS
fddda2b7 2195 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2196 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2197 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2198 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2199#endif
df9ecaba 2200}
d7a5752c
MG
2201
2202#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2203
2204/*
2205 * Return an index indicating how much of the available free memory is
2206 * unusable for an allocation of the requested size.
2207 */
2208static int unusable_free_index(unsigned int order,
2209 struct contig_page_info *info)
2210{
2211 /* No free memory is interpreted as all free memory is unusable */
2212 if (info->free_pages == 0)
2213 return 1000;
2214
2215 /*
2216 * Index should be a value between 0 and 1. Return a value to 3
2217 * decimal places.
2218 *
2219 * 0 => no fragmentation
2220 * 1 => high fragmentation
2221 */
2222 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2223
2224}
2225
2226static void unusable_show_print(struct seq_file *m,
2227 pg_data_t *pgdat, struct zone *zone)
2228{
2229 unsigned int order;
2230 int index;
2231 struct contig_page_info info;
2232
2233 seq_printf(m, "Node %d, zone %8s ",
2234 pgdat->node_id,
2235 zone->name);
fd377218 2236 for (order = 0; order < NR_PAGE_ORDERS; ++order) {
d7a5752c
MG
2237 fill_contig_page_info(zone, order, &info);
2238 index = unusable_free_index(order, &info);
2239 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2240 }
2241
2242 seq_putc(m, '\n');
2243}
2244
2245/*
2246 * Display unusable free space index
2247 *
2248 * The unusable free space index measures how much of the available free
2249 * memory cannot be used to satisfy an allocation of a given size and is a
2250 * value between 0 and 1. The higher the value, the more of free memory is
2251 * unusable and by implication, the worse the external fragmentation is. This
2252 * can be expressed as a percentage by multiplying by 100.
2253 */
2254static int unusable_show(struct seq_file *m, void *arg)
2255{
2256 pg_data_t *pgdat = (pg_data_t *)arg;
2257
2258 /* check memoryless node */
a47b53c5 2259 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2260 return 0;
2261
727c080f 2262 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2263
2264 return 0;
2265}
2266
01a99560 2267static const struct seq_operations unusable_sops = {
d7a5752c
MG
2268 .start = frag_start,
2269 .next = frag_next,
2270 .stop = frag_stop,
2271 .show = unusable_show,
2272};
2273
01a99560 2274DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2275
f1a5ab12
MG
2276static void extfrag_show_print(struct seq_file *m,
2277 pg_data_t *pgdat, struct zone *zone)
2278{
2279 unsigned int order;
2280 int index;
2281
2282 /* Alloc on stack as interrupts are disabled for zone walk */
2283 struct contig_page_info info;
2284
2285 seq_printf(m, "Node %d, zone %8s ",
2286 pgdat->node_id,
2287 zone->name);
fd377218 2288 for (order = 0; order < NR_PAGE_ORDERS; ++order) {
f1a5ab12 2289 fill_contig_page_info(zone, order, &info);
56de7263 2290 index = __fragmentation_index(order, &info);
a9970586 2291 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
f1a5ab12
MG
2292 }
2293
2294 seq_putc(m, '\n');
2295}
2296
2297/*
2298 * Display fragmentation index for orders that allocations would fail for
2299 */
2300static int extfrag_show(struct seq_file *m, void *arg)
2301{
2302 pg_data_t *pgdat = (pg_data_t *)arg;
2303
727c080f 2304 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2305
2306 return 0;
2307}
2308
01a99560 2309static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2310 .start = frag_start,
2311 .next = frag_next,
2312 .stop = frag_stop,
2313 .show = extfrag_show,
2314};
2315
01a99560 2316DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2317
d7a5752c
MG
2318static int __init extfrag_debug_init(void)
2319{
bde8bd8a
S
2320 struct dentry *extfrag_debug_root;
2321
d7a5752c 2322 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2323
d9f7979c 2324 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2325 &unusable_fops);
d7a5752c 2326
d9f7979c 2327 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2328 &extfrag_fops);
f1a5ab12 2329
d7a5752c
MG
2330 return 0;
2331}
2332
2333module_init(extfrag_debug_init);
15995a35 2334
d7a5752c 2335#endif
This page took 5.210184 seconds and 4 git commands to generate.