// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * The following policies are supported per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave       Allocate memory interleaved over a set of nodes,
 *                  with normal fallback if it fails.
 *                  For VMA based allocations this interleaves based on the
 *                  offset into the backing object or offset into the mapping
 *                  for anonymous memory. For process policy a process counter
 *                  is used.
 *
 * bind             Only allocate memory on a specific set of nodes,
 *                  no fallback.
 *                  FIXME: memory is allocated starting with the first node
 *                  to the last. It would be better if bind would truly restrict
 *                  the allocation to memory nodes instead.
 *
 * preferred        Try a specific node first before normal fallback.
 *                  As a special case NUMA_NO_NODE here means do the allocation
 *                  on the local CPU. This is normally identical to default,
 *                  but useful to set in a VMA when you have a non default
 *                  process policy.
 *
 * preferred many   Try a set of nodes first before normal fallback. This is
 *                  similar to preferred without the special case.
 *
 * default          Allocate on the local node first, or when on a VMA
 *                  use the process policy. This is what Linux always did
 *                  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernels, lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
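
/*
 * Illustrative userspace sketch (not part of this kernel source): how the
 * policies described above are requested through the syscalls implemented
 * later in this file.  Assumes the <numaif.h> wrappers from libnuma and a
 * machine with at least two NUMA nodes; error handling is reduced to perror().
 */
#include <numaif.h>             /* set_mempolicy(), mbind(), MPOL_* */
#include <sys/mman.h>
#include <stdio.h>

int example_policies(void)
{
        unsigned long nodes = (1UL << 0) | (1UL << 1);  /* nodes 0 and 1 */
        size_t len = 1UL << 20;
        void *buf;

        /* Task policy: interleave new allocations across nodes 0 and 1. */
        if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes)))
                perror("set_mempolicy");

        /* VMA policy: bind one mapping to node 0, migrating existing pages. */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        nodes = 1UL << 0;
        if (mbind(buf, len, MPOL_BIND, &nodes, 8 * sizeof(nodes), MPOL_MF_MOVE))
                perror("mbind");
        return 0;
}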

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)   /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)   /* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)   /* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 *
 * Return: this @node if it is online, otherwise the closest node by distance
 */
int numa_map_to_online_node(int node)
{
        int min_dist = INT_MAX, dist, n, min_node;

        if (node == NUMA_NO_NODE || node_online(node))
                return node;

        min_node = node;
        for_each_online_node(n) {
                dist = node_distance(node, n);
                if (dist < min_dist) {
                        min_dist = dist;
                        min_node = n;
                }
        }

        return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

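/*
 * Illustrative in-kernel sketch (not part of this file): a hypothetical
 * driver helper that routes a device's firmware-reported node, which may be
 * offline, through numa_map_to_online_node() before allocating.  Assumes
 * dev_to_node() from <linux/device.h> and alloc_pages_node() from
 * <linux/gfp.h>.
 */
static inline struct page *alloc_near_device(struct device *dev, gfp_t gfp)
{
        int nid = numa_map_to_online_node(dev_to_node(dev));

        return alloc_pages_node(nid, gfp, 0);
}
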
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *pol = p->mempolicy;
        int node;

        if (pol)
                return pol;

        node = numa_node_id();
        if (node != NUMA_NO_NODE) {
                pol = &preferred_node_policy[node];
                /* preferred_node_policy is not initialised early in boot */
                if (pol->mode)
                        return pol;
        }

        return &default_policy;
}

static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t tmp;
        nodes_fold(tmp, *orig, nodes_weight(*rel));
        nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (nodes_empty(*nodes))
                return -EINVAL;
        pol->nodes = *nodes;
        return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (nodes_empty(*nodes))
                return -EINVAL;

        nodes_clear(pol->nodes);
        node_set(first_node(*nodes), pol->nodes);
        return 0;
}

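/*
 * Worked example (illustrative, not in the original source) of the
 * MPOL_F_RELATIVE_NODES remapping done by mpol_relative_nodemask() above,
 * assuming the cpuset currently allows nodes {4,5,6}:
 *
 *      user nodemask (*orig)       = { 0, 2 }
 *      allowed nodes (*rel)        = { 4, 5, 6 }        (weight 3)
 *      nodes_fold(tmp, *orig, 3)   -> tmp  = { 0, 2 }   (bits already < 3)
 *      nodes_onto(*ret, tmp, *rel) -> *ret = { 4, 6 }   (0th and 2nd set bits of *rel)
 *
 * A relative bit at or beyond the weight wraps around: relative node 3 with
 * three allowed nodes folds back to relative node 0, i.e. node 4 here.
 */
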
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        int ret;

        /*
         * Default (pol==NULL) and local memory policies are not subject
         * to any remapping. They also do not need any special
         * constructor.
         */
        if (!pol || pol->mode == MPOL_LOCAL)
                return 0;

        /* Check N_MEMORY */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
        else
                nodes_and(nsc->mask2, *nodes, nsc->mask1);

        if (mpol_store_user_nodemask(pol))
                pol->w.user_nodemask = *nodes;
        else
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

        ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
        return ret;
}

258/*
259 * This function just creates a new policy, does some checks and simple
260 * initialization. You must invoke mpol_set_nodemask() to set nodes.
261 */
028fec41
DR
262static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
263 nodemask_t *nodes)
1da177e4
LT
264{
265 struct mempolicy *policy;
266
3e1f0645
DR
267 if (mode == MPOL_DEFAULT) {
268 if (nodes && !nodes_empty(*nodes))
37012946 269 return ERR_PTR(-EINVAL);
d3a71033 270 return NULL;
37012946 271 }
3e1f0645
DR
272 VM_BUG_ON(!nodes);
273
274 /*
275 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
276 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
277 * All other modes require a valid pointer to a non-empty nodemask.
278 */
279 if (mode == MPOL_PREFERRED) {
280 if (nodes_empty(*nodes)) {
281 if (((flags & MPOL_F_STATIC_NODES) ||
282 (flags & MPOL_F_RELATIVE_NODES)))
283 return ERR_PTR(-EINVAL);
7858d7bc
FT
284
285 mode = MPOL_LOCAL;
3e1f0645 286 }
479e2802 287 } else if (mode == MPOL_LOCAL) {
8d303e44
PK
288 if (!nodes_empty(*nodes) ||
289 (flags & MPOL_F_STATIC_NODES) ||
290 (flags & MPOL_F_RELATIVE_NODES))
479e2802 291 return ERR_PTR(-EINVAL);
3e1f0645
DR
292 } else if (nodes_empty(*nodes))
293 return ERR_PTR(-EINVAL);
c36f6e6d 294
1da177e4
LT
295 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
296 if (!policy)
297 return ERR_PTR(-ENOMEM);
298 atomic_set(&policy->refcnt, 1);
45c4745a 299 policy->mode = mode;
3e1f0645 300 policy->flags = flags;
c6018b4b 301 policy->home_node = NUMA_NO_NODE;
37012946 302
1da177e4 303 return policy;
37012946
DR
304}
305
52cd3b07 306/* Slow path of a mpol destructor. */
c36f6e6d 307void __mpol_put(struct mempolicy *pol)
52cd3b07 308{
c36f6e6d 309 if (!atomic_dec_and_test(&pol->refcnt))
52cd3b07 310 return;
c36f6e6d 311 kmem_cache_free(policy_cache, pol);
52cd3b07
LS
312}
313
213980c0 314static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
315{
316}
317
213980c0 318static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
319{
320 nodemask_t tmp;
321
322 if (pol->flags & MPOL_F_STATIC_NODES)
323 nodes_and(tmp, pol->w.user_nodemask, *nodes);
324 else if (pol->flags & MPOL_F_RELATIVE_NODES)
325 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
326 else {
269fbe72 327 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
213980c0 328 *nodes);
29b190fa 329 pol->w.cpuset_mems_allowed = *nodes;
37012946 330 }
f5b087b5 331
708c1bbc
MX
332 if (nodes_empty(tmp))
333 tmp = *nodes;
334
269fbe72 335 pol->nodes = tmp;
37012946
DR
336}
337
338static void mpol_rebind_preferred(struct mempolicy *pol,
213980c0 339 const nodemask_t *nodes)
37012946 340{
7858d7bc 341 pol->w.cpuset_mems_allowed = *nodes;
1da177e4
LT
342}
343
708c1bbc
MX
344/*
345 * mpol_rebind_policy - Migrate a policy to a different set of nodes
346 *
c1e8d7c6 347 * Per-vma policies are protected by mmap_lock. Allocations using per-task
213980c0
VB
348 * policies are protected by task->mems_allowed_seq to prevent a premature
349 * OOM/allocation failure due to parallel nodemask modification.
708c1bbc 350 */
213980c0 351static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1d0d2680 352{
018160ad 353 if (!pol || pol->mode == MPOL_LOCAL)
1d0d2680 354 return;
7858d7bc 355 if (!mpol_store_user_nodemask(pol) &&
1d0d2680
DR
356 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
357 return;
708c1bbc 358
213980c0 359 mpol_ops[pol->mode].rebind(pol, newmask);
1d0d2680
DR
360}
361
362/*
363 * Wrapper for mpol_rebind_policy() that just requires task
364 * pointer, and updates task mempolicy.
58568d2a
MX
365 *
366 * Called with task's alloc_lock held.
1d0d2680 367 */
213980c0 368void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1d0d2680 369{
213980c0 370 mpol_rebind_policy(tsk->mempolicy, new);
1d0d2680
DR
371}
372
373/*
374 * Rebind each vma in mm to new nodemask.
375 *
c1e8d7c6 376 * Call holding a reference to mm. Takes mm->mmap_lock during call.
1d0d2680 377 */
1d0d2680
DR
378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379{
380 struct vm_area_struct *vma;
66850be5 381 VMA_ITERATOR(vmi, mm, 0);
1d0d2680 382
d8ed45c5 383 mmap_write_lock(mm);
6c21e066
JH
384 for_each_vma(vmi, vma) {
385 vma_start_write(vma);
213980c0 386 mpol_rebind_policy(vma->vm_policy, new);
6c21e066 387 }
d8ed45c5 388 mmap_write_unlock(mm);
1d0d2680
DR
389}
390
37012946
DR
391static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
392 [MPOL_DEFAULT] = {
393 .rebind = mpol_rebind_default,
394 },
395 [MPOL_INTERLEAVE] = {
be897d48 396 .create = mpol_new_nodemask,
37012946
DR
397 .rebind = mpol_rebind_nodemask,
398 },
399 [MPOL_PREFERRED] = {
400 .create = mpol_new_preferred,
401 .rebind = mpol_rebind_preferred,
402 },
403 [MPOL_BIND] = {
be897d48 404 .create = mpol_new_nodemask,
37012946
DR
405 .rebind = mpol_rebind_nodemask,
406 },
7858d7bc
FT
407 [MPOL_LOCAL] = {
408 .rebind = mpol_rebind_default,
409 },
b27abacc 410 [MPOL_PREFERRED_MANY] = {
be897d48 411 .create = mpol_new_nodemask,
b27abacc
DH
412 .rebind = mpol_rebind_preferred,
413 },
37012946
DR
414};
415
1cb5d11a 416static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
fc301289 417 unsigned long flags);
72e315f7
HD
418static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
419 pgoff_t ilx, int *nid);
1a75a6c8 420
1cb5d11a
HD
421static bool strictly_unmovable(unsigned long flags)
422{
423 /*
424 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
425 * if any misplaced page is found.
426 */
427 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
428 MPOL_MF_STRICT;
429}
430
6f4576e3
NH
431struct queue_pages {
432 struct list_head *pagelist;
433 unsigned long flags;
434 nodemask_t *nmask;
f18da660
LX
435 unsigned long start;
436 unsigned long end;
437 struct vm_area_struct *first;
1cb5d11a
HD
438 struct folio *large; /* note last large folio encountered */
439 long nr_failed; /* could not be isolated at this time */
6f4576e3
NH
440};
441
88aaa2a1 442/*
d451b89d 443 * Check if the folio's nid is in qp->nmask.
88aaa2a1
NH
444 *
445 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
446 * in the invert of qp->nmask.
447 */
d451b89d 448static inline bool queue_folio_required(struct folio *folio,
88aaa2a1
NH
449 struct queue_pages *qp)
450{
d451b89d 451 int nid = folio_nid(folio);
88aaa2a1
NH
452 unsigned long flags = qp->flags;
453
454 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
455}
456
1cb5d11a 457static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
c8633798 458{
de1f5055 459 struct folio *folio;
c8633798 460 struct queue_pages *qp = walk->private;
c8633798
NH
461
462 if (unlikely(is_pmd_migration_entry(*pmd))) {
1cb5d11a
HD
463 qp->nr_failed++;
464 return;
c8633798 465 }
de1f5055
VMO
466 folio = pfn_folio(pmd_pfn(*pmd));
467 if (is_huge_zero_page(&folio->page)) {
e5947d23 468 walk->action = ACTION_CONTINUE;
1cb5d11a 469 return;
c8633798 470 }
d451b89d 471 if (!queue_folio_required(folio, qp))
1cb5d11a
HD
472 return;
473 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
474 !vma_migratable(walk->vma) ||
475 !migrate_folio_add(folio, qp->pagelist, qp->flags))
476 qp->nr_failed++;
c8633798
NH
477}
478
98094945 479/*
1cb5d11a
HD
480 * Scan through folios, checking if they satisfy the required conditions,
481 * moving them from LRU to local pagelist for migration if they do (or not).
d8835445 482 *
1cb5d11a
HD
483 * queue_folios_pte_range() has two possible return values:
484 * 0 - continue walking to scan for more, even if an existing folio on the
485 * wrong node could not be isolated and queued for migration.
486 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
487 * and an existing folio was on a node that does not follow the policy.
98094945 488 */
3dae02bb 489static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
6f4576e3 490 unsigned long end, struct mm_walk *walk)
1da177e4 491{
6f4576e3 492 struct vm_area_struct *vma = walk->vma;
3dae02bb 493 struct folio *folio;
6f4576e3
NH
494 struct queue_pages *qp = walk->private;
495 unsigned long flags = qp->flags;
3f088420 496 pte_t *pte, *mapped_pte;
c33c7948 497 pte_t ptent;
705e87c0 498 spinlock_t *ptl;
941150a3 499
c8633798 500 ptl = pmd_trans_huge_lock(pmd, vma);
1cb5d11a
HD
501 if (ptl) {
502 queue_folios_pmd(pmd, walk);
503 spin_unlock(ptl);
504 goto out;
505 }
91612e0d 506
3f088420 507 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
7780d040
HD
508 if (!pte) {
509 walk->action = ACTION_AGAIN;
510 return 0;
511 }
6f4576e3 512 for (; addr != end; pte++, addr += PAGE_SIZE) {
c33c7948 513 ptent = ptep_get(pte);
1cb5d11a 514 if (pte_none(ptent))
1da177e4 515 continue;
1cb5d11a
HD
516 if (!pte_present(ptent)) {
517 if (is_migration_entry(pte_to_swp_entry(ptent)))
518 qp->nr_failed++;
519 continue;
520 }
c33c7948 521 folio = vm_normal_folio(vma, addr, ptent);
3dae02bb 522 if (!folio || folio_is_zone_device(folio))
1da177e4 523 continue;
053837fc 524 /*
3dae02bb
VMO
525 * vm_normal_folio() filters out zero pages, but there might
526 * still be reserved folios to skip, perhaps in a VDSO.
053837fc 527 */
3dae02bb 528 if (folio_test_reserved(folio))
f4598c8b 529 continue;
d451b89d 530 if (!queue_folio_required(folio, qp))
38e35860 531 continue;
1cb5d11a 532 if (folio_test_large(folio)) {
a53190a4 533 /*
1cb5d11a
HD
534 * A large folio can only be isolated from LRU once,
535 * but may be mapped by many PTEs (and Copy-On-Write may
536 * intersperse PTEs of other, order 0, folios). This is
537 * a common case, so don't mistake it for failure (but
538 * there can be other cases of multi-mapped pages which
539 * this quick check does not help to filter out - and a
540 * search of the pagelist might grow to be prohibitive).
541 *
542 * migrate_pages(&pagelist) returns nr_failed folios, so
543 * check "large" now so that queue_pages_range() returns
544 * a comparable nr_failed folios. This does imply that
545 * if folio could not be isolated for some racy reason
546 * at its first PTE, later PTEs will not give it another
547 * chance of isolation; but keeps the accounting simple.
a53190a4 548 */
1cb5d11a
HD
549 if (folio == qp->large)
550 continue;
551 qp->large = folio;
552 }
553 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
554 !vma_migratable(vma) ||
555 !migrate_folio_add(folio, qp->pagelist, flags)) {
556 qp->nr_failed++;
557 if (strictly_unmovable(flags))
558 break;
559 }
6f4576e3 560 }
3f088420 561 pte_unmap_unlock(mapped_pte, ptl);
6f4576e3 562 cond_resched();
1cb5d11a
HD
563out:
564 if (qp->nr_failed && strictly_unmovable(flags))
565 return -EIO;
566 return 0;
91612e0d
HD
567}
568
0a2c1e81 569static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
6f4576e3
NH
570 unsigned long addr, unsigned long end,
571 struct mm_walk *walk)
e2d8cf40
NH
572{
573#ifdef CONFIG_HUGETLB_PAGE
6f4576e3 574 struct queue_pages *qp = walk->private;
1cb5d11a 575 unsigned long flags = qp->flags;
0a2c1e81 576 struct folio *folio;
cb900f41 577 spinlock_t *ptl;
d4c54919 578 pte_t entry;
e2d8cf40 579
6f4576e3
NH
580 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
581 entry = huge_ptep_get(pte);
1cb5d11a
HD
582 if (!pte_present(entry)) {
583 if (unlikely(is_hugetlb_entry_migration(entry)))
584 qp->nr_failed++;
d4c54919 585 goto unlock;
1cb5d11a 586 }
0a2c1e81 587 folio = pfn_folio(pte_pfn(entry));
d451b89d 588 if (!queue_folio_required(folio, qp))
e2d8cf40 589 goto unlock;
1cb5d11a
HD
590 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
591 !vma_migratable(walk->vma)) {
592 qp->nr_failed++;
dcf17635
LX
593 goto unlock;
594 }
0a2c1e81 595 /*
1cb5d11a
HD
596 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
597 * Choosing not to migrate a shared folio is not counted as a failure.
0a2c1e81
VMO
598 *
599 * To check if the folio is shared, ideally we want to make sure
600 * every page is mapped to the same process. Doing that is very
1cb5d11a 601 * expensive, so check the estimated sharers of the folio instead.
0a2c1e81 602 */
1cb5d11a
HD
603 if ((flags & MPOL_MF_MOVE_ALL) ||
604 (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
605 if (!isolate_hugetlb(folio, qp->pagelist))
606 qp->nr_failed++;
e2d8cf40 607unlock:
cb900f41 608 spin_unlock(ptl);
1cb5d11a
HD
609 if (qp->nr_failed && strictly_unmovable(flags))
610 return -EIO;
e2d8cf40 611#endif
1cb5d11a 612 return 0;
1da177e4
LT
613}
614
5877231f 615#ifdef CONFIG_NUMA_BALANCING
b24f53a0 616/*
4b10e7d5
MG
617 * This is used to mark a range of virtual addresses to be inaccessible.
618 * These are later cleared by a NUMA hinting fault. Depending on these
619 * faults, pages may be migrated for better NUMA placement.
620 *
621 * This is assuming that NUMA faults are handled using PROT_NONE. If
622 * an architecture makes a different choice, it will need further
623 * changes to the core.
b24f53a0 624 */
4b10e7d5
MG
625unsigned long change_prot_numa(struct vm_area_struct *vma,
626 unsigned long addr, unsigned long end)
b24f53a0 627{
4a18419f 628 struct mmu_gather tlb;
a79390f5 629 long nr_updated;
b24f53a0 630
4a18419f
NA
631 tlb_gather_mmu(&tlb, vma->vm_mm);
632
1ef488ed 633 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
d1751118 634 if (nr_updated > 0)
03c5a6e1 635 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
b24f53a0 636
4a18419f
NA
637 tlb_finish_mmu(&tlb);
638
4b10e7d5 639 return nr_updated;
b24f53a0 640}
5877231f 641#endif /* CONFIG_NUMA_BALANCING */
b24f53a0 642
6f4576e3
NH
643static int queue_pages_test_walk(unsigned long start, unsigned long end,
644 struct mm_walk *walk)
645{
66850be5 646 struct vm_area_struct *next, *vma = walk->vma;
6f4576e3
NH
647 struct queue_pages *qp = walk->private;
648 unsigned long endvma = vma->vm_end;
649 unsigned long flags = qp->flags;
650
a18b3ac2 651 /* range check first */
ce33135c 652 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
f18da660
LX
653
654 if (!qp->first) {
655 qp->first = vma;
656 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
657 (qp->start < vma->vm_start))
658 /* hole at head side of range */
a18b3ac2
LX
659 return -EFAULT;
660 }
66850be5 661 next = find_vma(vma->vm_mm, vma->vm_end);
f18da660
LX
662 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
663 ((vma->vm_end < qp->end) &&
66850be5 664 (!next || vma->vm_end < next->vm_start)))
f18da660
LX
665 /* hole at middle or tail of range */
666 return -EFAULT;
a18b3ac2 667
a7f40cfe
YS
668 /*
669 * Need to check MPOL_MF_STRICT to return -EIO if possible,
670 * regardless of vma_migratable
671 */
672 if (!vma_migratable(vma) &&
673 !(flags & MPOL_MF_STRICT))
48684a65
NH
674 return 1;
675
6f4576e3
NH
676 if (endvma > end)
677 endvma = end;
6f4576e3 678
1cb5d11a
HD
679 /*
680 * Check page nodes, and queue pages to move, in the current vma.
681 * But if no moving, and no strict checking, the scan can be skipped.
682 */
683 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
6f4576e3
NH
684 return 0;
685 return 1;
686}
687
7b86ac33 688static const struct mm_walk_ops queue_pages_walk_ops = {
0a2c1e81 689 .hugetlb_entry = queue_folios_hugetlb,
3dae02bb 690 .pmd_entry = queue_folios_pte_range,
7b86ac33 691 .test_walk = queue_pages_test_walk,
49b06385
SB
692 .walk_lock = PGWALK_RDLOCK,
693};
694
695static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
696 .hugetlb_entry = queue_folios_hugetlb,
697 .pmd_entry = queue_folios_pte_range,
698 .test_walk = queue_pages_test_walk,
699 .walk_lock = PGWALK_WRLOCK,
7b86ac33
CH
700};
701
dc9aa5b9 702/*
98094945
NH
703 * Walk through page tables and collect pages to be migrated.
704 *
1cb5d11a
HD
705 * If pages found in a given range are not on the required set of @nodes,
706 * and migration is allowed, they are isolated and queued to @pagelist.
d8835445 707 *
1cb5d11a
HD
708 * queue_pages_range() may return:
709 * 0 - all pages already on the right node, or successfully queued for moving
710 * (or neither strict checking nor moving requested: only range checking).
711 * >0 - this number of misplaced folios could not be queued for moving
712 * (a hugetlbfs page or a transparent huge page being counted as 1).
713 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
714 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
dc9aa5b9 715 */
1cb5d11a 716static long
98094945 717queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
6f4576e3 718 nodemask_t *nodes, unsigned long flags,
1cb5d11a 719 struct list_head *pagelist)
1da177e4 720{
f18da660 721 int err;
6f4576e3
NH
722 struct queue_pages qp = {
723 .pagelist = pagelist,
724 .flags = flags,
725 .nmask = nodes,
f18da660
LX
726 .start = start,
727 .end = end,
728 .first = NULL,
6f4576e3 729 };
1cb5d11a 730 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
49b06385 731 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
6f4576e3 732
49b06385 733 err = walk_page_range(mm, start, end, ops, &qp);
f18da660
LX
734
735 if (!qp.first)
736 /* whole range in hole */
737 err = -EFAULT;
738
1cb5d11a 739 return err ? : qp.nr_failed;
1da177e4
LT
740}
741
869833f2
KM
742/*
743 * Apply policy to a single VMA
c1e8d7c6 744 * This must be called with the mmap_lock held for writing.
869833f2
KM
745 */
746static int vma_replace_policy(struct vm_area_struct *vma,
c36f6e6d 747 struct mempolicy *pol)
8d34694c 748{
869833f2
KM
749 int err;
750 struct mempolicy *old;
751 struct mempolicy *new;
8d34694c 752
6c21e066
JH
753 vma_assert_write_locked(vma);
754
869833f2
KM
755 new = mpol_dup(pol);
756 if (IS_ERR(new))
757 return PTR_ERR(new);
758
759 if (vma->vm_ops && vma->vm_ops->set_policy) {
8d34694c 760 err = vma->vm_ops->set_policy(vma, new);
869833f2
KM
761 if (err)
762 goto err_out;
8d34694c 763 }
869833f2
KM
764
765 old = vma->vm_policy;
c1e8d7c6 766 vma->vm_policy = new; /* protected by mmap_lock */
869833f2
KM
767 mpol_put(old);
768
769 return 0;
770 err_out:
771 mpol_put(new);
8d34694c
KM
772 return err;
773}
774
f4e9e0e6
LH
775/* Split or merge the VMA (if required) and apply the new policy */
776static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
777 struct vm_area_struct **prev, unsigned long start,
778 unsigned long end, struct mempolicy *new_pol)
1da177e4 779{
f4e9e0e6 780 unsigned long vmstart, vmend;
9d8cebd4 781
f4e9e0e6
LH
782 vmend = min(end, vma->vm_end);
783 if (start > vma->vm_start) {
784 *prev = vma;
785 vmstart = start;
786 } else {
787 vmstart = vma->vm_start;
788 }
789
c36f6e6d 790 if (mpol_equal(vma->vm_policy, new_pol)) {
00ca0f2e 791 *prev = vma;
7329e3eb 792 return 0;
00ca0f2e 793 }
7329e3eb 794
94d7d923
LS
795 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
796 if (IS_ERR(vma))
797 return PTR_ERR(vma);
f4e9e0e6
LH
798
799 *prev = vma;
800 return vma_replace_policy(vma, new_pol);
1da177e4
LT
801}
802
1da177e4 803/* Set the process memory policy */
028fec41
DR
804static long do_set_mempolicy(unsigned short mode, unsigned short flags,
805 nodemask_t *nodes)
1da177e4 806{
58568d2a 807 struct mempolicy *new, *old;
4bfc4495 808 NODEMASK_SCRATCH(scratch);
58568d2a 809 int ret;
1da177e4 810
4bfc4495
KH
811 if (!scratch)
812 return -ENOMEM;
f4e53d91 813
4bfc4495
KH
814 new = mpol_new(mode, flags, nodes);
815 if (IS_ERR(new)) {
816 ret = PTR_ERR(new);
817 goto out;
818 }
2c7c3a7d 819
12c1dc8e 820 task_lock(current);
4bfc4495 821 ret = mpol_set_nodemask(new, nodes, scratch);
58568d2a 822 if (ret) {
12c1dc8e 823 task_unlock(current);
58568d2a 824 mpol_put(new);
4bfc4495 825 goto out;
58568d2a 826 }
12c1dc8e 827
58568d2a 828 old = current->mempolicy;
1da177e4 829 current->mempolicy = new;
45816682
VB
830 if (new && new->mode == MPOL_INTERLEAVE)
831 current->il_prev = MAX_NUMNODES-1;
58568d2a 832 task_unlock(current);
58568d2a 833 mpol_put(old);
4bfc4495
KH
834 ret = 0;
835out:
836 NODEMASK_SCRATCH_FREE(scratch);
837 return ret;
1da177e4
LT
838}
839
bea904d5
LS
840/*
841 * Return nodemask for policy for get_mempolicy() query
58568d2a
MX
842 *
843 * Called with task's alloc_lock held
bea904d5 844 */
c36f6e6d 845static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1da177e4 846{
dfcd3c0d 847 nodes_clear(*nodes);
c36f6e6d 848 if (pol == &default_policy)
bea904d5
LS
849 return;
850
c36f6e6d 851 switch (pol->mode) {
19770b32 852 case MPOL_BIND:
1da177e4 853 case MPOL_INTERLEAVE:
269fbe72 854 case MPOL_PREFERRED:
b27abacc 855 case MPOL_PREFERRED_MANY:
c36f6e6d 856 *nodes = pol->nodes;
1da177e4 857 break;
7858d7bc
FT
858 case MPOL_LOCAL:
859 /* return empty node mask for local allocation */
860 break;
1da177e4
LT
861 default:
862 BUG();
863 }
864}
865
3b9aadf7 866static int lookup_node(struct mm_struct *mm, unsigned long addr)
1da177e4 867{
ba841078 868 struct page *p = NULL;
f728b9c4 869 int ret;
1da177e4 870
f728b9c4
JH
871 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
872 if (ret > 0) {
873 ret = page_to_nid(p);
1da177e4
LT
874 put_page(p);
875 }
f728b9c4 876 return ret;
1da177e4
LT
877}
878
1da177e4 879/* Retrieve NUMA policy */
dbcb0f19
AB
880static long do_get_mempolicy(int *policy, nodemask_t *nmask,
881 unsigned long addr, unsigned long flags)
1da177e4 882{
8bccd85f 883 int err;
1da177e4
LT
884 struct mm_struct *mm = current->mm;
885 struct vm_area_struct *vma = NULL;
3b9aadf7 886 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1da177e4 887
754af6f5
LS
888 if (flags &
889 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 890 return -EINVAL;
754af6f5
LS
891
892 if (flags & MPOL_F_MEMS_ALLOWED) {
893 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
894 return -EINVAL;
895 *policy = 0; /* just so it's initialized */
58568d2a 896 task_lock(current);
754af6f5 897 *nmask = cpuset_current_mems_allowed;
58568d2a 898 task_unlock(current);
754af6f5
LS
899 return 0;
900 }
901
1da177e4 902 if (flags & MPOL_F_ADDR) {
ddc1a5cb 903 pgoff_t ilx; /* ignored here */
bea904d5
LS
904 /*
905 * Do NOT fall back to task policy if the
906 * vma/shared policy at addr is NULL. We
907 * want to return MPOL_DEFAULT in this case.
908 */
d8ed45c5 909 mmap_read_lock(mm);
33e3575c 910 vma = vma_lookup(mm, addr);
1da177e4 911 if (!vma) {
d8ed45c5 912 mmap_read_unlock(mm);
1da177e4
LT
913 return -EFAULT;
914 }
ddc1a5cb 915 pol = __get_vma_policy(vma, addr, &ilx);
1da177e4
LT
916 } else if (addr)
917 return -EINVAL;
918
919 if (!pol)
bea904d5 920 pol = &default_policy; /* indicates default behavior */
1da177e4
LT
921
922 if (flags & MPOL_F_NODE) {
923 if (flags & MPOL_F_ADDR) {
3b9aadf7 924 /*
f728b9c4
JH
925 * Take a refcount on the mpol, because we are about to
926 * drop the mmap_lock, after which only "pol" remains
927 * valid, "vma" is stale.
3b9aadf7
AA
928 */
929 pol_refcount = pol;
930 vma = NULL;
931 mpol_get(pol);
f728b9c4 932 mmap_read_unlock(mm);
3b9aadf7 933 err = lookup_node(mm, addr);
1da177e4
LT
934 if (err < 0)
935 goto out;
8bccd85f 936 *policy = err;
1da177e4 937 } else if (pol == current->mempolicy &&
45c4745a 938 pol->mode == MPOL_INTERLEAVE) {
269fbe72 939 *policy = next_node_in(current->il_prev, pol->nodes);
1da177e4
LT
940 } else {
941 err = -EINVAL;
942 goto out;
943 }
bea904d5
LS
944 } else {
945 *policy = pol == &default_policy ? MPOL_DEFAULT :
946 pol->mode;
d79df630
DR
947 /*
948 * Internal mempolicy flags must be masked off before exposing
949 * the policy to userspace.
950 */
951 *policy |= (pol->flags & MPOL_MODE_FLAGS);
bea904d5 952 }
1da177e4 953
1da177e4 954 err = 0;
58568d2a 955 if (nmask) {
c6b6ef8b
LS
956 if (mpol_store_user_nodemask(pol)) {
957 *nmask = pol->w.user_nodemask;
958 } else {
959 task_lock(current);
960 get_policy_nodemask(pol, nmask);
961 task_unlock(current);
962 }
58568d2a 963 }
1da177e4
LT
964
965 out:
52cd3b07 966 mpol_cond_put(pol);
1da177e4 967 if (vma)
d8ed45c5 968 mmap_read_unlock(mm);
3b9aadf7
AA
969 if (pol_refcount)
970 mpol_put(pol_refcount);
1da177e4
LT
971 return err;
972}
973
b20a3503 974#ifdef CONFIG_MIGRATION
1cb5d11a 975static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
fc301289 976 unsigned long flags)
6ce3c4c0
CL
977{
978 /*
1cb5d11a
HD
979 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
980 * Choosing not to migrate a shared folio is not counted as a failure.
4a64981d
VMO
981 *
982 * To check if the folio is shared, ideally we want to make sure
983 * every page is mapped to the same process. Doing that is very
1cb5d11a 984 * expensive, so check the estimated sharers of the folio instead.
6ce3c4c0 985 */
4a64981d 986 if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
be2d5756 987 if (folio_isolate_lru(folio)) {
4a64981d
VMO
988 list_add_tail(&folio->lru, foliolist);
989 node_stat_mod_folio(folio,
990 NR_ISOLATED_ANON + folio_is_file_lru(folio),
991 folio_nr_pages(folio));
1cb5d11a 992 } else {
a53190a4 993 /*
4a64981d
VMO
994 * Non-movable folio may reach here. And, there may be
995 * temporary off LRU folios or non-LRU movable folios.
996 * Treat them as unmovable folios since they can't be
1cb5d11a 997 * isolated, so they can't be moved at the moment.
a53190a4 998 */
1cb5d11a 999 return false;
62695a84
NP
1000 }
1001 }
1cb5d11a 1002 return true;
7e2ab150 1003}
6ce3c4c0 1004
7e2ab150
CL
1005/*
1006 * Migrate pages from one node to a target node.
1007 * Returns error or the number of pages not migrated.
1008 */
1cb5d11a
HD
1009static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1010 int flags)
7e2ab150
CL
1011{
1012 nodemask_t nmask;
66850be5 1013 struct vm_area_struct *vma;
7e2ab150 1014 LIST_HEAD(pagelist);
1cb5d11a
HD
1015 long nr_failed;
1016 long err = 0;
a0976311
JK
1017 struct migration_target_control mtc = {
1018 .nid = dest,
1019 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1020 };
7e2ab150
CL
1021
1022 nodes_clear(nmask);
1023 node_set(source, nmask);
6ce3c4c0 1024
1cb5d11a 1025 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
72e315f7
HD
1026
1027 mmap_read_lock(mm);
1cb5d11a
HD
1028 vma = find_vma(mm, 0);
1029
08270807 1030 /*
1cb5d11a 1031 * This does not migrate the range, but isolates all pages that
08270807 1032 * need migration. Between passing in the full user address
1cb5d11a
HD
1033 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1034 * but passes back the count of pages which could not be isolated.
08270807 1035 */
1cb5d11a
HD
1036 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1037 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
72e315f7 1038 mmap_read_unlock(mm);
7e2ab150 1039
cf608ac1 1040 if (!list_empty(&pagelist)) {
a0976311 1041 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1cb5d11a 1042 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
cf608ac1 1043 if (err)
e2d8cf40 1044 putback_movable_pages(&pagelist);
cf608ac1 1045 }
95a402c3 1046
1cb5d11a
HD
1047 if (err >= 0)
1048 err += nr_failed;
7e2ab150 1049 return err;
6ce3c4c0
CL
1050}
1051
39743889 1052/*
7e2ab150
CL
1053 * Move pages between the two nodesets so as to preserve the physical
1054 * layout as much as possible.
39743889
CL
1055 *
1056 * Returns the number of pages that could not be moved.
1057 */
0ce72d4f
AM
1058int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1059 const nodemask_t *to, int flags)
39743889 1060{
1cb5d11a
HD
1061 long nr_failed = 0;
1062 long err = 0;
7e2ab150 1063 nodemask_t tmp;
39743889 1064
361a2a22 1065 lru_cache_disable();
0aedadf9 1066
da0aa138
KM
1067 /*
1068 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1069 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1070 * bit in 'tmp', and return that <source, dest> pair for migration.
1071 * The pair of nodemasks 'to' and 'from' define the map.
1072 *
1073 * If no pair of bits is found that way, fallback to picking some
1074 * pair of 'source' and 'dest' bits that are not the same. If the
1075 * 'source' and 'dest' bits are the same, this represents a node
1076 * that will be migrating to itself, so no pages need move.
1077 *
1078 * If no bits are left in 'tmp', or if all remaining bits left
1079 * in 'tmp' correspond to the same bit in 'to', return false
1080 * (nothing left to migrate).
1081 *
1082 * This lets us pick a pair of nodes to migrate between, such that
1083 * if possible the dest node is not already occupied by some other
1084 * source node, minimizing the risk of overloading the memory on a
1085 * node that would happen if we migrated incoming memory to a node
1086 * before migrating outgoing memory source that same node.
1087 *
1088 * A single scan of tmp is sufficient. As we go, we remember the
1089 * most recent <s, d> pair that moved (s != d). If we find a pair
1090 * that not only moved, but what's better, moved to an empty slot
1091 * (d is not set in tmp), then we break out then, with that pair.
ae0e47f0 1092 * Otherwise when we finish scanning tmp, we at least have the
da0aa138
KM
1093 * most recent <s, d> pair that moved. If we get all the way through
1094 * the scan of tmp without finding any node that moved, much less
1095 * moved to an empty node, then there is nothing left worth migrating.
1096 */
d4984711 1097
0ce72d4f 1098 tmp = *from;
7e2ab150 1099 while (!nodes_empty(tmp)) {
68d68ff6 1100 int s, d;
b76ac7e7 1101 int source = NUMA_NO_NODE;
7e2ab150
CL
1102 int dest = 0;
1103
1104 for_each_node_mask(s, tmp) {
4a5b18cc
LW
1105
1106 /*
1107 * do_migrate_pages() tries to maintain the relative
1108 * node relationship of the pages established between
1109 * threads and memory areas.
1110 *
1111 * However if the number of source nodes is not equal to
1112 * the number of destination nodes we can not preserve
1113 * this node relative relationship. In that case, skip
1114 * copying memory from a node that is in the destination
1115 * mask.
1116 *
1117 * Example: [2,3,4] -> [3,4,5] moves everything.
1118 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1119 */
1120
0ce72d4f
AM
1121 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1122 (node_isset(s, *to)))
4a5b18cc
LW
1123 continue;
1124
0ce72d4f 1125 d = node_remap(s, *from, *to);
7e2ab150
CL
1126 if (s == d)
1127 continue;
1128
1129 source = s; /* Node moved. Memorize */
1130 dest = d;
1131
1132 /* dest not in remaining from nodes? */
1133 if (!node_isset(dest, tmp))
1134 break;
1135 }
b76ac7e7 1136 if (source == NUMA_NO_NODE)
7e2ab150
CL
1137 break;
1138
1139 node_clear(source, tmp);
1140 err = migrate_to_node(mm, source, dest, flags);
1141 if (err > 0)
1cb5d11a 1142 nr_failed += err;
7e2ab150
CL
1143 if (err < 0)
1144 break;
39743889 1145 }
d479960e 1146
361a2a22 1147 lru_cache_enable();
7e2ab150
CL
1148 if (err < 0)
1149 return err;
1cb5d11a 1150 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
b20a3503
CL
1151}
1152
3ad33b24 1153/*
72e315f7 1154 * Allocate a new folio for page migration, according to NUMA mempolicy.
3ad33b24 1155 */
72e315f7
HD
1156static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1157 unsigned long private)
95a402c3 1158{
72e315f7
HD
1159 struct mempolicy *pol = (struct mempolicy *)private;
1160 pgoff_t ilx = 0; /* improve on this later */
1161 struct page *page;
1162 unsigned int order;
1163 int nid = numa_node_id();
1164 gfp_t gfp;
11c731e8 1165
72e315f7
HD
1166 order = folio_order(src);
1167 ilx += src->index >> order;
ddc1a5cb 1168
d0ce0e47 1169 if (folio_test_hugetlb(src)) {
72e315f7
HD
1170 nodemask_t *nodemask;
1171 struct hstate *h;
1172
1173 h = folio_hstate(src);
1174 gfp = htlb_alloc_mask(h);
1175 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
1176 return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
d0ce0e47 1177 }
ec4858e0
MWO
1178
1179 if (folio_test_large(src))
1180 gfp = GFP_TRANSHUGE;
72e315f7
HD
1181 else
1182 gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
ec4858e0 1183
72e315f7
HD
1184 page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
1185 return page_rmappable_folio(page);
95a402c3 1186}
b20a3503
CL
1187#else
1188
1cb5d11a 1189static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
b20a3503
CL
1190 unsigned long flags)
1191{
1cb5d11a 1192 return false;
39743889
CL
1193}
1194
0ce72d4f
AM
1195int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1196 const nodemask_t *to, int flags)
b20a3503
CL
1197{
1198 return -ENOSYS;
1199}
95a402c3 1200
72e315f7
HD
1201static struct folio *alloc_migration_target_by_mpol(struct folio *src,
1202 unsigned long private)
95a402c3
CL
1203{
1204 return NULL;
1205}
b20a3503
CL
1206#endif
1207
dbcb0f19 1208static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
1209 unsigned short mode, unsigned short mode_flags,
1210 nodemask_t *nmask, unsigned long flags)
6ce3c4c0 1211{
6ce3c4c0 1212 struct mm_struct *mm = current->mm;
f4e9e0e6
LH
1213 struct vm_area_struct *vma, *prev;
1214 struct vma_iterator vmi;
6ce3c4c0
CL
1215 struct mempolicy *new;
1216 unsigned long end;
1cb5d11a
HD
1217 long err;
1218 long nr_failed;
6ce3c4c0
CL
1219 LIST_HEAD(pagelist);
1220
b24f53a0 1221 if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0 1222 return -EINVAL;
74c00241 1223 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
1224 return -EPERM;
1225
1226 if (start & ~PAGE_MASK)
1227 return -EINVAL;
1228
1229 if (mode == MPOL_DEFAULT)
1230 flags &= ~MPOL_MF_STRICT;
1231
aaa31e05 1232 len = PAGE_ALIGN(len);
6ce3c4c0
CL
1233 end = start + len;
1234
1235 if (end < start)
1236 return -EINVAL;
1237 if (end == start)
1238 return 0;
1239
028fec41 1240 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
1241 if (IS_ERR(new))
1242 return PTR_ERR(new);
1243
1244 /*
1245 * If we are using the default policy then operation
1246 * on discontinuous address spaces is okay after all
1247 */
1248 if (!new)
1249 flags |= MPOL_MF_DISCONTIG_OK;
1250
1cb5d11a 1251 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
361a2a22 1252 lru_cache_disable();
4bfc4495
KH
1253 {
1254 NODEMASK_SCRATCH(scratch);
1255 if (scratch) {
d8ed45c5 1256 mmap_write_lock(mm);
4bfc4495 1257 err = mpol_set_nodemask(new, nmask, scratch);
4bfc4495 1258 if (err)
d8ed45c5 1259 mmap_write_unlock(mm);
4bfc4495
KH
1260 } else
1261 err = -ENOMEM;
1262 NODEMASK_SCRATCH_FREE(scratch);
1263 }
b05ca738
KM
1264 if (err)
1265 goto mpol_out;
1266
6c21e066 1267 /*
1cb5d11a
HD
1268 * Lock the VMAs before scanning for pages to migrate,
1269 * to ensure we don't miss a concurrently inserted page.
6c21e066 1270 */
1cb5d11a
HD
1271 nr_failed = queue_pages_range(mm, start, end, nmask,
1272 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
d8835445 1273
1cb5d11a
HD
1274 if (nr_failed < 0) {
1275 err = nr_failed;
72e315f7 1276 nr_failed = 0;
1cb5d11a
HD
1277 } else {
1278 vma_iter_init(&vmi, mm, start);
1279 prev = vma_prev(&vmi);
1280 for_each_vma_range(vmi, vma, end) {
1281 err = mbind_range(&vmi, vma, &prev, start, end, new);
1282 if (err)
1283 break;
1284 }
f4e9e0e6 1285 }
7e2ab150 1286
72e315f7
HD
1287 mmap_write_unlock(mm);
1288
1289 if (!err && !list_empty(&pagelist)) {
1290 /* Convert MPOL_DEFAULT's NULL to task or default policy */
1291 if (!new) {
1292 new = get_task_policy(current);
1293 mpol_get(new);
cf608ac1 1294 }
72e315f7
HD
1295 nr_failed |= migrate_pages(&pagelist,
1296 alloc_migration_target_by_mpol, NULL,
1297 (unsigned long)new, MIGRATE_SYNC,
1298 MR_MEMPOLICY_MBIND, NULL);
a85dfc30
YS
1299 }
1300
72e315f7
HD
1301 if (nr_failed && (flags & MPOL_MF_STRICT))
1302 err = -EIO;
1cb5d11a
HD
1303 if (!list_empty(&pagelist))
1304 putback_movable_pages(&pagelist);
d8835445 1305mpol_out:
f0be3d32 1306 mpol_put(new);
d479960e 1307 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
361a2a22 1308 lru_cache_enable();
6ce3c4c0
CL
1309 return err;
1310}
1311
8bccd85f
CL
1312/*
1313 * User space interface with variable sized bitmaps for nodelists.
1314 */
e130242d
AB
1315static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1316 unsigned long maxnode)
1317{
1318 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1319 int ret;
1320
1321 if (in_compat_syscall())
1322 ret = compat_get_bitmap(mask,
1323 (const compat_ulong_t __user *)nmask,
1324 maxnode);
1325 else
1326 ret = copy_from_user(mask, nmask,
1327 nlongs * sizeof(unsigned long));
1328
1329 if (ret)
1330 return -EFAULT;
1331
1332 if (maxnode % BITS_PER_LONG)
1333 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1334
1335 return 0;
1336}
8bccd85f
CL
1337
1338/* Copy a node mask from user space. */
39743889 1339static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
8bccd85f
CL
1340 unsigned long maxnode)
1341{
8bccd85f
CL
1342 --maxnode;
1343 nodes_clear(*nodes);
1344 if (maxnode == 0 || !nmask)
1345 return 0;
a9c930ba 1346 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 1347 return -EINVAL;
8bccd85f 1348
56521e7a
YX
1349 /*
1350 * When the user specified more nodes than supported just check
e130242d
AB
1351 * if the non supported part is all zero, one word at a time,
1352 * starting at the end.
56521e7a 1353 */
e130242d
AB
1354 while (maxnode > MAX_NUMNODES) {
1355 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1356 unsigned long t;
8bccd85f 1357
000eca5d 1358 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
56521e7a 1359 return -EFAULT;
e130242d
AB
1360
1361 if (maxnode - bits >= MAX_NUMNODES) {
1362 maxnode -= bits;
1363 } else {
1364 maxnode = MAX_NUMNODES;
1365 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1366 }
1367 if (t)
56521e7a
YX
1368 return -EINVAL;
1369 }
1370
e130242d 1371 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
8bccd85f
CL
1372}
1373
1374/* Copy a kernel node mask to user space */
1375static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1376 nodemask_t *nodes)
1377{
1378 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
050c17f2 1379 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
e130242d
AB
1380 bool compat = in_compat_syscall();
1381
1382 if (compat)
1383 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
8bccd85f
CL
1384
1385 if (copy > nbytes) {
1386 if (copy > PAGE_SIZE)
1387 return -EINVAL;
1388 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1389 return -EFAULT;
1390 copy = nbytes;
e130242d 1391 maxnode = nr_node_ids;
8bccd85f 1392 }
e130242d
AB
1393
1394 if (compat)
1395 return compat_put_bitmap((compat_ulong_t __user *)mask,
1396 nodes_addr(*nodes), maxnode);
1397
8bccd85f
CL
1398 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1399}
1400
95837924
FT
1401/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1402static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1403{
1404 *flags = *mode & MPOL_MODE_FLAGS;
1405 *mode &= ~MPOL_MODE_FLAGS;
b27abacc 1406
a38a59fd 1407 if ((unsigned int)(*mode) >= MPOL_MAX)
95837924
FT
1408 return -EINVAL;
1409 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1410 return -EINVAL;
6d2aec9e
ED
1411 if (*flags & MPOL_F_NUMA_BALANCING) {
1412 if (*mode != MPOL_BIND)
1413 return -EINVAL;
1414 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1415 }
95837924
FT
1416 return 0;
1417}
1418
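/*
 * Illustrative userspace sketch (not part of this file): a mode and its
 * optional mode flags are passed to mbind()/set_mempolicy() packed into the
 * single mode argument that sanitize_mpol_flags() above unpacks.  Assumes
 * <numaif.h> from libnuma; the helper name is made up.
 */
#include <numaif.h>

static long bind_to_node2_static(void *addr, unsigned long len)
{
        unsigned long mask = 1UL << 2;  /* node 2 */

        /* MPOL_F_STATIC_NODES: keep the nodemask fixed across cpuset changes */
        return mbind(addr, len, MPOL_BIND | MPOL_F_STATIC_NODES,
                     &mask, 8 * sizeof(mask), 0);
}
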
e7dc9ad6
DB
1419static long kernel_mbind(unsigned long start, unsigned long len,
1420 unsigned long mode, const unsigned long __user *nmask,
1421 unsigned long maxnode, unsigned int flags)
8bccd85f 1422{
95837924 1423 unsigned short mode_flags;
8bccd85f 1424 nodemask_t nodes;
95837924 1425 int lmode = mode;
8bccd85f
CL
1426 int err;
1427
057d3389 1428 start = untagged_addr(start);
95837924
FT
1429 err = sanitize_mpol_flags(&lmode, &mode_flags);
1430 if (err)
1431 return err;
1432
8bccd85f
CL
1433 err = get_nodes(&nodes, nmask, maxnode);
1434 if (err)
1435 return err;
95837924
FT
1436
1437 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
8bccd85f
CL
1438}
1439
c6018b4b
AK
1440SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1441 unsigned long, home_node, unsigned long, flags)
1442{
1443 struct mm_struct *mm = current->mm;
f4e9e0e6 1444 struct vm_area_struct *vma, *prev;
e976936c 1445 struct mempolicy *new, *old;
c6018b4b
AK
1446 unsigned long end;
1447 int err = -ENOENT;
66850be5 1448 VMA_ITERATOR(vmi, mm, start);
c6018b4b
AK
1449
1450 start = untagged_addr(start);
1451 if (start & ~PAGE_MASK)
1452 return -EINVAL;
1453 /*
1454 * flags is used for future extension if any.
1455 */
1456 if (flags != 0)
1457 return -EINVAL;
1458
1459 /*
1460 * Check home_node is online to avoid accessing uninitialized
1461 * NODE_DATA.
1462 */
1463 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1464 return -EINVAL;
1465
aaa31e05 1466 len = PAGE_ALIGN(len);
c6018b4b
AK
1467 end = start + len;
1468
1469 if (end < start)
1470 return -EINVAL;
1471 if (end == start)
1472 return 0;
1473 mmap_write_lock(mm);
f4e9e0e6 1474 prev = vma_prev(&vmi);
66850be5 1475 for_each_vma_range(vmi, vma, end) {
c6018b4b
AK
1476 /*
1477 * If any vma in the range got policy other than MPOL_BIND
1478 * or MPOL_PREFERRED_MANY we return error. We don't reset
1479 * the home node for vmas we already updated before.
1480 */
e976936c 1481 old = vma_policy(vma);
51f62537
LH
1482 if (!old) {
1483 prev = vma;
e976936c 1484 continue;
51f62537 1485 }
e976936c 1486 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
c6018b4b
AK
1487 err = -EOPNOTSUPP;
1488 break;
1489 }
e976936c
MH
1490 new = mpol_dup(old);
1491 if (IS_ERR(new)) {
1492 err = PTR_ERR(new);
1493 break;
1494 }
c6018b4b 1495
6c21e066 1496 vma_start_write(vma);
c6018b4b 1497 new->home_node = home_node;
f4e9e0e6 1498 err = mbind_range(&vmi, vma, &prev, start, end, new);
c6018b4b
AK
1499 mpol_put(new);
1500 if (err)
1501 break;
1502 }
1503 mmap_write_unlock(mm);
1504 return err;
1505}
1506
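/*
 * Illustrative userspace sketch (not part of this file): setting the home
 * node for an MPOL_BIND or MPOL_PREFERRED_MANY range via the syscall defined
 * above.  There is no libc wrapper, so syscall() is used directly; assumes
 * headers that define __NR_set_mempolicy_home_node.
 */
#include <unistd.h>
#include <sys/syscall.h>

static long set_home_node(void *addr, unsigned long len, int home_node)
{
        return syscall(__NR_set_mempolicy_home_node,
                       (unsigned long)addr, len, (unsigned long)home_node, 0UL);
}
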
e7dc9ad6
DB
1507SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1508 unsigned long, mode, const unsigned long __user *, nmask,
1509 unsigned long, maxnode, unsigned int, flags)
1510{
1511 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1512}
1513
8bccd85f 1514/* Set the process memory policy */
af03c4ac
DB
1515static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1516 unsigned long maxnode)
8bccd85f 1517{
95837924 1518 unsigned short mode_flags;
8bccd85f 1519 nodemask_t nodes;
95837924
FT
1520 int lmode = mode;
1521 int err;
1522
1523 err = sanitize_mpol_flags(&lmode, &mode_flags);
1524 if (err)
1525 return err;
8bccd85f 1526
8bccd85f
CL
1527 err = get_nodes(&nodes, nmask, maxnode);
1528 if (err)
1529 return err;
95837924
FT
1530
1531 return do_set_mempolicy(lmode, mode_flags, &nodes);
8bccd85f
CL
1532}
1533
af03c4ac
DB
1534SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1535 unsigned long, maxnode)
1536{
1537 return kernel_set_mempolicy(mode, nmask, maxnode);
1538}
1539
b6e9b0ba
DB
1540static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1541 const unsigned long __user *old_nodes,
1542 const unsigned long __user *new_nodes)
39743889 1543{
596d7cfa 1544 struct mm_struct *mm = NULL;
39743889 1545 struct task_struct *task;
39743889
CL
1546 nodemask_t task_nodes;
1547 int err;
596d7cfa
KM
1548 nodemask_t *old;
1549 nodemask_t *new;
1550 NODEMASK_SCRATCH(scratch);
1551
1552 if (!scratch)
1553 return -ENOMEM;
39743889 1554
596d7cfa
KM
1555 old = &scratch->mask1;
1556 new = &scratch->mask2;
1557
1558 err = get_nodes(old, old_nodes, maxnode);
39743889 1559 if (err)
596d7cfa 1560 goto out;
39743889 1561
596d7cfa 1562 err = get_nodes(new, new_nodes, maxnode);
39743889 1563 if (err)
596d7cfa 1564 goto out;
39743889
CL
1565
1566 /* Find the mm_struct */
55cfaa3c 1567 rcu_read_lock();
228ebcbe 1568 task = pid ? find_task_by_vpid(pid) : current;
39743889 1569 if (!task) {
55cfaa3c 1570 rcu_read_unlock();
596d7cfa
KM
1571 err = -ESRCH;
1572 goto out;
39743889 1573 }
3268c63e 1574 get_task_struct(task);
39743889 1575
596d7cfa 1576 err = -EINVAL;
39743889
CL
1577
1578 /*
31367466
OE
1579 * Check if this process has the right to modify the specified process.
1580 * Use the regular "ptrace_may_access()" checks.
39743889 1581 */
31367466 1582 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
c69e8d9c 1583 rcu_read_unlock();
39743889 1584 err = -EPERM;
3268c63e 1585 goto out_put;
39743889 1586 }
c69e8d9c 1587 rcu_read_unlock();
39743889
CL
1588
1589 task_nodes = cpuset_mems_allowed(task);
1590 /* Is the user allowed to access the target nodes? */
596d7cfa 1591 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1592 err = -EPERM;
3268c63e 1593 goto out_put;
39743889
CL
1594 }
1595
0486a38b
YX
1596 task_nodes = cpuset_mems_allowed(current);
1597 nodes_and(*new, *new, task_nodes);
1598 if (nodes_empty(*new))
1599 goto out_put;
1600
86c3a764
DQ
1601 err = security_task_movememory(task);
1602 if (err)
3268c63e 1603 goto out_put;
86c3a764 1604
3268c63e
CL
1605 mm = get_task_mm(task);
1606 put_task_struct(task);
f2a9ef88
SL
1607
1608 if (!mm) {
3268c63e 1609 err = -EINVAL;
f2a9ef88
SL
1610 goto out;
1611 }
1612
1613 err = do_migrate_pages(mm, old, new,
1614 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1615
1616 mmput(mm);
1617out:
596d7cfa
KM
1618 NODEMASK_SCRATCH_FREE(scratch);
1619
39743889 1620 return err;
3268c63e
CL
1621
1622out_put:
1623 put_task_struct(task);
1624 goto out;
39743889
CL
1625}
1626
b6e9b0ba
DB
1627SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1628 const unsigned long __user *, old_nodes,
1629 const unsigned long __user *, new_nodes)
1630{
1631 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1632}
1633
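/*
 * Illustrative userspace sketch (not part of this file): asking the kernel to
 * move another process's pages from node 0 to node 1, which is serviced by
 * kernel_migrate_pages()/do_migrate_pages() above.  Assumes <numaif.h> from
 * libnuma; the helper name is made up.
 */
#include <numaif.h>

static long move_task_to_node1(int pid)
{
        unsigned long from = 1UL << 0;  /* pages currently on node 0 */
        unsigned long to   = 1UL << 1;  /* migrate them to node 1 */

        /* Returns pages that could not be moved, or -1 with errno on error. */
        return migrate_pages(pid, 8 * sizeof(from), &from, &to);
}
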
8bccd85f 1634/* Retrieve NUMA policy */
af03c4ac
DB
1635static int kernel_get_mempolicy(int __user *policy,
1636 unsigned long __user *nmask,
1637 unsigned long maxnode,
1638 unsigned long addr,
1639 unsigned long flags)
8bccd85f 1640{
dbcb0f19 1641 int err;
3f649ab7 1642 int pval;
8bccd85f
CL
1643 nodemask_t nodes;
1644
050c17f2 1645 if (nmask != NULL && maxnode < nr_node_ids)
8bccd85f
CL
1646 return -EINVAL;
1647
4605f057
WH
1648 addr = untagged_addr(addr);
1649
8bccd85f
CL
1650 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1651
1652 if (err)
1653 return err;
1654
1655 if (policy && put_user(pval, policy))
1656 return -EFAULT;
1657
1658 if (nmask)
1659 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1660
1661 return err;
1662}
1663
af03c4ac
DB
1664SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1665 unsigned long __user *, nmask, unsigned long, maxnode,
1666 unsigned long, addr, unsigned long, flags)
1667{
1668 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1669}
1670
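/*
 * Illustrative userspace sketch (not part of this file): querying the policy
 * in force at a given address with MPOL_F_ADDR, serviced by
 * kernel_get_mempolicy() above.  Assumes <numaif.h> from libnuma; the helper
 * name is made up.
 */
#include <numaif.h>
#include <stdio.h>

static void show_vma_policy(void *addr)
{
        int mode;
        unsigned long mask[16] = { 0 };         /* room for 1024 node bits */

        if (get_mempolicy(&mode, mask, 8 * sizeof(mask), addr, MPOL_F_ADDR) == 0)
                printf("mode=%d, first nodemask word=%#lx\n", mode, mask[0]);
}
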
20ca87f2
LX
1671bool vma_migratable(struct vm_area_struct *vma)
1672{
1673 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1674 return false;
1675
1676 /*
1677 * DAX device mappings require predictable access latency, so avoid
1678 * incurring periodic faults.
1679 */
1680 if (vma_is_dax(vma))
1681 return false;
1682
1683 if (is_vm_hugetlb_page(vma) &&
1684 !hugepage_migration_supported(hstate_vma(vma)))
1685 return false;
1686
1687 /*
1688 * Migration allocates pages in the highest zone. If we cannot
1689 * do so then migration (at least from node to node) is not
1690 * possible.
1691 */
1692 if (vma->vm_file &&
1693 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1694 < policy_zone)
1695 return false;
1696 return true;
1697}
1698
74d2c3a0 1699struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
ddc1a5cb 1700 unsigned long addr, pgoff_t *ilx)
1da177e4 1701{
ddc1a5cb
HD
1702 *ilx = 0;
1703 return (vma->vm_ops && vma->vm_ops->get_policy) ?
1704 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
74d2c3a0
ON
1705}
1706
1707/*
ddc1a5cb 1708 * get_vma_policy(@vma, @addr, @order, @ilx)
74d2c3a0
ON
1709 * @vma: virtual memory area whose policy is sought
1710 * @addr: address in @vma for shared policy lookup
ddc1a5cb
HD
1711 * @order: 0, or appropriate huge_page_order for interleaving
1712 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
74d2c3a0
ON
1713 *
1714 * Returns effective policy for a VMA at specified address.
dd6eecb9 1715 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1716 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1717 * count--added by the get_policy() vm_op, as appropriate--to protect against
1718 * freeing by another task. It is the caller's responsibility to free the
1719 * extra reference for shared policies.
1720 */
ddc1a5cb
HD
1721struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1722 unsigned long addr, int order, pgoff_t *ilx)
74d2c3a0 1723{
ddc1a5cb 1724 struct mempolicy *pol;
74d2c3a0 1725
ddc1a5cb 1726 pol = __get_vma_policy(vma, addr, ilx);
8d90274b 1727 if (!pol)
dd6eecb9 1728 pol = get_task_policy(current);
ddc1a5cb
HD
1729 if (pol->mode == MPOL_INTERLEAVE) {
1730 *ilx += vma->vm_pgoff >> order;
1731 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1732 }
1da177e4
LT
1733 return pol;
1734}
1735
6b6482bb 1736bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1737{
6b6482bb 1738 struct mempolicy *pol;
fc314724 1739
6b6482bb
ON
1740 if (vma->vm_ops && vma->vm_ops->get_policy) {
1741 bool ret = false;
ddc1a5cb 1742 pgoff_t ilx; /* ignored here */
fc314724 1743
ddc1a5cb 1744 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
6b6482bb
ON
1745 if (pol && (pol->flags & MPOL_F_MOF))
1746 ret = true;
1747 mpol_cond_put(pol);
8d90274b 1748
6b6482bb 1749 return ret;
fc314724
MG
1750 }
1751
6b6482bb 1752 pol = vma->vm_policy;
8d90274b 1753 if (!pol)
6b6482bb 1754 pol = get_task_policy(current);
8d90274b 1755
fc314724
MG
1756 return pol->flags & MPOL_F_MOF;
1757}
1758
d2226ebd 1759bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
d3eb1570
LJ
1760{
1761 enum zone_type dynamic_policy_zone = policy_zone;
1762
1763 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1764
1765 /*
269fbe72 1766 * if policy->nodes has movable memory only,
d3eb1570
LJ
1767 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1768 *
269fbe72 1769 * policy->nodes intersects with node_states[N_MEMORY], so if
f0953a1b 1770 * the following test fails, it implies that
269fbe72 1771 * policy->nodes has movable memory only.
d3eb1570 1772 */
269fbe72 1773 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1774 dynamic_policy_zone = ZONE_MOVABLE;
1775
1776 return zone >= dynamic_policy_zone;
1777}
1778
1da177e4 1779/* Do dynamic interleaving for a process */
c36f6e6d 1780static unsigned int interleave_nodes(struct mempolicy *policy)
1da177e4 1781{
c36f6e6d 1782 unsigned int nid;
1da177e4 1783
c36f6e6d
HD
1784 nid = next_node_in(current->il_prev, policy->nodes);
1785 if (nid < MAX_NUMNODES)
1786 current->il_prev = nid;
1787 return nid;
1da177e4
LT
1788}
1789
dc85da15
CL
1790/*
1791 * Depending on the memory policy provide a node from which to allocate the
1792 * next slab entry.
1793 */
2a389610 1794unsigned int mempolicy_slab_node(void)
dc85da15 1795{
e7b691b0 1796 struct mempolicy *policy;
2a389610 1797 int node = numa_mem_id();
e7b691b0 1798
38b031dd 1799 if (!in_task())
2a389610 1800 return node;
e7b691b0
AK
1801
1802 policy = current->mempolicy;
7858d7bc 1803 if (!policy)
2a389610 1804 return node;
bea904d5
LS
1805
1806 switch (policy->mode) {
1807 case MPOL_PREFERRED:
269fbe72 1808 return first_node(policy->nodes);
765c4507 1809
dc85da15
CL
1810 case MPOL_INTERLEAVE:
1811 return interleave_nodes(policy);
1812
b27abacc
DH
1813 case MPOL_BIND:
1814 case MPOL_PREFERRED_MANY:
1815 {
c33d6c06
MG
1816 struct zoneref *z;
1817
dc85da15
CL
1818 /*
1819 * Follow bind policy behavior and start allocation at the
1820 * first node.
1821 */
19770b32 1822 struct zonelist *zonelist;
19770b32 1823 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1824 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1825 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1826 &policy->nodes);
c1093b74 1827 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1828 }
7858d7bc
FT
1829 case MPOL_LOCAL:
1830 return node;
dc85da15 1831
dc85da15 1832 default:
bea904d5 1833 BUG();
dc85da15
CL
1834 }
1835}
1836
fee83b3a 1837/*
ddc1a5cb
HD
1838 * Do static interleaving for interleave index @ilx. Returns the ilx'th
1839 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
1840 * exceeds the number of present nodes.
fee83b3a 1841 */
ddc1a5cb 1842static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
1da177e4 1843{
276aeee1 1844 nodemask_t nodemask = pol->nodes;
1845 unsigned int target, nnodes;
fee83b3a
AM
1846 int i;
1847 int nid;
276aeee1 1848 /*
1849 * The barrier will stabilize the nodemask in a register or on
1850 * the stack so that it will stop changing under the code.
1851 *
1852 * Between first_node() and next_node(), pol->nodes could be changed
1853 * by other threads. So we copy pol->nodes into a local variable on the stack.
1854 */
1855 barrier();
1da177e4 1856
276aeee1 1857 nnodes = nodes_weight(nodemask);
f5b087b5
DR
1858 if (!nnodes)
1859 return numa_node_id();
ddc1a5cb 1860 target = ilx % nnodes;
276aeee1 1861 nid = first_node(nodemask);
fee83b3a 1862 for (i = 0; i < target; i++)
276aeee1 1863 nid = next_node(nid, nodemask);
1da177e4
LT
1864 return nid;
1865}
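/*
 * Editor's note, worked example of the wrap-around above: with
 * pol->nodes = {0,2,5} and ilx = 7, nnodes = 3 and target = 7 % 3 = 1,
 * so the walk starts at node 0, advances once, and returns node 2.
 * The same arithmetic over a plain array (hypothetical helper, not a
 * kernel API):
 */
static int example_static_interleave(const int *nids, int nnodes,
				     unsigned long ilx)
{
	return nids[ilx % nnodes];	/* ilx'th node, wrapping around */
}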
1866
ddc1a5cb
HD
1867/*
1868 * Return a nodemask representing a mempolicy for filtering nodes for
1869 * page allocation, together with preferred node id (or the input node id).
1870 */
1871static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
1872 pgoff_t ilx, int *nid)
5da7ca86 1873{
ddc1a5cb 1874 nodemask_t *nodemask = NULL;
5da7ca86 1875
ddc1a5cb
HD
1876 switch (pol->mode) {
1877 case MPOL_PREFERRED:
1878 /* Override input node id */
1879 *nid = first_node(pol->nodes);
1880 break;
1881 case MPOL_PREFERRED_MANY:
1882 nodemask = &pol->nodes;
1883 if (pol->home_node != NUMA_NO_NODE)
1884 *nid = pol->home_node;
1885 break;
1886 case MPOL_BIND:
1887 /* Restrict to nodemask (but not on lower zones) */
1888 if (apply_policy_zone(pol, gfp_zone(gfp)) &&
1889 cpuset_nodemask_valid_mems_allowed(&pol->nodes))
1890 nodemask = &pol->nodes;
1891 if (pol->home_node != NUMA_NO_NODE)
1892 *nid = pol->home_node;
3b98b087 1893 /*
ddc1a5cb
HD
1894 * __GFP_THISNODE shouldn't even be used with the bind policy
1895 * because we might easily break the expectation to stay on the
1896 * requested node and not break the policy.
3b98b087 1897 */
ddc1a5cb
HD
1898 WARN_ON_ONCE(gfp & __GFP_THISNODE);
1899 break;
1900 case MPOL_INTERLEAVE:
1901 /* Override input node id */
1902 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
1903 interleave_nodes(pol) : interleave_nid(pol, ilx);
1904 break;
1905 }
1906
1907 return nodemask;
5da7ca86
CL
1908}
1909
00ac59ad 1910#ifdef CONFIG_HUGETLBFS
480eccf9 1911/*
04ec6264 1912 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
1913 * @vma: virtual memory area whose policy is sought
1914 * @addr: address in @vma for shared policy lookup and interleave policy
1915 * @gfp_flags: for requested zone
1916 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 1917 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 1918 *
04ec6264 1919 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 1920 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
1921 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
1922 * to the mempolicy's @nodemask for filtering the zonelist.
480eccf9 1923 */
04ec6264 1924int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
ddc1a5cb 1925 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 1926{
ddc1a5cb 1927 pgoff_t ilx;
04ec6264 1928 int nid;
5da7ca86 1929
ddc1a5cb
HD
1930 nid = numa_node_id();
1931 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
1932 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
04ec6264 1933 return nid;
5da7ca86 1934}
06808b08
LS
1935
1936/*
1937 * init_nodemask_of_mempolicy
1938 *
1939 * If the current task's mempolicy is "default" [NULL], return 'false'
1940 * to indicate default policy. Otherwise, extract the policy nodemask
1941 * for 'bind' or 'interleave' policy into the argument nodemask, or
1942 * initialize the argument nodemask to contain the single node for
1943 * 'preferred' or 'local' policy and return 'true' to indicate presence
1944 * of non-default mempolicy.
1945 *
1946 * We don't bother with reference counting the mempolicy [mpol_get/put]
1947 * because the current task is examining its own mempolicy and a task's
1948 * mempolicy is only ever changed by the task itself.
1949 *
1950 * N.B., it is the caller's responsibility to free a returned nodemask.
1951 */
1952bool init_nodemask_of_mempolicy(nodemask_t *mask)
1953{
1954 struct mempolicy *mempolicy;
06808b08
LS
1955
1956 if (!(mask && current->mempolicy))
1957 return false;
1958
c0ff7453 1959 task_lock(current);
06808b08
LS
1960 mempolicy = current->mempolicy;
1961 switch (mempolicy->mode) {
1962 case MPOL_PREFERRED:
b27abacc 1963 case MPOL_PREFERRED_MANY:
06808b08 1964 case MPOL_BIND:
06808b08 1965 case MPOL_INTERLEAVE:
269fbe72 1966 *mask = mempolicy->nodes;
7858d7bc
FT
1967 break;
1968
1969 case MPOL_LOCAL:
269fbe72 1970 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
1971 break;
1972
1973 default:
1974 BUG();
1975 }
c0ff7453 1976 task_unlock(current);
06808b08
LS
1977
1978 return true;
1979}
00ac59ad 1980#endif
5da7ca86 1981
6f48d0eb 1982/*
b26e517a 1983 * mempolicy_in_oom_domain
6f48d0eb 1984 *
b26e517a
FT
1985 * If tsk's mempolicy is "bind", check for intersection between mask and
1986 * the policy nodemask. Otherwise, return true for all other policies
1987 * including "interleave", as a tsk with "interleave" policy may have
1988 * memory allocated from all nodes in the system.
6f48d0eb
DR
1989 *
1990 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1991 */
b26e517a 1992bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
1993 const nodemask_t *mask)
1994{
1995 struct mempolicy *mempolicy;
1996 bool ret = true;
1997
1998 if (!mask)
1999 return ret;
b26e517a 2000
6f48d0eb
DR
2001 task_lock(tsk);
2002 mempolicy = tsk->mempolicy;
b26e517a 2003 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2004 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2005 task_unlock(tsk);
b26e517a 2006
6f48d0eb
DR
2007 return ret;
2008}
2009
4c54d949 2010static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
ddc1a5cb 2011 int nid, nodemask_t *nodemask)
4c54d949
FT
2012{
2013 struct page *page;
2014 gfp_t preferred_gfp;
2015
2016 /*
2017 * This is a two-pass approach. The first pass will only try the
2018 * preferred nodes, skipping direct reclaim and allowing the
2019 * allocation to fail, while the second pass will try all the
2020 * nodes in the system.
2021 */
2022 preferred_gfp = gfp | __GFP_NOWARN;
2023 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
ddc1a5cb 2024 page = __alloc_pages(preferred_gfp, order, nid, nodemask);
4c54d949 2025 if (!page)
c0455116 2026 page = __alloc_pages(gfp, order, nid, NULL);
4c54d949
FT
2027
2028 return page;
2029}
2030
1da177e4 2031/**
ddc1a5cb 2032 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
eb350739 2033 * @gfp: GFP flags.
ddc1a5cb
HD
2034 * @order: Order of the page allocation.
2035 * @pol: Pointer to the NUMA mempolicy.
2036 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2037 * @nid: Preferred node (usually numa_node_id() but @pol may override it).
1da177e4 2038 *
ddc1a5cb 2039 * Return: The page on success or NULL if allocation fails.
1da177e4 2040 */
ddc1a5cb
HD
2041struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2042 struct mempolicy *pol, pgoff_t ilx, int nid)
1da177e4 2043{
ddc1a5cb
HD
2044 nodemask_t *nodemask;
2045 struct page *page;
adf88aa8 2046
ddc1a5cb 2047 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
4c54d949 2048
ddc1a5cb
HD
2049 if (pol->mode == MPOL_PREFERRED_MANY)
2050 return alloc_pages_preferred_many(gfp, order, nid, nodemask);
19deb769 2051
ddc1a5cb
HD
2052 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2053 /* filter "hugepage" allocation, unless from alloc_pages() */
2054 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
19deb769
DR
2055 /*
2056 * For hugepage allocation and non-interleave policy which
2057 * allows the current node (or other explicitly preferred
2058 * node) we only try to allocate from the current/preferred
2059 * node and don't fall back to other nodes, as the cost of
2060 * remote accesses would likely offset THP benefits.
2061 *
b27abacc 2062 * If the policy is interleave or does not allow the current
19deb769
DR
2063 * node in its nodemask, we allocate the standard way.
2064 */
ddc1a5cb
HD
2065 if (pol->mode != MPOL_INTERLEAVE &&
2066 (!nodemask || node_isset(nid, *nodemask))) {
cc638f32
VB
2067 /*
2068 * First, try to allocate THP only on local node, but
2069 * don't reclaim unnecessarily, just compact.
2070 */
ddc1a5cb
HD
2071 page = __alloc_pages_node(nid,
2072 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2073 if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2074 return page;
76e654cc
DR
2075 /*
2076 * If hugepage allocations are configured to always
2077 * synchronous compact or the vma has been madvised
2078 * to prefer hugepage backing, retry allowing remote
cc638f32 2079 * memory with both reclaim and compact as well.
76e654cc 2080 */
ddc1a5cb
HD
2081 }
2082 }
76e654cc 2083
ddc1a5cb
HD
2084 page = __alloc_pages(gfp, order, nid, nodemask);
2085
2086 if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
2087 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2088 if (static_branch_likely(&vm_numa_stat_key) &&
2089 page_to_nid(page) == nid) {
2090 preempt_disable();
2091 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2092 preempt_enable();
19deb769 2093 }
356ff8a9
DR
2094 }
2095
ddc1a5cb
HD
2096 return page;
2097}
2098
2099/**
2100 * vma_alloc_folio - Allocate a folio for a VMA.
2101 * @gfp: GFP flags.
2102 * @order: Order of the folio.
2103 * @vma: Pointer to VMA.
2104 * @addr: Virtual address of the allocation. Must be inside @vma.
2105 * @hugepage: Unused (was: For hugepages try only preferred node if possible).
2106 *
2107 * Allocate a folio for a specific address in @vma, using the appropriate
2108 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2109 * VMA to prevent it from going away. Should be used for all allocations
2110 * for folios that will be mapped into user space, excepting hugetlbfs, and
2111 * excepting where direct use of alloc_pages_mpol() is more appropriate.
2112 *
2113 * Return: The folio on success or NULL if allocation fails.
2114 */
2115struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2116 unsigned long addr, bool hugepage)
2117{
2118 struct mempolicy *pol;
2119 pgoff_t ilx;
2120 struct page *page;
2121
2122 pol = get_vma_policy(vma, addr, order, &ilx);
2123 page = alloc_pages_mpol(gfp | __GFP_COMP, order,
2124 pol, ilx, numa_node_id());
d51e9894 2125 mpol_cond_put(pol);
ddc1a5cb 2126 return page_rmappable_folio(page);
f584b680 2127}
adf88aa8 2128EXPORT_SYMBOL(vma_alloc_folio);
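/*
 * Editor's illustrative sketch of a typical fault-path caller (simplified
 * and hypothetical, modelled on anonymous-fault code): the VMA's policy
 * picks the node, the caller only supplies GFP flags, the order and the
 * faulting address, with the mmap_lock already held by the fault handler.
 */
static struct folio *example_fault_alloc(struct vm_area_struct *vma,
					 unsigned long addr)
{
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
}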
f584b680 2129
1da177e4 2130/**
6421ec76
MWO
2131 * alloc_pages - Allocate pages.
2132 * @gfp: GFP flags.
2133 * @order: Power of two of number of pages to allocate.
1da177e4 2134 *
6421ec76
MWO
2135 * Allocate 1 << @order contiguous pages. The physical address of the
2136 * first page is naturally aligned (eg an order-3 allocation will be aligned
2137 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2138 * process is honoured when in process context.
1da177e4 2139 *
6421ec76
MWO
2140 * Context: Can be called from any context, providing the appropriate GFP
2141 * flags are used.
2142 * Return: The page on success or NULL if allocation fails.
1da177e4 2143 */
ddc1a5cb 2144struct page *alloc_pages(gfp_t gfp, unsigned int order)
1da177e4 2145{
8d90274b 2146 struct mempolicy *pol = &default_policy;
52cd3b07
LS
2147
2148 /*
2149 * No reference counting needed for current->mempolicy
2150 * nor system default_policy
2151 */
ddc1a5cb
HD
2152 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2153 pol = get_task_policy(current);
cc9a6c87 2154
ddc1a5cb
HD
2155 return alloc_pages_mpol(gfp, order,
2156 pol, NO_INTERLEAVE_INDEX, numa_node_id());
1da177e4 2157}
d7f946d0 2158EXPORT_SYMBOL(alloc_pages);
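/*
 * Editor's illustrative sketch (hypothetical caller, not from this file):
 * allocate four contiguous pages honouring the current task's mempolicy
 * and hand back their kernel virtual address; the matching free is
 * __free_pages(page, 2).
 */
static void *example_alloc_buffer(struct page **pagep)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* 1 << 2 pages */

	*pagep = page;
	return page ? page_address(page) : NULL;
}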
1da177e4 2159
ddc1a5cb 2160struct folio *folio_alloc(gfp_t gfp, unsigned int order)
cc09cb13 2161{
23e48832 2162 return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
cc09cb13
MWO
2163}
2164EXPORT_SYMBOL(folio_alloc);
2165
c00b6b96
CW
2166static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2167 struct mempolicy *pol, unsigned long nr_pages,
2168 struct page **page_array)
2169{
2170 int nodes;
2171 unsigned long nr_pages_per_node;
2172 int delta;
2173 int i;
2174 unsigned long nr_allocated;
2175 unsigned long total_allocated = 0;
2176
2177 nodes = nodes_weight(pol->nodes);
2178 nr_pages_per_node = nr_pages / nodes;
2179 delta = nr_pages - nodes * nr_pages_per_node;
2180
2181 for (i = 0; i < nodes; i++) {
2182 if (delta) {
2183 nr_allocated = __alloc_pages_bulk(gfp,
2184 interleave_nodes(pol), NULL,
2185 nr_pages_per_node + 1, NULL,
2186 page_array);
2187 delta--;
2188 } else {
2189 nr_allocated = __alloc_pages_bulk(gfp,
2190 interleave_nodes(pol), NULL,
2191 nr_pages_per_node, NULL, page_array);
2192 }
2193
2194 page_array += nr_allocated;
2195 total_allocated += nr_allocated;
2196 }
2197
2198 return total_allocated;
2199}
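/*
 * Editor's note, worked example of the split above (illustrative numbers):
 * with nr_pages = 10 and nodes_weight(pol->nodes) = 3, nr_pages_per_node
 * is 10 / 3 = 3 and delta is 10 - 3 * 3 = 1, so the first interleave node
 * receives 4 pages and the remaining two nodes receive 3 pages each.
 */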
2200
2201static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2202 struct mempolicy *pol, unsigned long nr_pages,
2203 struct page **page_array)
2204{
2205 gfp_t preferred_gfp;
2206 unsigned long nr_allocated = 0;
2207
2208 preferred_gfp = gfp | __GFP_NOWARN;
2209 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2210
2211 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2212 nr_pages, NULL, page_array);
2213
2214 if (nr_allocated < nr_pages)
2215 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2216 nr_pages - nr_allocated, NULL,
2217 page_array + nr_allocated);
2218 return nr_allocated;
2219}
2220
2221/* Bulk page allocation and the mempolicy should be considered at the
2222 * same time in some situations, such as vmalloc.
2223 *
2224 * It can accelerate memory allocation, especially for interleaved
2225 * allocations.
2226 */
2227unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2228 unsigned long nr_pages, struct page **page_array)
2229{
2230 struct mempolicy *pol = &default_policy;
ddc1a5cb
HD
2231 nodemask_t *nodemask;
2232 int nid;
c00b6b96
CW
2233
2234 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2235 pol = get_task_policy(current);
2236
2237 if (pol->mode == MPOL_INTERLEAVE)
2238 return alloc_pages_bulk_array_interleave(gfp, pol,
2239 nr_pages, page_array);
2240
2241 if (pol->mode == MPOL_PREFERRED_MANY)
2242 return alloc_pages_bulk_array_preferred_many(gfp,
2243 numa_node_id(), pol, nr_pages, page_array);
2244
ddc1a5cb
HD
2245 nid = numa_node_id();
2246 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2247 return __alloc_pages_bulk(gfp, nid, nodemask,
2248 nr_pages, NULL, page_array);
c00b6b96
CW
2249}
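/*
 * Editor's illustrative sketch of the kind of caller this serves (modelled
 * loosely on vmalloc's area population, simplified and hypothetical):
 * request a whole array of pages in one call, then fall back to single
 * page allocations for whatever the bulk path could not provide.
 */
static unsigned long example_bulk_fill(struct page **pages, unsigned long want)
{
	unsigned long got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL,
							     want, pages);

	while (got < want) {
		pages[got] = alloc_pages(GFP_KERNEL, 0);
		if (!pages[got])
			break;
		got++;
	}
	return got;
}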
2250
ef0855d3
ON
2251int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2252{
c36f6e6d 2253 struct mempolicy *pol = mpol_dup(src->vm_policy);
ef0855d3
ON
2254
2255 if (IS_ERR(pol))
2256 return PTR_ERR(pol);
2257 dst->vm_policy = pol;
2258 return 0;
2259}
2260
4225399a 2261/*
846a16bf 2262 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
2263 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2264 * with the mems_allowed returned by cpuset_mems_allowed(). This
2265 * keeps mempolicies cpuset relative after its cpuset moves. See
2266 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2267 *
2268 * current's mempolicy may be rebound by another task (the task that changes
2269 * the cpuset's mems), so we needn't do the rebind work for the current task.
4225399a 2270 */
4225399a 2271
846a16bf
LS
2272/* Slow path of a mempolicy duplicate */
2273struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2274{
2275 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2276
2277 if (!new)
2278 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2279
2280 /* task's mempolicy is protected by alloc_lock */
2281 if (old == current->mempolicy) {
2282 task_lock(current);
2283 *new = *old;
2284 task_unlock(current);
2285 } else
2286 *new = *old;
2287
4225399a
PJ
2288 if (current_cpuset_is_being_rebound()) {
2289 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2290 mpol_rebind_policy(new, &mems);
4225399a 2291 }
1da177e4 2292 atomic_set(&new->refcnt, 1);
1da177e4
LT
2293 return new;
2294}
2295
2296/* Slow path of a mempolicy comparison */
fcfb4dcc 2297bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2298{
2299 if (!a || !b)
fcfb4dcc 2300 return false;
45c4745a 2301 if (a->mode != b->mode)
fcfb4dcc 2302 return false;
19800502 2303 if (a->flags != b->flags)
fcfb4dcc 2304 return false;
c6018b4b
AK
2305 if (a->home_node != b->home_node)
2306 return false;
19800502
BL
2307 if (mpol_store_user_nodemask(a))
2308 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2309 return false;
19800502 2310
45c4745a 2311 switch (a->mode) {
19770b32 2312 case MPOL_BIND:
1da177e4 2313 case MPOL_INTERLEAVE:
1da177e4 2314 case MPOL_PREFERRED:
b27abacc 2315 case MPOL_PREFERRED_MANY:
269fbe72 2316 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2317 case MPOL_LOCAL:
2318 return true;
1da177e4
LT
2319 default:
2320 BUG();
fcfb4dcc 2321 return false;
1da177e4
LT
2322 }
2323}
2324
1da177e4
LT
2325/*
2326 * Shared memory backing store policy support.
2327 *
2328 * Remember policies even when nobody has shared memory mapped.
2329 * The policies are kept in Red-Black tree linked from the inode.
4a8c7bb5 2330 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2331 * for any accesses to the tree.
2332 */
2333
4a8c7bb5
NZ
2334/*
2335 * lookup first element intersecting start-end. Caller holds sp->lock for
2336 * reading or for writing
2337 */
93397c3b
HD
2338static struct sp_node *sp_lookup(struct shared_policy *sp,
2339 pgoff_t start, pgoff_t end)
1da177e4
LT
2340{
2341 struct rb_node *n = sp->root.rb_node;
2342
2343 while (n) {
2344 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2345
2346 if (start >= p->end)
2347 n = n->rb_right;
2348 else if (end <= p->start)
2349 n = n->rb_left;
2350 else
2351 break;
2352 }
2353 if (!n)
2354 return NULL;
2355 for (;;) {
2356 struct sp_node *w = NULL;
2357 struct rb_node *prev = rb_prev(n);
2358 if (!prev)
2359 break;
2360 w = rb_entry(prev, struct sp_node, nd);
2361 if (w->end <= start)
2362 break;
2363 n = prev;
2364 }
2365 return rb_entry(n, struct sp_node, nd);
2366}
2367
4a8c7bb5
NZ
2368/*
2369 * Insert a new shared policy into the list. Caller holds sp->lock for
2370 * writing.
2371 */
1da177e4
LT
2372static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2373{
2374 struct rb_node **p = &sp->root.rb_node;
2375 struct rb_node *parent = NULL;
2376 struct sp_node *nd;
2377
2378 while (*p) {
2379 parent = *p;
2380 nd = rb_entry(parent, struct sp_node, nd);
2381 if (new->start < nd->start)
2382 p = &(*p)->rb_left;
2383 else if (new->end > nd->end)
2384 p = &(*p)->rb_right;
2385 else
2386 BUG();
2387 }
2388 rb_link_node(&new->nd, parent, p);
2389 rb_insert_color(&new->nd, &sp->root);
1da177e4
LT
2390}
2391
2392/* Find shared policy intersecting idx */
93397c3b
HD
2393struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2394 pgoff_t idx)
1da177e4
LT
2395{
2396 struct mempolicy *pol = NULL;
2397 struct sp_node *sn;
2398
2399 if (!sp->root.rb_node)
2400 return NULL;
4a8c7bb5 2401 read_lock(&sp->lock);
1da177e4
LT
2402 sn = sp_lookup(sp, idx, idx+1);
2403 if (sn) {
2404 mpol_get(sn->policy);
2405 pol = sn->policy;
2406 }
4a8c7bb5 2407 read_unlock(&sp->lock);
1da177e4
LT
2408 return pol;
2409}
2410
63f74ca2
KM
2411static void sp_free(struct sp_node *n)
2412{
2413 mpol_put(n->policy);
2414 kmem_cache_free(sn_cache, n);
2415}
2416
771fb4d8 2417/**
75c70128 2418 * mpol_misplaced - check whether current folio node is valid in policy
771fb4d8 2419 *
75c70128
KW
2420 * @folio: folio to be checked
2421 * @vma: vm area where folio mapped
2422 * @addr: virtual address in @vma for shared policy lookup and interleave policy
771fb4d8 2423 *
75c70128 2424 * Lookup current policy node id for vma,addr and "compare to" folio's
5f076944 2425 * node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2426 * Called from fault path where we know the vma and faulting address.
5f076944 2427 *
062db293 2428 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
75c70128 2429 * policy, or a suitable node ID to allocate a replacement folio from.
771fb4d8 2430 */
75c70128
KW
2431int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
2432 unsigned long addr)
771fb4d8
LS
2433{
2434 struct mempolicy *pol;
ddc1a5cb 2435 pgoff_t ilx;
c33d6c06 2436 struct zoneref *z;
75c70128 2437 int curnid = folio_nid(folio);
90572890
PZ
2438 int thiscpu = raw_smp_processor_id();
2439 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2440 int polnid = NUMA_NO_NODE;
062db293 2441 int ret = NUMA_NO_NODE;
771fb4d8 2442
ddc1a5cb 2443 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
771fb4d8
LS
2444 if (!(pol->flags & MPOL_F_MOF))
2445 goto out;
2446
2447 switch (pol->mode) {
2448 case MPOL_INTERLEAVE:
ddc1a5cb 2449 polnid = interleave_nid(pol, ilx);
771fb4d8
LS
2450 break;
2451
2452 case MPOL_PREFERRED:
b27abacc
DH
2453 if (node_isset(curnid, pol->nodes))
2454 goto out;
269fbe72 2455 polnid = first_node(pol->nodes);
7858d7bc
FT
2456 break;
2457
2458 case MPOL_LOCAL:
2459 polnid = numa_node_id();
771fb4d8
LS
2460 break;
2461
2462 case MPOL_BIND:
bda420b9
YH
2463 /* Optimize placement among multiple nodes via NUMA balancing */
2464 if (pol->flags & MPOL_F_MORON) {
269fbe72 2465 if (node_isset(thisnid, pol->nodes))
bda420b9
YH
2466 break;
2467 goto out;
2468 }
b27abacc 2469 fallthrough;
c33d6c06 2470
b27abacc 2471 case MPOL_PREFERRED_MANY:
771fb4d8 2472 /*
771fb4d8
LS
2473 * use current page if in policy nodemask,
2474 * else select nearest allowed node, if any.
2475 * If no allowed nodes, use current [!misplaced].
2476 */
269fbe72 2477 if (node_isset(curnid, pol->nodes))
771fb4d8 2478 goto out;
c33d6c06 2479 z = first_zones_zonelist(
771fb4d8
LS
2480 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2481 gfp_zone(GFP_HIGHUSER),
269fbe72 2482 &pol->nodes);
c1093b74 2483 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2484 break;
2485
2486 default:
2487 BUG();
2488 }
5606e387 2489
75c70128 2490 /* Migrate the folio towards the node whose CPU is referencing it */
e42c8ff2 2491 if (pol->flags & MPOL_F_MORON) {
90572890 2492 polnid = thisnid;
5606e387 2493
8c9ae56d 2494 if (!should_numa_migrate_memory(current, folio, curnid,
75c70128 2495 thiscpu))
de1c9ce6 2496 goto out;
e42c8ff2
MG
2497 }
2498
771fb4d8
LS
2499 if (curnid != polnid)
2500 ret = polnid;
2501out:
2502 mpol_cond_put(pol);
2503
2504 return ret;
2505}
2506
c11600e4
DR
2507/*
2508 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2509 * dropped after task->mempolicy is set to NULL so that any allocation done as
2510 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2511 * policy.
2512 */
2513void mpol_put_task_policy(struct task_struct *task)
2514{
2515 struct mempolicy *pol;
2516
2517 task_lock(task);
2518 pol = task->mempolicy;
2519 task->mempolicy = NULL;
2520 task_unlock(task);
2521 mpol_put(pol);
2522}
2523
1da177e4
LT
2524static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2525{
1da177e4 2526 rb_erase(&n->nd, &sp->root);
63f74ca2 2527 sp_free(n);
1da177e4
LT
2528}
2529
42288fe3
MG
2530static void sp_node_init(struct sp_node *node, unsigned long start,
2531 unsigned long end, struct mempolicy *pol)
2532{
2533 node->start = start;
2534 node->end = end;
2535 node->policy = pol;
2536}
2537
dbcb0f19
AB
2538static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2539 struct mempolicy *pol)
1da177e4 2540{
869833f2
KM
2541 struct sp_node *n;
2542 struct mempolicy *newpol;
1da177e4 2543
869833f2 2544 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2545 if (!n)
2546 return NULL;
869833f2
KM
2547
2548 newpol = mpol_dup(pol);
2549 if (IS_ERR(newpol)) {
2550 kmem_cache_free(sn_cache, n);
2551 return NULL;
2552 }
2553 newpol->flags |= MPOL_F_SHARED;
42288fe3 2554 sp_node_init(n, start, end, newpol);
869833f2 2555
1da177e4
LT
2556 return n;
2557}
2558
2559/* Replace a policy range. */
93397c3b
HD
2560static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2561 pgoff_t end, struct sp_node *new)
1da177e4 2562{
b22d127a 2563 struct sp_node *n;
42288fe3
MG
2564 struct sp_node *n_new = NULL;
2565 struct mempolicy *mpol_new = NULL;
b22d127a 2566 int ret = 0;
1da177e4 2567
42288fe3 2568restart:
4a8c7bb5 2569 write_lock(&sp->lock);
1da177e4
LT
2570 n = sp_lookup(sp, start, end);
2571 /* Take care of old policies in the same range. */
2572 while (n && n->start < end) {
2573 struct rb_node *next = rb_next(&n->nd);
2574 if (n->start >= start) {
2575 if (n->end <= end)
2576 sp_delete(sp, n);
2577 else
2578 n->start = end;
2579 } else {
2580 /* Old policy spanning whole new range. */
2581 if (n->end > end) {
42288fe3
MG
2582 if (!n_new)
2583 goto alloc_new;
2584
2585 *mpol_new = *n->policy;
2586 atomic_set(&mpol_new->refcnt, 1);
7880639c 2587 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2588 n->end = start;
5ca39575 2589 sp_insert(sp, n_new);
42288fe3
MG
2590 n_new = NULL;
2591 mpol_new = NULL;
1da177e4
LT
2592 break;
2593 } else
2594 n->end = start;
2595 }
2596 if (!next)
2597 break;
2598 n = rb_entry(next, struct sp_node, nd);
2599 }
2600 if (new)
2601 sp_insert(sp, new);
4a8c7bb5 2602 write_unlock(&sp->lock);
42288fe3
MG
2603 ret = 0;
2604
2605err_out:
2606 if (mpol_new)
2607 mpol_put(mpol_new);
2608 if (n_new)
2609 kmem_cache_free(sn_cache, n_new);
2610
b22d127a 2611 return ret;
42288fe3
MG
2612
2613alloc_new:
4a8c7bb5 2614 write_unlock(&sp->lock);
42288fe3
MG
2615 ret = -ENOMEM;
2616 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2617 if (!n_new)
2618 goto err_out;
2619 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2620 if (!mpol_new)
2621 goto err_out;
4ad09955 2622 atomic_set(&mpol_new->refcnt, 1);
42288fe3 2623 goto restart;
1da177e4
LT
2624}
2625
71fe804b
LS
2626/**
2627 * mpol_shared_policy_init - initialize shared policy for inode
2628 * @sp: pointer to inode shared policy
2629 * @mpol: struct mempolicy to install
2630 *
2631 * Install non-NULL @mpol in inode's shared policy rb-tree.
2632 * On entry, the current task has a reference on a non-NULL @mpol.
2633 * This must be released on exit.
4bfc4495 2634 * This is called at get_inode() calls and we can use GFP_KERNEL.
71fe804b
LS
2635 */
2636void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2637{
58568d2a
MX
2638 int ret;
2639
71fe804b 2640 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2641 rwlock_init(&sp->lock);
71fe804b
LS
2642
2643 if (mpol) {
35ec8fa0
HD
2644 struct sp_node *sn;
2645 struct mempolicy *npol;
4bfc4495 2646 NODEMASK_SCRATCH(scratch);
71fe804b 2647
4bfc4495 2648 if (!scratch)
5c0c1654 2649 goto put_mpol;
35ec8fa0
HD
2650
2651 /* contextualize the tmpfs mount point mempolicy to this file */
2652 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2653 if (IS_ERR(npol))
0cae3457 2654 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2655
2656 task_lock(current);
35ec8fa0 2657 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
58568d2a 2658 task_unlock(current);
15d77835 2659 if (ret)
35ec8fa0
HD
2660 goto put_npol;
2661
2662 /* alloc node covering entire file; adds ref to file's npol */
2663 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
2664 if (sn)
2665 sp_insert(sp, sn);
2666put_npol:
2667 mpol_put(npol); /* drop initial ref on file's npol */
0cae3457 2668free_scratch:
4bfc4495 2669 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2670put_mpol:
2671 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2672 }
2673}
2674
c36f6e6d
HD
2675int mpol_set_shared_policy(struct shared_policy *sp,
2676 struct vm_area_struct *vma, struct mempolicy *pol)
1da177e4
LT
2677{
2678 int err;
2679 struct sp_node *new = NULL;
2680 unsigned long sz = vma_pages(vma);
2681
c36f6e6d
HD
2682 if (pol) {
2683 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
1da177e4
LT
2684 if (!new)
2685 return -ENOMEM;
2686 }
c36f6e6d 2687 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
1da177e4 2688 if (err && new)
63f74ca2 2689 sp_free(new);
1da177e4
LT
2690 return err;
2691}
2692
2693/* Free a backing policy store on inode delete. */
c36f6e6d 2694void mpol_free_shared_policy(struct shared_policy *sp)
1da177e4
LT
2695{
2696 struct sp_node *n;
2697 struct rb_node *next;
2698
c36f6e6d 2699 if (!sp->root.rb_node)
1da177e4 2700 return;
c36f6e6d
HD
2701 write_lock(&sp->lock);
2702 next = rb_first(&sp->root);
1da177e4
LT
2703 while (next) {
2704 n = rb_entry(next, struct sp_node, nd);
2705 next = rb_next(&n->nd);
c36f6e6d 2706 sp_delete(sp, n);
1da177e4 2707 }
c36f6e6d 2708 write_unlock(&sp->lock);
1da177e4
LT
2709}
2710
1a687c2e 2711#ifdef CONFIG_NUMA_BALANCING
c297663c 2712static int __initdata numabalancing_override;
1a687c2e
MG
2713
2714static void __init check_numabalancing_enable(void)
2715{
2716 bool numabalancing_default = false;
2717
2718 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2719 numabalancing_default = true;
2720
c297663c
MG
2721 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2722 if (numabalancing_override)
2723 set_numabalancing_state(numabalancing_override == 1);
2724
b0dc2b9b 2725 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2726 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2727 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2728 set_numabalancing_state(numabalancing_default);
2729 }
2730}
2731
2732static int __init setup_numabalancing(char *str)
2733{
2734 int ret = 0;
2735 if (!str)
2736 goto out;
1a687c2e
MG
2737
2738 if (!strcmp(str, "enable")) {
c297663c 2739 numabalancing_override = 1;
1a687c2e
MG
2740 ret = 1;
2741 } else if (!strcmp(str, "disable")) {
c297663c 2742 numabalancing_override = -1;
1a687c2e
MG
2743 ret = 1;
2744 }
2745out:
2746 if (!ret)
4a404bea 2747 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2748
2749 return ret;
2750}
2751__setup("numa_balancing=", setup_numabalancing);
2752#else
2753static inline void __init check_numabalancing_enable(void)
2754{
2755}
2756#endif /* CONFIG_NUMA_BALANCING */
2757
1da177e4
LT
2758void __init numa_policy_init(void)
2759{
b71636e2
PM
2760 nodemask_t interleave_nodes;
2761 unsigned long largest = 0;
2762 int nid, prefer = 0;
2763
1da177e4
LT
2764 policy_cache = kmem_cache_create("numa_policy",
2765 sizeof(struct mempolicy),
20c2df83 2766 0, SLAB_PANIC, NULL);
1da177e4
LT
2767
2768 sn_cache = kmem_cache_create("shared_policy_node",
2769 sizeof(struct sp_node),
20c2df83 2770 0, SLAB_PANIC, NULL);
1da177e4 2771
5606e387
MG
2772 for_each_node(nid) {
2773 preferred_node_policy[nid] = (struct mempolicy) {
2774 .refcnt = ATOMIC_INIT(1),
2775 .mode = MPOL_PREFERRED,
2776 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2777 .nodes = nodemask_of_node(nid),
5606e387
MG
2778 };
2779 }
2780
b71636e2
PM
2781 /*
2782 * Set interleaving policy for system init. Interleaving is only
2783 * enabled across suitably sized nodes (default is >= 16MB), or
2784 * fall back to the largest node if they're all smaller.
2785 */
2786 nodes_clear(interleave_nodes);
01f13bd6 2787 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2788 unsigned long total_pages = node_present_pages(nid);
2789
2790 /* Preserve the largest node */
2791 if (largest < total_pages) {
2792 largest = total_pages;
2793 prefer = nid;
2794 }
2795
2796 /* Interleave this node? */
2797 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2798 node_set(nid, interleave_nodes);
2799 }
2800
2801 /* All too small, use the largest */
2802 if (unlikely(nodes_empty(interleave_nodes)))
2803 node_set(prefer, interleave_nodes);
1da177e4 2804
028fec41 2805 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2806 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2807
2808 check_numabalancing_enable();
1da177e4
LT
2809}
2810
8bccd85f 2811/* Reset policy of current process to default */
1da177e4
LT
2812void numa_default_policy(void)
2813{
028fec41 2814 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2815}
68860ec1 2816
095f1fc4
LS
2817/*
2818 * Parse and format mempolicy from/to strings
2819 */
345ace9c
LS
2820static const char * const policy_modes[] =
2821{
2822 [MPOL_DEFAULT] = "default",
2823 [MPOL_PREFERRED] = "prefer",
2824 [MPOL_BIND] = "bind",
2825 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2826 [MPOL_LOCAL] = "local",
b27abacc 2827 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2828};
1a75a6c8 2829
095f1fc4
LS
2830#ifdef CONFIG_TMPFS
2831/**
f2a07f40 2832 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2833 * @str: string containing mempolicy to parse
71fe804b 2834 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2835 *
2836 * Format of input:
2837 * <mode>[=<flags>][:<nodelist>]
2838 *
dad5b023 2839 * Return: %0 on success, else %1
095f1fc4 2840 */
a7a88b23 2841int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2842{
71fe804b 2843 struct mempolicy *new = NULL;
f2a07f40 2844 unsigned short mode_flags;
71fe804b 2845 nodemask_t nodes;
095f1fc4
LS
2846 char *nodelist = strchr(str, ':');
2847 char *flags = strchr(str, '=');
dedf2c73 2848 int err = 1, mode;
095f1fc4 2849
c7a91bc7
DC
2850 if (flags)
2851 *flags++ = '\0'; /* terminate mode string */
2852
095f1fc4
LS
2853 if (nodelist) {
2854 /* NUL-terminate mode or flags string */
2855 *nodelist++ = '\0';
71fe804b 2856 if (nodelist_parse(nodelist, nodes))
095f1fc4 2857 goto out;
01f13bd6 2858 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 2859 goto out;
71fe804b
LS
2860 } else
2861 nodes_clear(nodes);
2862
dedf2c73 2863 mode = match_string(policy_modes, MPOL_MAX, str);
2864 if (mode < 0)
095f1fc4
LS
2865 goto out;
2866
71fe804b 2867 switch (mode) {
095f1fc4 2868 case MPOL_PREFERRED:
71fe804b 2869 /*
aa9f7d51
RD
2870 * Insist on a nodelist of one node only, although later
2871 * we use first_node(nodes) to grab a single node, so here
2872 * nodelist (or nodes) cannot be empty.
71fe804b 2873 */
095f1fc4
LS
2874 if (nodelist) {
2875 char *rest = nodelist;
2876 while (isdigit(*rest))
2877 rest++;
926f2ae0
KM
2878 if (*rest)
2879 goto out;
aa9f7d51
RD
2880 if (nodes_empty(nodes))
2881 goto out;
095f1fc4
LS
2882 }
2883 break;
095f1fc4
LS
2884 case MPOL_INTERLEAVE:
2885 /*
2886 * Default to online nodes with memory if no nodelist
2887 */
2888 if (!nodelist)
01f13bd6 2889 nodes = node_states[N_MEMORY];
3f226aa1 2890 break;
71fe804b 2891 case MPOL_LOCAL:
3f226aa1 2892 /*
71fe804b 2893 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2894 */
71fe804b 2895 if (nodelist)
3f226aa1 2896 goto out;
3f226aa1 2897 break;
413b43de
RT
2898 case MPOL_DEFAULT:
2899 /*
2900 * Insist on an empty nodelist
2901 */
2902 if (!nodelist)
2903 err = 0;
2904 goto out;
b27abacc 2905 case MPOL_PREFERRED_MANY:
d69b2e63
KM
2906 case MPOL_BIND:
2907 /*
2908 * Insist on a nodelist
2909 */
2910 if (!nodelist)
2911 goto out;
095f1fc4
LS
2912 }
2913
71fe804b 2914 mode_flags = 0;
095f1fc4
LS
2915 if (flags) {
2916 /*
2917 * Currently, we only support two mutually exclusive
2918 * mode flags.
2919 */
2920 if (!strcmp(flags, "static"))
71fe804b 2921 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2922 else if (!strcmp(flags, "relative"))
71fe804b 2923 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2924 else
926f2ae0 2925 goto out;
095f1fc4 2926 }
71fe804b
LS
2927
2928 new = mpol_new(mode, mode_flags, &nodes);
2929 if (IS_ERR(new))
926f2ae0
KM
2930 goto out;
2931
f2a07f40
HD
2932 /*
2933 * Save nodes for mpol_to_str() to show the tmpfs mount options
2934 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2935 */
269fbe72
BW
2936 if (mode != MPOL_PREFERRED) {
2937 new->nodes = nodes;
2938 } else if (nodelist) {
2939 nodes_clear(new->nodes);
2940 node_set(first_node(nodes), new->nodes);
2941 } else {
7858d7bc 2942 new->mode = MPOL_LOCAL;
269fbe72 2943 }
f2a07f40
HD
2944
2945 /*
2946 * Save nodes for contextualization: this will be used to "clone"
2947 * the mempolicy in a specific context [cpuset] at a later time.
2948 */
2949 new->w.user_nodemask = nodes;
2950
926f2ae0 2951 err = 0;
71fe804b 2952
095f1fc4
LS
2953out:
2954 /* Restore string for error message */
2955 if (nodelist)
2956 *--nodelist = ':';
2957 if (flags)
2958 *--flags = '=';
71fe804b
LS
2959 if (!err)
2960 *mpol = new;
095f1fc4
LS
2961 return err;
2962}
2963#endif /* CONFIG_TMPFS */
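/*
 * Editor's illustrative sketch (user space, not part of mempolicy.c):
 * strings in the format parsed above normally arrive via the tmpfs
 * "mpol=" mount option. The mount point, size and node range below are
 * assumptions.
 */
#include <stdio.h>
#include <sys/mount.h>

static int example_mount_interleaved_tmpfs(void)
{
	/* interleave this mount's pages across nodes 0-3 */
	if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0,
		  "size=1G,mpol=interleave:0-3") != 0) {
		perror("mount");
		return -1;
	}
	return 0;
}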
2964
71fe804b
LS
2965/**
2966 * mpol_to_str - format a mempolicy structure for printing
2967 * @buffer: to contain formatted mempolicy string
2968 * @maxlen: length of @buffer
2969 * @pol: pointer to mempolicy to be formatted
71fe804b 2970 *
948927ee
DR
2971 * Convert @pol into a string. If @buffer is too short, truncate the string.
2972 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2973 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 2974 */
948927ee 2975void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
2976{
2977 char *p = buffer;
948927ee
DR
2978 nodemask_t nodes = NODE_MASK_NONE;
2979 unsigned short mode = MPOL_DEFAULT;
2980 unsigned short flags = 0;
2291990a 2981
8790c71a 2982 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 2983 mode = pol->mode;
948927ee
DR
2984 flags = pol->flags;
2985 }
bea904d5 2986
1a75a6c8
CL
2987 switch (mode) {
2988 case MPOL_DEFAULT:
7858d7bc 2989 case MPOL_LOCAL:
1a75a6c8 2990 break;
1a75a6c8 2991 case MPOL_PREFERRED:
b27abacc 2992 case MPOL_PREFERRED_MANY:
1a75a6c8 2993 case MPOL_BIND:
1a75a6c8 2994 case MPOL_INTERLEAVE:
269fbe72 2995 nodes = pol->nodes;
1a75a6c8 2996 break;
1a75a6c8 2997 default:
948927ee
DR
2998 WARN_ON_ONCE(1);
2999 snprintf(p, maxlen, "unknown");
3000 return;
1a75a6c8
CL
3001 }
3002
b7a9f420 3003 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3004
fc36b8d3 3005 if (flags & MPOL_MODE_FLAGS) {
948927ee 3006 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3007
2291990a
LS
3008 /*
3009 * Currently, the only defined flags are mutually exclusive
3010 */
f5b087b5 3011 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3012 p += snprintf(p, buffer + maxlen - p, "static");
3013 else if (flags & MPOL_F_RELATIVE_NODES)
3014 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3015 }
3016
9e763e0f
TH
3017 if (!nodes_empty(nodes))
3018 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3019 nodemask_pr_args(&nodes));
1a75a6c8 3020}
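/*
 * Editor's illustrative sketch of a typical consumer (simplified and
 * hypothetical, in the spirit of shmem's mount-option printing and
 * /proc numa_maps): a 64-byte buffer comfortably exceeds the recommended
 * minimum of 32 bytes noted above.
 */
static void example_show_mpol(struct seq_file *m, struct mempolicy *pol)
{
	char buf[64];

	mpol_to_str(buf, sizeof(buf), pol);
	seq_printf(m, ",mpol=%s", buf);
}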