// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

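/*
 * struct hmm_vma_walk - private data threaded through the page table walk
 * @range:	the hmm_range being snapshotted or faulted
 * @pgmap:	dev_pagemap reference cached across entries during the walk,
 *		released once the walk of a pmd/pud is done
 * @last:	last virtual address handled, used to resume after -EBUSY
 * @flags:	HMM_FAULT_* flags passed to hmm_range_fault()
 */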
struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	unsigned int		flags;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	if (!vma)
		goto err;

	if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
		flags |= FAULT_FLAG_ALLOW_RETRY;
	if (write_fault)
		flags |= FAULT_FLAG_WRITE;

	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY) {
		/* Note, handle_mm_fault did up_read(&mm->mmap_sem) */
		return -EAGAIN;
	}
	if (ret & VM_FAULT_ERROR)
		goto err;

	return -EBUSY;

err:
	*pfn = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not?
 * @write_fault: write fault?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;

	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
		return -EPERM;

	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return;

	/*
	 * We consider not only the individual per-page request but also the
	 * default flags requested for the whole range. The API can be used
	 * two ways: in the first, the HMM user coalesces multiple page
	 * faults into one request and sets flags per pfn for those faults;
	 * in the second, the HMM user wants to pre-fault a range with
	 * specific flags. For the latter it would be a waste to have the
	 * user pre-fill the pfn array with a default flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}
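
/*
 * Illustrative sketch of the pfn_flags_mask/default_flags combination in
 * hmm_pte_need_fault() above. A driver that wants to pre-fault an entire
 * range writable, without pre-filling the pfn array, can use something like:
 *
 *	range->default_flags = range->flags[HMM_PFN_VALID] |
 *			       range->flags[HMM_PFN_WRITE];
 *	range->pfn_flags_mask = 0;
 *
 * while a driver that sets per-page request flags in range->pfns[] would
 * instead let those pass through unchanged:
 *
 *	range->default_flags = 0;
 *	range->pfn_flags_mask = -1ULL;
 */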

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
						hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			cpu_flags = pte_to_hmm_pfn_flags(range, pte);
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: wait on migration entries,
		 * use the device private entry, and report anything else
		 * as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
					swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(walk->mm, pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	} else {
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
	}

	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap))
			return -EBUSY;
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		if (!is_zero_pfn(pte_pfn(pte))) {
			*pfn = range->values[HMM_PFN_SPECIAL];
			return -EFAULT;
		}
		/*
		 * Since each architecture defines a struct page for the zero
		 * page, just fall through and treat it like a normal page.
		 */
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd_lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value, check again that it is a
		 * transparent huge or device mapping one, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap the pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start, next;
	pmd_t *pmdp;
	pud_t pud;
	int ret;

again:
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		return hmm_vma_walk_hole(start, end, walk);

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud))
			return hmm_vma_walk_hole(start, end, walk);

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault)
			return hmm_vma_walk_hole_(addr, end, fault,
						  write_fault, walk);

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
						hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		return 0;
	}

	split_huge_pud(walk->vma, pudp, addr);
	if (pud_none(*pudp))
		goto again;

	pmdp = pmd_offset(pudp, addr);
	do {
		next = pmd_addr_end(addr, end);
		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
		if (ret)
			return ret;
	} while (pmdp++, addr = next, addr != end);

	return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;
	int ret = 0;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		ret = -ENOENT;
		goto unlock;
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;

unlock:
	spin_unlock(ptl);

	if (ret == -ENOENT)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	return ret;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: range being faulted
 * @flags: HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid
 *		vma (e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EAGAIN:	A page fault needs to be retried and mmap_sem was dropped.
 * -EBUSY:	The range has been invalidated and the caller needs to wait
 *		for the invalidation to finish.
 * -EFAULT:	Invalid (i.e., either no valid vma or it is illegal to
 *		access that range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_struct *mm = range->notifier->mm;
	struct vm_area_struct *vma;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;

		vma = find_vma(mm, start);
		if (vma == NULL || (vma->vm_flags & device_vma))
			return -EFAULT;

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If the vma does not allow read access, then assume
			 * that it does not allow write access either. HMM
			 * does not support architectures that allow write
			 * without read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		hmm_vma_walk.pgmap = NULL;
		hmm_vma_walk.last = start;
		hmm_vma_walk.flags = flags;
		hmm_vma_walk.range = range;
		end = min(range->end, vma->vm_end);

		do {
			ret = walk_page_range(vma->vm_mm, start, end,
					      &hmm_walk_ops, &hmm_vma_walk);
			start = hmm_vma_walk.last;

			/* Keep trying while the range is valid. */
		} while (ret == -EBUSY &&
			 !mmu_interval_check_retry(range->notifier,
						   range->notifier_seq));

		if (ret) {
			unsigned long i;

			i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
			hmm_pfns_clear(range, &range->pfns[i],
				       hmm_vma_walk.last, range->end);
			return ret;
		}
		start = end;

	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
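
/*
 * Illustrative caller sketch for hmm_range_fault() (assumptions: a driver-side
 * mmu_interval_notifier "mni", a driver mutex "driver_lock", and a hypothetical
 * helper program_device_page_table() that consumes range->pfns[]; none of
 * these exist in this file):
 *
 *	range.notifier = &mni;
 * again:
 *	range.notifier_seq = mmu_interval_read_begin(&mni);
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_fault(&range, 0);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0) {
 *		if (ret == -EBUSY)
 *			goto again;
 *		return ret;
 *	}
 *
 *	mutex_lock(&driver_lock);
 *	if (mmu_interval_read_retry(&mni, range.notifier_seq)) {
 *		mutex_unlock(&driver_lock);
 *		goto again;
 *	}
 *	program_device_page_table(&range);
 *	mutex_unlock(&driver_lock);
 */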

/**
 * hmm_range_dma_map - hmm_range_fault() and dma map pages all in one.
 * @range: range being faulted
 * @device: device to map page to
 * @daddrs: array of dma addresses for the mapped pages
 * @flags: HMM_FAULT_*
 *
 * Return: the number of pages mapped on success (including zero), or any
 * status return from hmm_range_fault() otherwise.
 */
long hmm_range_dma_map(struct hmm_range *range, struct device *device,
		dma_addr_t *daddrs, unsigned int flags)
{
	unsigned long i, npages, mapped;
	long ret;

	ret = hmm_range_fault(range, flags);
	if (ret <= 0)
		return ret ? ret : -EBUSY;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0, mapped = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		/*
		 * FIXME need to update the DMA API to provide an invalid DMA
		 * address value instead of a function to test the dma address
		 * value. This would remove a lot of dumb code duplicated
		 * across many architectures.
		 *
		 * For now setting it to 0 here is good enough as the pfns[]
		 * value is what is used to check what is valid and what isn't.
		 */
		daddrs[i] = 0;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* Check if range is being invalidated */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq)) {
			ret = -EBUSY;
			goto unmap;
		}

		/* If it is read and write then map bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
		if (dma_mapping_error(device, daddrs[i])) {
			ret = -EFAULT;
			goto unmap;
		}

		mapped++;
	}

	return mapped;

unmap:
	for (npages = i, i = 0; (i < npages) && mapped; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		if (dma_mapping_error(device, daddrs[i]))
			continue;

		/* If it is read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		mapped--;
	}

	return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);

/**
 * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
 * @range: range being unmapped
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: dirty page if it had the write flag set
 * Return: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that the caller MUST abide by the mmu notifier (or use an HMM mirror)
 * and honor the sync_cpu_device_pagetables() callback so that it is safe here
 * to call set_page_dirty(). The caller must also take appropriate locks to
 * prevent a concurrent mmu notifier or sync_cpu_device_pagetables() from
 * making progress.
 */
long hmm_range_dma_unmap(struct hmm_range *range,
			 struct device *device,
			 dma_addr_t *daddrs,
			 bool dirty)
{
	unsigned long i, npages;
	long cpages = 0;

	/* Sanity check. */
	if (range->end <= range->start)
		return -EINVAL;
	if (!daddrs)
		return -EINVAL;
	if (!range->pfns)
		return -EINVAL;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* If it is read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
			dir = DMA_BIDIRECTIONAL;

			/*
			 * See comments in function description on why it is
			 * safe here to call set_page_dirty()
			 */
			if (dirty)
				set_page_dirty(page);
		}

		/* Unmap and clear pfns/dma address */
		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		range->pfns[i] = range->values[HMM_PFN_NONE];
		/* FIXME see comments in hmm_range_dma_map() */
		daddrs[i] = 0;
		cpages++;
	}

	return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
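
/*
 * Illustrative sketch of pairing the two helpers above (assumptions: "dev",
 * "daddrs" and "range" are set up by the caller, with the same mmap_sem and
 * mmu_interval_notifier retry protocol as for hmm_range_fault()):
 *
 *	long mapped = hmm_range_dma_map(&range, dev, daddrs, 0);
 *	if (mapped < 0)
 *		return mapped;
 *
 *	// Program the device using the non-zero entries of daddrs[];
 *	// entries for holes in the range were left as 0.
 *
 *	// Later, once the device is done with the range:
 *	hmm_range_dma_unmap(&range, dev, daddrs,
 *			    true /* device may have written */);
 */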