// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <[email protected]>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
{
        struct hmm *hmm;

        hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return ERR_PTR(-ENOMEM);

        init_waitqueue_head(&hmm->wq);
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->ranges_lock);
        hmm->notifiers = 0;
        return &hmm->mmu_notifier;
}

static void hmm_free_notifier(struct mmu_notifier *mn)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);

        WARN_ON(!list_empty(&hmm->ranges));
        WARN_ON(!list_empty(&hmm->mirrors));
        kfree(hmm);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
        struct hmm_mirror *mirror;

        /*
         * Since hmm_range_register() holds an mmget() reference, hmm_release()
         * is prevented as long as a range exists.
         */
        WARN_ON(!list_empty_careful(&hmm->ranges));

        down_read(&hmm->mirrors_sem);
        list_for_each_entry(mirror, &hmm->mirrors, list) {
                /*
                 * Note: The driver is not allowed to trigger
                 * hmm_mirror_unregister() from this thread.
                 */
                if (mirror->ops->release)
                        mirror->ops->release(mirror);
        }
        up_read(&hmm->mirrors_sem);
}

static void notifiers_decrement(struct hmm *hmm)
{
        unsigned long flags;

        spin_lock_irqsave(&hmm->ranges_lock, flags);
        hmm->notifiers--;
        if (!hmm->notifiers) {
                struct hmm_range *range;

                list_for_each_entry(range, &hmm->ranges, list) {
                        if (range->valid)
                                continue;
                        range->valid = true;
                }
                wake_up_all(&hmm->wq);
        }
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
                        const struct mmu_notifier_range *nrange)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
        struct hmm_mirror *mirror;
        struct hmm_range *range;
        unsigned long flags;
        int ret = 0;

        spin_lock_irqsave(&hmm->ranges_lock, flags);
        hmm->notifiers++;
        list_for_each_entry(range, &hmm->ranges, list) {
                if (nrange->end < range->start || nrange->start >= range->end)
                        continue;

                range->valid = false;
        }
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);

        if (mmu_notifier_range_blockable(nrange))
                down_read(&hmm->mirrors_sem);
        else if (!down_read_trylock(&hmm->mirrors_sem)) {
                ret = -EAGAIN;
                goto out;
        }

        list_for_each_entry(mirror, &hmm->mirrors, list) {
                int rc;

                rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
                if (rc) {
                        if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
                                    rc != -EAGAIN))
                                continue;
                        ret = -EAGAIN;
                        break;
                }
        }
        up_read(&hmm->mirrors_sem);

out:
        if (ret)
                notifiers_decrement(hmm);
        return ret;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                        const struct mmu_notifier_range *nrange)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);

        notifiers_decrement(hmm);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .release                = hmm_release,
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end   = hmm_invalidate_range_end,
        .alloc_notifier         = hmm_alloc_notifier,
        .free_notifier          = hmm_free_notifier,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * The caller cannot unregister the hmm_mirror while any ranges are
 * registered.
 *
 * Callers using this function must put a call to mmu_notifier_synchronize()
 * in their module exit functions.
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        struct mmu_notifier *mn;

        lockdep_assert_held_write(&mm->mmap_sem);

        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;

        mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
        if (IS_ERR(mn))
                return PTR_ERR(mn);
        mirror->hmm = container_of(mn, struct hmm, mmu_notifier);

        down_write(&mirror->hmm->mirrors_sem);
        list_add(&mirror->list, &mirror->hmm->mirrors);
        up_write(&mirror->hmm->mirrors_sem);

        return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and clean up.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
        struct hmm *hmm = mirror->hmm;

        down_write(&hmm->mirrors_sem);
        list_del(&mirror->list);
        up_write(&hmm->mirrors_sem);
        mmu_notifier_put(&hmm->mmu_notifier);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

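/*
 * Illustrative sketch (not from the kernel tree): a typical mirror user embeds
 * a struct hmm_mirror in its per-process state, provides the two ops used
 * above (sync_cpu_device_pagetables() and release()) and registers the mirror
 * with mmap_sem held for write, as hmm_mirror_register() asserts. All my_*
 * names below are hypothetical.
 *
 *      static const struct hmm_mirror_ops my_mirror_ops = {
 *              .sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *              .release = my_mirror_release,
 *      };
 *
 *      static int my_mirror_mm(struct my_proc *p, struct mm_struct *mm)
 *      {
 *              int ret;
 *
 *              p->mirror.ops = &my_mirror_ops;
 *              down_write(&mm->mmap_sem);
 *              ret = hmm_mirror_register(&p->mirror, mm);
 *              up_write(&mm->mmap_sem);
 *              return ret;
 *      }
 *
 * Tear-down is hmm_mirror_unregister() on the same mirror, plus a call to
 * mmu_notifier_synchronize() in the module exit path, as noted above.
 */
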
struct hmm_vma_walk {
        struct hmm_range        *range;
        struct dev_pagemap      *pgmap;
        unsigned long           last;
        unsigned int            flags;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
                            bool write_fault, uint64_t *pfn)
{
        unsigned int flags = FAULT_FLAG_REMOTE;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        vm_fault_t ret;

        if (!vma)
                goto err;

        if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
                flags |= FAULT_FLAG_ALLOW_RETRY;
        if (write_fault)
                flags |= FAULT_FLAG_WRITE;

        ret = handle_mm_fault(vma, addr, flags);
        if (ret & VM_FAULT_RETRY) {
                /* Note, handle_mm_fault() did up_read(&mm->mmap_sem) */
                return -EAGAIN;
        }
        if (ret & VM_FAULT_ERROR)
                goto err;

        return -EBUSY;

err:
        *pfn = range->values[HMM_PFN_ERROR];
        return -EFAULT;
}

static int hmm_pfns_bad(unsigned long addr,
                        unsigned long end,
                        struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++)
                pfns[i] = range->values[HMM_PFN_ERROR];

        return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not?
 * @write_fault: write fault?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
                              bool fault, bool write_fault,
                              struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;

        if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
                return -EPERM;

        for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = range->values[HMM_PFN_NONE];
                if (fault || write_fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, write_fault,
                                               &pfns[i]);
                        if (ret != -EBUSY)
                                return ret;
                }
        }

        return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                      uint64_t pfns, uint64_t cpu_flags,
                                      bool *fault, bool *write_fault)
{
        struct hmm_range *range = hmm_vma_walk->range;

        if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
                return;

        /*
         * We not only consider the individual per-page request, we also
         * consider the default flags requested for the range. The API can be
         * used two ways: the first where the HMM user coalesces multiple page
         * faults into one request and sets flags per pfn for those faults,
         * and the second where the HMM user wants to pre-fault a range with
         * specific flags. For the latter it is a waste to have the user
         * pre-fill the pfn array with a default flags value.
         */
        pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

        /* We aren't asked to do anything ... */
        if (!(pfns & range->flags[HMM_PFN_VALID]))
                return;
        /* If this is device memory then only fault if explicitly requested */
        if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
                /* Do we fault on device memory? */
                if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
                        *write_fault = pfns & range->flags[HMM_PFN_WRITE];
                        *fault = true;
                }
                return;
        }

        /* If CPU page table is not valid then we need to fault */
        *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
        /* Need to write fault? */
        if ((pfns & range->flags[HMM_PFN_WRITE]) &&
            !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
                *write_fault = true;
                *fault = true;
        }
}

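/*
 * Illustrative sketch (hypothetical caller code, not from the kernel tree):
 * for the second usage described above, i.e. pre-faulting a whole range with
 * one set of flags, a caller can rely entirely on default_flags and neutralize
 * the per-pfn values by clearing pfn_flags_mask:
 *
 *      range->default_flags = range->flags[HMM_PFN_VALID] |
 *                             range->flags[HMM_PFN_WRITE];
 *      range->pfn_flags_mask = 0;
 *      memset(range->pfns, 0, npages * sizeof(*range->pfns));
 *
 * (npages stands for the number of pages covered by the range.) Because the
 * computation above is (pfns & pfn_flags_mask) | default_flags, a zero mask
 * makes every page in the range request a valid, writable mapping regardless
 * of what is in the pfns array.
 */
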
static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                 const uint64_t *pfns, unsigned long npages,
                                 uint64_t cpu_flags, bool *fault,
                                 bool *write_fault)
{
        unsigned long i;

        if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
                *fault = *write_fault = false;
                return;
        }

        *fault = *write_fault = false;
        for (i = 0; i < npages; ++i) {
                hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
                                   fault, write_fault);
                if ((*write_fault))
                        return;
        }
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        bool fault, write_fault;
        unsigned long i, npages;
        uint64_t *pfns;

        i = (addr - range->start) >> PAGE_SHIFT;
        npages = (end - addr) >> PAGE_SHIFT;
        pfns = &range->pfns[i];
        hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                             0, &fault, &write_fault);
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
        if (pmd_protnone(pmd))
                return 0;
        return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
                unsigned long end, uint64_t *pfns, pmd_t pmd)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
        bool fault, write_fault;
        uint64_t cpu_flags;

        npages = (end - addr) >> PAGE_SHIFT;
        cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
        hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
                             &fault, &write_fault);

        if (pmd_protnone(pmd) || fault || write_fault)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

        pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
                if (pmd_devmap(pmd)) {
                        hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
                                              hmm_vma_walk->pgmap);
                        if (unlikely(!hmm_vma_walk->pgmap))
                                return -EBUSY;
                }
                pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
        }
        if (hmm_vma_walk->pgmap) {
                put_dev_pagemap(hmm_vma_walk->pgmap);
                hmm_vma_walk->pgmap = NULL;
        }
        hmm_vma_walk->last = end;
        return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
                unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
        if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
                return 0;
        return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, pmd_t *pmdp, pte_t *ptep,
                              uint64_t *pfn)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        bool fault, write_fault;
        uint64_t cpu_flags;
        pte_t pte = *ptep;
        uint64_t orig_pfn = *pfn;

        *pfn = range->values[HMM_PFN_NONE];
        fault = write_fault = false;

        if (pte_none(pte)) {
                hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
                                   &fault, &write_fault);
                if (fault || write_fault)
                        goto fault;
                return 0;
        }

        if (!pte_present(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                if (!non_swap_entry(entry)) {
                        cpu_flags = pte_to_hmm_pfn_flags(range, pte);
                        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                           &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
                        return 0;
                }

                /*
                 * This is a special swap entry: ignore migration entries,
                 * handle device private memory, and report anything else as
                 * an error.
                 */
                if (is_device_private_entry(entry)) {
                        cpu_flags = range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_DEVICE_PRIVATE];
                        cpu_flags |= is_write_device_private_entry(entry) ?
                                range->flags[HMM_PFN_WRITE] : 0;
                        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                           &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
                        *pfn = hmm_device_entry_from_pfn(range,
                                                         swp_offset(entry));
                        *pfn |= cpu_flags;
                        return 0;
                }

                if (is_migration_entry(entry)) {
                        if (fault || write_fault) {
                                pte_unmap(ptep);
                                hmm_vma_walk->last = addr;
                                migration_entry_wait(walk->mm, pmdp, addr);
                                return -EBUSY;
                        }
                        return 0;
                }

                /* Report error for everything else */
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        } else {
                cpu_flags = pte_to_hmm_pfn_flags(range, pte);
                hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                   &fault, &write_fault);
        }

        if (fault || write_fault)
                goto fault;

        if (pte_devmap(pte)) {
                hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
                                      hmm_vma_walk->pgmap);
                if (unlikely(!hmm_vma_walk->pgmap))
                        return -EBUSY;
        } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
                *pfn = range->values[HMM_PFN_SPECIAL];
                return -EFAULT;
        }

        *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
        return 0;

fault:
        if (hmm_vma_walk->pgmap) {
                put_dev_pagemap(hmm_vma_walk->pgmap);
                hmm_vma_walk->pgmap = NULL;
        }
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault */
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long addr = start, i;
        pte_t *ptep;
        pmd_t pmd;

again:
        pmd = READ_ONCE(*pmdp);
        if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, walk);

        if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
                bool fault, write_fault;
                unsigned long npages;
                uint64_t *pfns;

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                pfns = &range->pfns[i];

                hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                                     0, &fault, &write_fault);
                if (fault || write_fault) {
                        hmm_vma_walk->last = addr;
                        pmd_migration_entry_wait(walk->mm, pmdp);
                        return -EBUSY;
                }
                return 0;
        } else if (!pmd_present(pmd))
                return hmm_pfns_bad(start, end, walk);

        if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
                 * No need to take the pmd lock here: even if some other
                 * thread is splitting the huge pmd we will get that event
                 * through the mmu_notifier callback.
                 *
                 * So just read the pmd value, check again that it is a
                 * transparent huge or device mapping, and compute the
                 * corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;

                i = (addr - range->start) >> PAGE_SHIFT;
                return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
        }

        /*
         * We have handled all the valid cases above, i.e. either none,
         * migration, huge or transparent huge. At this point it is either a
         * valid pmd entry pointing to a pte directory or a bad pmd that will
         * not recover.
         */
        if (pmd_bad(pmd))
                return hmm_pfns_bad(start, end, walk);

        ptep = pte_offset_map(pmdp, addr);
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                int r;

                r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
                if (r) {
                        /* hmm_vma_handle_pte() did unmap the pte directory */
                        hmm_vma_walk->last = addr;
                        return r;
                }
        }
        if (hmm_vma_walk->pgmap) {
                /*
                 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
                 * so that we can leverage the get_dev_pagemap() optimization
                 * which will not re-take a reference on a pgmap if we already
                 * have one.
                 */
                put_dev_pagemap(hmm_vma_walk->pgmap);
                hmm_vma_walk->pgmap = NULL;
        }
        pte_unmap(ptep - 1);

        hmm_vma_walk->last = addr;
        return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
        if (!pud_present(pud))
                return 0;
        return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
                struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long addr = start, next;
        pmd_t *pmdp;
        pud_t pud;
        int ret;

again:
        pud = READ_ONCE(*pudp);
        if (pud_none(pud))
                return hmm_vma_walk_hole(start, end, walk);

        if (pud_huge(pud) && pud_devmap(pud)) {
                unsigned long i, npages, pfn;
                uint64_t *pfns, cpu_flags;
                bool fault, write_fault;

                if (!pud_present(pud))
                        return hmm_vma_walk_hole(start, end, walk);

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                pfns = &range->pfns[i];

                cpu_flags = pud_to_hmm_pfn_flags(range, pud);
                hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                                     cpu_flags, &fault, &write_fault);
                if (fault || write_fault)
                        return hmm_vma_walk_hole_(addr, end, fault,
                                                  write_fault, walk);

                pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
                for (i = 0; i < npages; ++i, ++pfn) {
                        hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
                                              hmm_vma_walk->pgmap);
                        if (unlikely(!hmm_vma_walk->pgmap))
                                return -EBUSY;
                        pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
                                  cpu_flags;
                }
                if (hmm_vma_walk->pgmap) {
                        put_dev_pagemap(hmm_vma_walk->pgmap);
                        hmm_vma_walk->pgmap = NULL;
                }
                hmm_vma_walk->last = end;
                return 0;
        }

        split_huge_pud(walk->vma, pudp, addr);
        if (pud_none(*pudp))
                goto again;

        pmdp = pmd_offset(pudp, addr);
        do {
                next = pmd_addr_end(addr, end);
                ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
                if (ret)
                        return ret;
        } while (pmdp++, addr = next, addr != end);

        return 0;
}
#else
#define hmm_vma_walk_pud        NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                      unsigned long start, unsigned long end,
                                      struct mm_walk *walk)
{
        unsigned long addr = start, i, pfn;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        uint64_t orig_pfn, cpu_flags;
        bool fault, write_fault;
        spinlock_t *ptl;
        pte_t entry;
        int ret = 0;

        ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
        entry = huge_ptep_get(pte);

        i = (start - range->start) >> PAGE_SHIFT;
        orig_pfn = range->pfns[i];
        range->pfns[i] = range->values[HMM_PFN_NONE];
        cpu_flags = pte_to_hmm_pfn_flags(range, entry);
        fault = write_fault = false;
        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                           &fault, &write_fault);
        if (fault || write_fault) {
                ret = -ENOENT;
                goto unlock;
        }

        pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
        for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
                                 cpu_flags;
        hmm_vma_walk->last = end;

unlock:
        spin_unlock(ptl);

        if (ret == -ENOENT)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

        return ret;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static void hmm_pfns_clear(struct hmm_range *range,
                           uint64_t *pfns,
                           unsigned long addr,
                           unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = range->values[HMM_PFN_NONE];
}

/*
 * hmm_range_register() - start tracking change to CPU page table over a range
 * @range: range
 * @mirror: the mirror the range is registered against
 *
 * Return: 0 on success, -EFAULT if the address space is no longer valid
 *
 * Track updates to the CPU page table; see include/linux/hmm.h.
 */
int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
{
        struct hmm *hmm = mirror->hmm;
        unsigned long flags;

        range->valid = false;
        range->hmm = NULL;

        if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
                return -EINVAL;
        if (range->start >= range->end)
                return -EINVAL;

        /* Prevent hmm_release() from running while the range is valid */
        if (!mmget_not_zero(hmm->mmu_notifier.mm))
                return -EFAULT;

        /* Initialize range to track CPU page table updates. */
        spin_lock_irqsave(&hmm->ranges_lock, flags);

        range->hmm = hmm;
        list_add(&range->list, &hmm->ranges);

        /*
         * If there are any concurrent notifiers we have to wait for them for
         * the range to be valid (see hmm_range_wait_until_valid()).
         */
        if (!hmm->notifiers)
                range->valid = true;
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);

        return 0;
}
EXPORT_SYMBOL(hmm_range_register);

/*
 * hmm_range_unregister() - stop tracking change to CPU page table over a range
 * @range: range
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * hmm_range_register(). See include/linux/hmm.h for how to use it.
 */
void hmm_range_unregister(struct hmm_range *range)
{
        struct hmm *hmm = range->hmm;
        unsigned long flags;

        spin_lock_irqsave(&hmm->ranges_lock, flags);
        list_del_init(&range->list);
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);

        /* Drop reference taken by hmm_range_register() */
        mmput(hmm->mmu_notifier.mm);

        /*
         * The range is now invalid and the ref on the hmm is dropped, so
         * poison the pointer. Leave other fields in place, for the caller's
         * use.
         */
        range->valid = false;
        memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
}
EXPORT_SYMBOL(hmm_range_unregister);

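/*
 * Illustrative sketch (hypothetical driver code, not from the kernel tree):
 * the caller fills in a struct hmm_range before hmm_range_register() and
 * tears it down with hmm_range_unregister() once the snapshot has been
 * consumed. Only fields that this file reads (start, end, pfns, flags,
 * values, default_flags, pfn_flags_mask) are shown; all my_* names, and p as
 * the per-process state holding the hmm_mirror, are made up.
 *
 *      struct hmm_range range = {
 *              .start          = ALIGN_DOWN(my_addr, PAGE_SIZE),
 *              .end            = ALIGN(my_addr + my_size, PAGE_SIZE),
 *              .pfns           = my_pfns,
 *              .flags          = my_hmm_pfn_flags,
 *              .values         = my_hmm_pfn_values,
 *              .pfn_flags_mask = ~0ULL,
 *      };
 *      int ret;
 *
 *      ret = hmm_range_register(&range, &p->mirror);
 *      if (ret)
 *              return ret;
 *      ... hmm_range_fault() / hmm_range_dma_map() go here ...
 *      hmm_range_unregister(&range);
 */
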
static const struct mm_walk_ops hmm_walk_ops = {
        .pud_entry      = hmm_vma_walk_pud,
        .pmd_entry      = hmm_vma_walk_pmd,
        .pte_hole       = hmm_vma_walk_hole,
        .hugetlb_entry  = hmm_vma_walk_hugetlb_entry,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: range being faulted
 * @flags: HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
 *          (e.g., device file vma).
 * -ENOMEM: Out of memory.
 * -EPERM:  Invalid permission (e.g., asking for write and range is read
 *          only).
 * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped.
 * -EBUSY:  The range has been invalidated and the caller needs to wait for
 *          the invalidation to finish.
 * -EFAULT: Invalid address (i.e., no vma covers it, or it is illegal to
 *          access that range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
        const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
        unsigned long start = range->start, end;
        struct hmm_vma_walk hmm_vma_walk;
        struct hmm *hmm = range->hmm;
        struct vm_area_struct *vma;
        int ret;

        lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);

        do {
                /* If range is no longer valid force retry. */
                if (!range->valid)
                        return -EBUSY;

                vma = find_vma(hmm->mmu_notifier.mm, start);
                if (vma == NULL || (vma->vm_flags & device_vma))
                        return -EFAULT;

                if (!(vma->vm_flags & VM_READ)) {
                        /*
                         * If the vma does not allow read access, then assume
                         * that it does not allow write access either. HMM
                         * does not support architectures that allow write
                         * without read.
                         */
                        hmm_pfns_clear(range, range->pfns,
                                       range->start, range->end);
                        return -EPERM;
                }

                hmm_vma_walk.pgmap = NULL;
                hmm_vma_walk.last = start;
                hmm_vma_walk.flags = flags;
                hmm_vma_walk.range = range;
                end = min(range->end, vma->vm_end);

                do {
                        ret = walk_page_range(vma->vm_mm, start, end,
                                        &hmm_walk_ops, &hmm_vma_walk);
                        start = hmm_vma_walk.last;

                        /* Keep trying while the range is valid. */
                } while (ret == -EBUSY && range->valid);

                if (ret) {
                        unsigned long i;

                        i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
                        hmm_pfns_clear(range, &range->pfns[i],
                                       hmm_vma_walk.last, range->end);
                        return ret;
                }
                start = end;

        } while (start < range->end);

        return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);

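/*
 * Illustrative sketch (hypothetical driver code, not from the kernel tree) of
 * the retry loop that the return codes above imply: wait for the registered
 * range to become valid, fault it with mmap_sem held for read, and retry from
 * the top whenever -EBUSY reports that a concurrent invalidation won the
 * race. This assumes the hmm_range_wait_until_valid() helper declared in
 * include/linux/hmm.h; my_mm, MY_TIMEOUT_MS, and p (the per-process state
 * holding the hmm_mirror) are made up.
 *
 *      long ret;
 *
 *      ret = hmm_range_register(&range, &p->mirror);
 *      if (ret)
 *              return ret;
 *
 *      do {
 *              hmm_range_wait_until_valid(&range, MY_TIMEOUT_MS);
 *              down_read(&my_mm->mmap_sem);
 *              ret = hmm_range_fault(&range, 0);
 *              up_read(&my_mm->mmap_sem);
 *      } while (ret == -EBUSY);
 *
 *      hmm_range_unregister(&range);
 *      if (ret <= 0)
 *              return ret ? ret : -EBUSY;
 *
 * On success range->pfns[] holds a snapshot of the CPU page table; before
 * programming device page tables from it the driver is expected to take its
 * own lock and recheck range->valid against a concurrent invalidation (see
 * Documentation/vm/hmm.rst).
 */
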
/**
 * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
 * @range: range being faulted
 * @device: device to map page to
 * @daddrs: array of dma addresses for the mapped pages
 * @flags: HMM_FAULT_*
 *
 * Return: the number of pages mapped on success (including zero), or any
 * status return from hmm_range_fault() otherwise.
 */
long hmm_range_dma_map(struct hmm_range *range, struct device *device,
                dma_addr_t *daddrs, unsigned int flags)
{
        unsigned long i, npages, mapped;
        long ret;

        ret = hmm_range_fault(range, flags);
        if (ret <= 0)
                return ret ? ret : -EBUSY;

        npages = (range->end - range->start) >> PAGE_SHIFT;
        for (i = 0, mapped = 0; i < npages; ++i) {
                enum dma_data_direction dir = DMA_TO_DEVICE;
                struct page *page;

                /*
                 * FIXME need to update DMA API to provide invalid DMA address
                 * value instead of a function to test dma address value. This
                 * would remove a lot of dumb code duplicated across many
                 * architectures.
                 *
                 * For now setting it to 0 here is good enough as the pfns[]
                 * value is what is used to check what is valid and what isn't.
                 */
                daddrs[i] = 0;

                page = hmm_device_entry_to_page(range, range->pfns[i]);
                if (page == NULL)
                        continue;

                /* Check if range is being invalidated */
                if (!range->valid) {
                        ret = -EBUSY;
                        goto unmap;
                }

                /* If it is read and write then map bi-directional. */
                if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
                        dir = DMA_BIDIRECTIONAL;

                daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
                if (dma_mapping_error(device, daddrs[i])) {
                        ret = -EFAULT;
                        goto unmap;
                }

                mapped++;
        }

        return mapped;

unmap:
        for (npages = i, i = 0; (i < npages) && mapped; ++i) {
                enum dma_data_direction dir = DMA_TO_DEVICE;
                struct page *page;

                page = hmm_device_entry_to_page(range, range->pfns[i]);
                if (page == NULL)
                        continue;

                if (dma_mapping_error(device, daddrs[i]))
                        continue;

                /* If it is read and write then it was mapped bi-directional. */
                if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
                        dir = DMA_BIDIRECTIONAL;

                dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
                mapped--;
        }

        return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);

/**
 * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
 * @range: range being unmapped
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: dirty page if it had the write flag set
 * Return: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that the caller MUST abide by the mmu notifier or use an HMM mirror
 * and abide by the sync_cpu_device_pagetables() callback so that it is safe
 * here to call set_page_dirty(). The caller must also take appropriate locks
 * to prevent a concurrent mmu notifier or sync_cpu_device_pagetables() from
 * making progress.
 */
long hmm_range_dma_unmap(struct hmm_range *range,
                         struct device *device,
                         dma_addr_t *daddrs,
                         bool dirty)
{
        unsigned long i, npages;
        long cpages = 0;

        /* Sanity check. */
        if (range->end <= range->start)
                return -EINVAL;
        if (!daddrs)
                return -EINVAL;
        if (!range->pfns)
                return -EINVAL;

        npages = (range->end - range->start) >> PAGE_SHIFT;
        for (i = 0; i < npages; ++i) {
                enum dma_data_direction dir = DMA_TO_DEVICE;
                struct page *page;

                page = hmm_device_entry_to_page(range, range->pfns[i]);
                if (page == NULL)
                        continue;

                /* If it is read and write then it was mapped bi-directional. */
                if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
                        dir = DMA_BIDIRECTIONAL;

                        /*
                         * See comments in function description on why it is
                         * safe here to call set_page_dirty()
                         */
                        if (dirty)
                                set_page_dirty(page);
                }

                /* Unmap and clear pfns/dma address */
                dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
                range->pfns[i] = range->values[HMM_PFN_NONE];
                /* FIXME see comments in hmm_range_dma_map() */
                daddrs[i] = 0;
                cpages++;
        }

        return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
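
/*
 * Illustrative sketch (hypothetical driver code, not from the kernel tree):
 * hmm_range_dma_map() and hmm_range_dma_unmap() are intended to be used as a
 * pair around the device's use of the pages, with the same range and the same
 * daddrs array. The locking requirements spelled out above
 * hmm_range_dma_unmap() still apply; my_dev, my_daddrs and
 * my_device_use_pages() are made up.
 *
 *      long mapped;
 *
 *      mapped = hmm_range_dma_map(&range, my_dev, my_daddrs, 0);
 *      if (mapped < 0)
 *              return mapped;
 *
 *      my_device_use_pages(my_daddrs, mapped);
 *
 *      hmm_range_dma_unmap(&range, my_dev, my_daddrs, true);
 */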