/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <[email protected]>
 */
/*
 * Heterogeneous Memory Management (HMM)
 *
 * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and
 * what it is for. Here we focus on the HMM API description, with some
 * explanation of the underlying implementation.
 *
 * Short description: HMM provides a set of helpers to share a virtual address
 * space between CPU and a device, so that the device can access any valid
 * address of the process (while still obeying memory protection). HMM also
 * provides helpers to migrate process memory to device memory, and back. Each
 * set of functionality (address space mirroring, and migration to and from
 * device memory) can be used independently of the other.
 *
 *
 * HMM address space mirroring API:
 *
 * Use HMM address space mirroring if you want to mirror a range of the CPU
 * page table of a process into a device page table. Here, "mirror" means
 * "keep synchronized". Prerequisites: the device must provide the ability to
 * write-protect its page tables (at PAGE_SIZE granularity), and must be able
 * to recover from the resulting potential page faults.
 *
 * HMM guarantees that at any point in time, a given virtual address points to
 * either the same memory in both CPU and device page tables (that is: CPU and
 * device page tables each point to the same pages), or that one page table
 * (CPU or device) points to no entry, while the other still points to the old
 * page for the address. The latter case happens when the CPU page table
 * update happens first, and then the update is mirrored over to the device
 * page table. This does not cause any issue, because the CPU page table
 * cannot start pointing to a new page until the device page table is
 * invalidated.
 *
 * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
 * updates to each device driver that has registered a mirror. It also
 * provides some API calls to help with taking a snapshot of the CPU page
 * table, and to synchronize with any updates that might happen concurrently.
 *
 *
 * HMM migration to and from device memory:
 *
 * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
 * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
 * of the device memory, and allows the device driver to manage its memory
 * using those struct pages. Having struct pages for device memory makes
 * migration easier. Because that memory is not addressable by the CPU it must
 * never be pinned to the device; in other words, any CPU page fault can
 * always cause the device memory to be migrated (copied/moved) back to
 * regular memory.
 *
 * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
 * allows use of a device DMA engine to perform the copy operation between
 * regular system memory and device memory.
 */
#ifndef LINUX_HMM_H
#define LINUX_HMM_H

#include <linux/kconfig.h>

#if IS_ENABLED(CONFIG_HMM)

#include <linux/device.h>
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>

struct hmm;

/*
 * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
 *
 * Flags:
 * HMM_PFN_VALID: pfn is valid
 * HMM_PFN_READ: CPU page table has read permission set
 * HMM_PFN_WRITE: CPU page table has write permission set
 * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
 * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
 * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
 *      result of vm_insert_pfn() or vm_insert_page(). Therefore, it should
 *      not be mirrored by a device, because the entry will never have
 *      HMM_PFN_VALID set and the pfn value is undefined.
 * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
 */
typedef unsigned long hmm_pfn_t;

#define HMM_PFN_VALID (1 << 0)
#define HMM_PFN_READ (1 << 1)
#define HMM_PFN_WRITE (1 << 2)
#define HMM_PFN_ERROR (1 << 3)
#define HMM_PFN_EMPTY (1 << 4)
#define HMM_PFN_SPECIAL (1 << 5)
#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
#define HMM_PFN_SHIFT 7

/*
 * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
 * @pfn: hmm_pfn_t to convert to struct page
 * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise
 *
 * If the hmm_pfn_t is valid (i.e., the valid flag is set) then return the
 * struct page matching the pfn value stored in the hmm_pfn_t. Otherwise
 * return NULL.
 */
static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn)
{
        if (!(pfn & HMM_PFN_VALID))
                return NULL;
        return pfn_to_page(pfn >> HMM_PFN_SHIFT);
}

/*
 * hmm_pfn_t_to_pfn() - return the pfn value stored in an hmm_pfn_t
 * @pfn: hmm_pfn_t to extract pfn from
 * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise
 */
static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn)
{
        if (!(pfn & HMM_PFN_VALID))
                return -1UL;
        return (pfn >> HMM_PFN_SHIFT);
}

/*
 * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page
 * @page: struct page pointer for which to create the hmm_pfn_t
 * Returns: valid hmm_pfn_t for the page
 */
static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page)
{
        return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID;
}

/*
 * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn
 * @pfn: pfn value for which to create the hmm_pfn_t
 * Returns: valid hmm_pfn_t for the pfn
 */
static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn)
{
        return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID;
}
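
/*
 * For example (an illustrative sketch only; device_map_range() and
 * dev_map_page() are hypothetical driver-side helpers, not part of this API),
 * a driver walking an hmm_pfn_t array filled by one of the snapshot helpers
 * below might check the flags and convert entries back to struct pages:
 *
 *      static void device_map_range(hmm_pfn_t *pfns, unsigned long npages)
 *      {
 *          unsigned long i;
 *
 *          for (i = 0; i < npages; i++) {
 *              struct page *page = hmm_pfn_t_to_page(pfns[i]);
 *
 *              if (!page || !(pfns[i] & HMM_PFN_READ))
 *                  continue;
 *              dev_map_page(page, i, !!(pfns[i] & HMM_PFN_WRITE));
 *          }
 *      }
 */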


#if IS_ENABLED(CONFIG_HMM_MIRROR)
/*
 * Mirroring: how to synchronize device page table with CPU page table.
 *
 * A device driver that is participating in HMM mirroring must always
 * synchronize with CPU page table updates. For this, device drivers can
 * either directly use mmu_notifier APIs or they can use the hmm_mirror API.
 * Device drivers can decide to register one mirror per device per process, or
 * just one mirror per process for a group of devices. The pattern is:
 *
 *      int device_bind_address_space(..., struct mm_struct *mm, ...)
 *      {
 *          struct device_address_space *das;
 *          int ret;
 *
 *          // Device driver specific initialization, and allocation of das
 *          // which contains an hmm_mirror struct as one of its fields.
 *          ...
 *
 *          das->mirror.ops = &device_mirror_ops;
 *          ret = hmm_mirror_register(&das->mirror, mm);
 *          if (ret) {
 *              // Cleanup on error
 *              return ret;
 *          }
 *
 *          // Other device driver specific initialization
 *          ...
 *      }
 *
 * Once an hmm_mirror is registered for an address space, the device driver
 * will get callbacks through the sync_cpu_device_pagetables() operation (see
 * the hmm_mirror_ops struct).
 *
 * A device driver must not free the struct containing the hmm_mirror struct
 * before calling hmm_mirror_unregister(). The expected usage is to do that
 * when the device driver is unbinding from an address space.
 *
 *
 *      void device_unbind_address_space(struct device_address_space *das)
 *      {
 *          // Device driver specific cleanup
 *          ...
 *
 *          hmm_mirror_unregister(&das->mirror);
 *
 *          // Other device driver specific cleanup, and now das can be freed
 *          ...
 *      }
 */

struct hmm_mirror;

/*
 * enum hmm_update_type - type of update
 * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
 */
enum hmm_update_type {
        HMM_UPDATE_INVALIDATE,
};

/*
 * struct hmm_mirror_ops - HMM mirror device operations callback
 *
 * @sync_cpu_device_pagetables: callback to update a range on a device
 */
struct hmm_mirror_ops {
        /* sync_cpu_device_pagetables() - synchronize page tables
         *
         * @mirror: pointer to struct hmm_mirror
         * @update_type: type of update that occurred to the CPU page table
         * @start: virtual start address of the range to update
         * @end: virtual end address of the range to update
         *
         * This callback ultimately originates from mmu_notifiers when the CPU
         * page table is updated. The device driver must update its page table
         * in response to this callback. The update_type argument tells what
         * action to perform.
         *
         * The device driver must not return from this callback until the
         * device page tables are completely updated (TLBs flushed, etc); this
         * is a synchronous call.
         */
        void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
                                           enum hmm_update_type update_type,
                                           unsigned long start,
                                           unsigned long end);
};
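
/*
 * A minimal sketch of a driver-side callback (struct device_address_space,
 * device_invalidate_range() and device_mirror_ops are hypothetical driver
 * names, reusing the pattern shown above):
 *
 *      static void device_sync_cpu_pagetables(struct hmm_mirror *mirror,
 *                                             enum hmm_update_type update_type,
 *                                             unsigned long start,
 *                                             unsigned long end)
 *      {
 *          struct device_address_space *das;
 *
 *          das = container_of(mirror, struct device_address_space, mirror);
 *          // Must not return before the device page tables covering
 *          // [start, end) are updated and device TLBs are flushed.
 *          device_invalidate_range(das, start, end);
 *      }
 *
 *      static const struct hmm_mirror_ops device_mirror_ops = {
 *          .sync_cpu_device_pagetables = device_sync_cpu_pagetables,
 *      };
 */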

/*
 * struct hmm_mirror - mirror struct for a device driver
 *
 * @hmm: pointer to struct hmm (which is unique per mm_struct)
 * @ops: device driver callback for HMM mirror operations
 * @list: for list of mirrors of a given mm
 *
 * Each address space (mm_struct) being mirrored by a device must register one
 * instance of an hmm_mirror struct with HMM. HMM will track the list of all
 * mirrors for each mm_struct.
 */
struct hmm_mirror {
        struct hmm *hmm;
        const struct hmm_mirror_ops *ops;
        struct list_head list;
};

int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);


/*
 * struct hmm_range - track invalidation lock on virtual address range
 *
 * @list: all range locks are on a list
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of pfns (big enough for the range)
 * @valid: pfns array did not change since it was filled by an HMM function
 */
struct hmm_range {
        struct list_head list;
        unsigned long start;
        unsigned long end;
        hmm_pfn_t *pfns;
        bool valid;
};

/*
 * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
 * driver lock that serializes device page table updates, then call
 * hmm_vma_range_done() to check if the snapshot is still valid. The same
 * device driver page table update lock must also be used in the
 * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
 * table invalidation serializes on it.
 *
 * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
 * hmm_vma_get_pfns() WITHOUT ERROR!
 *
 * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID!
 */
int hmm_vma_get_pfns(struct vm_area_struct *vma,
                     struct hmm_range *range,
                     unsigned long start,
                     unsigned long end,
                     hmm_pfn_t *pfns);
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
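
/*
 * A usage sketch, assuming a driver-side das->pt_lock that is also taken by
 * the driver's sync_cpu_device_pagetables() callback; the
 * device_update_page_table() helper is hypothetical:
 *
 *      int device_populate_range(struct vm_area_struct *vma,
 *                                struct device_address_space *das,
 *                                struct hmm_range *range,
 *                                unsigned long start, unsigned long end,
 *                                hmm_pfn_t *pfns)
 *      {
 *          int ret;
 *
 *          ret = hmm_vma_get_pfns(vma, range, start, end, pfns);
 *          if (ret)
 *              return ret;
 *
 *          mutex_lock(&das->pt_lock);
 *          if (!hmm_vma_range_done(vma, range)) {
 *              // The snapshot raced with a CPU page table update; retry.
 *              mutex_unlock(&das->pt_lock);
 *              return -EAGAIN;
 *          }
 *          device_update_page_table(das, range, pfns);
 *          mutex_unlock(&das->pt_lock);
 *          return 0;
 *      }
 */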


/*
 * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this
 * will not migrate any device memory back to system memory. The hmm_pfn_t
 * array will be updated with the fault result and current snapshot of the CPU
 * page table for the range.
 *
 * The mmap_sem must be taken in read mode before entering and it might be
 * dropped by the function if the block argument is false. In that case, the
 * function returns -EAGAIN.
 *
 * Return value does not reflect if the fault was successful for every single
 * address or not. Therefore, the caller must inspect the hmm_pfn_t array to
 * determine fault status for each address.
 *
 * Trying to fault inside an invalid vma will result in -EINVAL.
 *
 * See the function description in mm/hmm.c for further documentation.
 */
int hmm_vma_fault(struct vm_area_struct *vma,
                  struct hmm_range *range,
                  unsigned long start,
                  unsigned long end,
                  hmm_pfn_t *pfns,
                  bool write,
                  bool block);
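
/*
 * A short sketch, reusing the hypothetical names from the snapshot example
 * above (mmap_sem held in read mode, block set to true):
 *
 *      down_read(&mm->mmap_sem);
 *      ret = hmm_vma_fault(vma, range, start, end, pfns, write, true);
 *      if (!ret) {
 *          mutex_lock(&das->pt_lock);
 *          if (hmm_vma_range_done(vma, range))
 *              device_update_page_table(das, range, pfns);
 *          mutex_unlock(&das->pt_lock);
 *      }
 *      up_read(&mm->mmap_sem);
 */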
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct hmm_devmem;

struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
                                       unsigned long addr);

/*
 * struct hmm_devmem_ops - callbacks for ZONE_DEVICE memory events
 *
 * @free: called when the refcount on a page reaches 1, and the page is thus
 *      no longer in use
 * @fault: called when there is a page fault to unaddressable memory
 *
 * Both callbacks happen via the page_free() and page_fault() callbacks of
 * struct dev_pagemap, respectively. See include/linux/memremap.h for more
 * details on those.
 *
 * The hmm_devmem_ops callbacks are just here to provide a coherent and
 * unique API to device drivers; a device driver should not register its own
 * page_free() or page_fault() but instead rely on the hmm_devmem_ops
 * callbacks.
 */
struct hmm_devmem_ops {
        /*
         * free() - free a device page
         * @devmem: device memory structure (see struct hmm_devmem)
         * @page: pointer to struct page being freed
         *
         * The callback occurs whenever a device page refcount reaches 1,
         * which means that no one is holding any reference on the page
         * anymore (ZONE_DEVICE pages have an elevated refcount of 1 by
         * default so that they are not released to the general page
         * allocator).
         *
         * Note that the callback has exclusive ownership of the page (as no
         * one is holding any reference).
         */
        void (*free)(struct hmm_devmem *devmem, struct page *page);
        /*
         * fault() - CPU page fault or get user page (GUP)
         * @devmem: device memory structure (see struct hmm_devmem)
         * @vma: virtual memory area containing the virtual address
         * @addr: virtual address that faulted or for which there is a GUP
         * @page: pointer to struct page backing virtual address (unreliable)
         * @flags: FAULT_FLAG_* (see include/linux/mm.h)
         * @pmdp: page middle directory
         * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
         *      on error
         *
         * The callback occurs whenever there is a CPU page fault or GUP on a
         * virtual address. This means that the device driver must migrate
         * the page back to regular memory (CPU accessible).
         *
         * The device driver is free to migrate more than one page from the
         * fault() callback as an optimization. However, if the device
         * decides to migrate more than one page, it must always prioritize
         * the faulting address over the others.
         *
         * The struct page pointer is only given as a hint to allow quick
         * lookup of internal device driver data. A concurrent migration
         * might have already freed that page, and the virtual address might
         * no longer be backed by it, so it should not be modified by the
         * callback.
         *
         * Note that the mmap semaphore is held in read mode at least when
         * this callback occurs, hence the vma is valid upon callback entry.
         */
        int (*fault)(struct hmm_devmem *devmem,
                     struct vm_area_struct *vma,
                     unsigned long addr,
                     const struct page *page,
                     unsigned int flags,
                     pmd_t *pmdp);
};
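
/*
 * A skeleton of how a driver might wire up these callbacks (illustrative
 * only; the devmem_* helpers are hypothetical driver code):
 *
 *      static void driver_devmem_free(struct hmm_devmem *devmem,
 *                                     struct page *page)
 *      {
 *          // Return the device page to the driver's own allocator, using
 *          // the per-page driver data set at allocation time (see the
 *          // drvdata helpers below).
 *          devmem_free_device_page(devmem, hmm_devmem_page_get_drvdata(page));
 *      }
 *
 *      static int driver_devmem_fault(struct hmm_devmem *devmem,
 *                                     struct vm_area_struct *vma,
 *                                     unsigned long addr,
 *                                     const struct page *page,
 *                                     unsigned int flags,
 *                                     pmd_t *pmdp)
 *      {
 *          // Migrate at least the faulting address back to system memory,
 *          // typically with migrate_vma() and driver alloc/copy callbacks.
 *          if (devmem_migrate_back(devmem, vma, addr, pmdp))
 *              return VM_FAULT_SIGBUS;
 *          return VM_FAULT_MAJOR;
 *      }
 *
 *      static const struct hmm_devmem_ops driver_devmem_ops = {
 *          .free  = driver_devmem_free,
 *          .fault = driver_devmem_fault,
 *      };
 */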

/*
 * struct hmm_devmem - track device memory
 *
 * @completion: completion object for device memory
 * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
 * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
 * @resource: IO resource reserved for this chunk of memory
 * @device: device to bind resource to
 * @pagemap: device page map for that chunk
 * @ops: memory operations callback
 * @ref: per CPU refcount
 *
 * This is a helper structure for device drivers that do not wish to implement
 * the gory details related to hotplugging new memory and allocating struct
 * pages.
 *
 * Device drivers can directly use ZONE_DEVICE memory on their own if they
 * wish to do so.
 */
struct hmm_devmem {
        struct completion completion;
        unsigned long pfn_first;
        unsigned long pfn_last;
        struct resource *resource;
        struct device *device;
        struct dev_pagemap pagemap;
        const struct hmm_devmem_ops *ops;
        struct percpu_ref ref;
};

/*
 * To add (hotplug) device memory, HMM assumes that there is no real resource
 * that reserves a range in the physical address space (this is intended to be
 * used by unaddressable device memory). It will reserve a physical range big
 * enough and allocate struct pages for it.
 *
 * The device driver can wrap the hmm_devmem struct inside a private device
 * driver struct. The device driver must call hmm_devmem_remove() before the
 * device goes away and before freeing the hmm_devmem struct memory.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
                                  struct device *device,
                                  unsigned long size);
void hmm_devmem_remove(struct hmm_devmem *devmem);
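
/*
 * A possible call sequence at device probe time (sketch only; error handling
 * abbreviated, driver_devmem_ops as in the skeleton above, and the assumption
 * that hmm_devmem_add() reports failure through an ERR_PTR value):
 *
 *      devmem = hmm_devmem_add(&driver_devmem_ops, &pdev->dev, SZ_1G);
 *      if (IS_ERR(devmem))
 *          return PTR_ERR(devmem);
 *      // Struct pages now exist between devmem->pfn_first and
 *      // devmem->pfn_last and can be handed out by the driver's allocator.
 *
 * And on teardown, before the device goes away:
 *
 *      hmm_devmem_remove(devmem);
 */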

/*
 * hmm_devmem_page_set_drvdata - set per-page driver data field
 *
 * @page: pointer to struct page
 * @data: driver data value to set
 *
 * Because the page can not be on an lru list, we have an unsigned long that
 * the driver can use to store a per-page field. This is just a simple helper
 * to do that.
 */
static inline void hmm_devmem_page_set_drvdata(struct page *page,
                                               unsigned long data)
{
        unsigned long *drvdata = (unsigned long *)&page->pgmap;

        drvdata[1] = data;
}

/*
 * hmm_devmem_page_get_drvdata - get per-page driver data field
 *
 * @page: pointer to struct page
 * Return: driver data value
 */
static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page)
{
        unsigned long *drvdata = (unsigned long *)&page->pgmap;

        return drvdata[1];
}
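
/*
 * For instance, a driver could stash a pointer to its own per-page tracking
 * structure (a hypothetical struct driver_page here) when it hands out a
 * device page, and retrieve it again in its free() callback:
 *
 *      hmm_devmem_page_set_drvdata(page, (unsigned long)dpage);
 *      ...
 *      dpage = (struct driver_page *)hmm_devmem_page_get_drvdata(page);
 */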


/*
 * struct hmm_device - fake device to hang device memory onto
 *
 * @device: device struct
 * @minor: device minor number
 */
struct hmm_device {
        struct device device;
        unsigned int minor;
};

/*
 * A device driver that wants to handle the memory of multiple devices through
 * a single fake device can use hmm_device to do so. This is purely a helper;
 * it is not strictly needed in order to make use of any HMM functionality.
 */
struct hmm_device *hmm_device_new(void *drvdata);
void hmm_device_put(struct hmm_device *hmm_device);
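
/*
 * A sketched usage, creating one hmm_device at driver load time and passing
 * its embedded struct device to hmm_devmem_add() (this assumes
 * hmm_device_new() reports failure through an ERR_PTR value):
 *
 *      hmm_device = hmm_device_new(driver_private_data);
 *      if (IS_ERR(hmm_device))
 *          return PTR_ERR(hmm_device);
 *      devmem = hmm_devmem_add(ops, &hmm_device->device, size);
 *      ...
 *      hmm_device_put(hmm_device);
 */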
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */


/* Below are for HMM internal use only! Not to be used by device driver! */
void hmm_mm_destroy(struct mm_struct *mm);

static inline void hmm_mm_init(struct mm_struct *mm)
{
        mm->hmm = NULL;
}

#else /* IS_ENABLED(CONFIG_HMM) */

/* Below are for HMM internal use only! Not to be used by device driver! */
static inline void hmm_mm_destroy(struct mm_struct *mm) {}
static inline void hmm_mm_init(struct mm_struct *mm) {}

#endif /* IS_ENABLED(CONFIG_HMM) */
#endif /* LINUX_HMM_H */