]>
Commit | Line | Data |
---|---|---|
0e9b5cd6 AG |
1 | /* |
2 | * Linux UFFD-WP support | |
3 | * | |
4 | * Copyright Virtuozzo GmbH, 2020 | |
5 | * | |
6 | * Authors: | |
7 | * Andrey Gruzdev <[email protected]> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or | |
10 | * later. See the COPYING file in the top-level directory. | |
11 | */ | |
12 | ||
13 | #include "qemu/osdep.h" | |
14 | #include "qemu/bitops.h" | |
15 | #include "qemu/error-report.h" | |
16 | #include "qemu/userfaultfd.h" | |
17 | #include "trace.h" | |
18 | #include <poll.h> | |
19 | #include <sys/syscall.h> | |
20 | #include <sys/ioctl.h> | |
21 | ||
22 | /** | |
23 | * uffd_query_features: query UFFD features | |
24 | * | |
25 | * Returns: 0 on success, negative value in case of an error | |
26 | * | |
27 | * @features: parameter to receive 'uffdio_api.features' | |
28 | */ | |
29 | int uffd_query_features(uint64_t *features) | |
30 | { | |
31 | int uffd_fd; | |
32 | struct uffdio_api api_struct = { 0 }; | |
33 | int ret = -1; | |
34 | ||
35 | uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); | |
36 | if (uffd_fd < 0) { | |
37 | trace_uffd_query_features_nosys(errno); | |
38 | return -1; | |
39 | } | |
40 | ||
41 | api_struct.api = UFFD_API; | |
42 | api_struct.features = 0; | |
43 | ||
44 | if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { | |
45 | trace_uffd_query_features_api_failed(errno); | |
46 | goto out; | |
47 | } | |
48 | *features = api_struct.features; | |
49 | ret = 0; | |
50 | ||
51 | out: | |
52 | close(uffd_fd); | |
53 | return ret; | |
54 | } | |
55 | ||
56 | /** | |
57 | * uffd_create_fd: create UFFD file descriptor | |
58 | * | |
59 | * Returns non-negative file descriptor or negative value in case of an error | |
60 | * | |
61 | * @features: UFFD features to request | |
62 | * @non_blocking: create UFFD file descriptor for non-blocking operation | |
63 | */ | |
64 | int uffd_create_fd(uint64_t features, bool non_blocking) | |
65 | { | |
66 | int uffd_fd; | |
67 | int flags; | |
68 | struct uffdio_api api_struct = { 0 }; | |
69 | uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); | |
70 | ||
71 | flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); | |
72 | uffd_fd = syscall(__NR_userfaultfd, flags); | |
73 | if (uffd_fd < 0) { | |
74 | trace_uffd_create_fd_nosys(errno); | |
75 | return -1; | |
76 | } | |
77 | ||
78 | api_struct.api = UFFD_API; | |
79 | api_struct.features = features; | |
80 | if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { | |
81 | trace_uffd_create_fd_api_failed(errno); | |
82 | goto fail; | |
83 | } | |
84 | if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { | |
85 | trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); | |
86 | goto fail; | |
87 | } | |
88 | ||
89 | return uffd_fd; | |
90 | ||
91 | fail: | |
92 | close(uffd_fd); | |
93 | return -1; | |
94 | } | |
95 | ||
/**
 * uffd_close_fd: release a UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative and then closes it.
 * The result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    /* A negative descriptor here indicates a caller bug */
    assert(uffd_fd >= 0);

    (void) close(uffd_fd);
}
106 | ||
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for [addr, addr + length) with the given mode.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = {
            .start = (uintptr_t) addr,
            .len = length,
        },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
137 | ||
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for [addr, addr + length).
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
        trace_uffd_unregister_memory_failed(addr, length, errno);
        return -1;
    }
    return 0;
}
161 | ||
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for [addr, addr + length). When @wp is set
 * the range is write-protected and @dont_wake is ignored; otherwise the
 * protection is released, optionally without waking waiters.
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req;
    uint64_t mode;

    if (wp) {
        mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    } else {
        mode = 0;
    }

    req.range.start = (uintptr_t) addr;
    req.range.len = length;
    req.mode = mode;

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) != 0) {
        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", addr, length,
                     mode, errno);
        return -1;
    }
    return 0;
}
196 | ||
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY to copy @length bytes from @src_addr to @dst_addr,
 * resolving a missing page fault somewhere in the destination range.
 * A failure is reported through error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake)
{
    uint64_t mode = 0;
    struct uffdio_copy req;

    if (dont_wake) {
        mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    req.dst = (uintptr_t) dst_addr;
    req.src = (uintptr_t) src_addr;
    req.len = length;
    req.mode = mode;

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) != 0) {
        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                     length, mode, errno);
        return -1;
    }
    return 0;
}
230 | ||
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE for [addr, addr + length) to resolve a missing
 * page fault within the range. A failure is reported through
 * error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    uint64_t mode = 0;
    struct uffdio_zeropage req;

    if (dont_wake) {
        mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    req.range.start = (uintptr_t) addr;
    req.range.len = length;
    req.mode = mode;

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) != 0) {
        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", addr, length,
                     mode, errno);
        return -1;
    }
    return 0;
}
260 | ||
/**
 * uffd_wakeup: wake up threads waiting for UFFD-managed page fault resolution
 *
 * Issues UFFDIO_WAKE for [addr, addr + length), waking threads waiting
 * on any page from the designated range. The main use case is when,
 * during some period, page faults are resolved via UFFD-IO IOCTLs with
 * the MODE_DONTWAKE flag set, and afterwards all waits for the whole
 * memory range are satisfied in a single call to uffd_wakeup().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                     addr, length, errno);
        return -1;
    }
    return 0;
}
290 | ||
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads up to @count messages from the descriptor into @msgs, retrying
 * the read() when it is interrupted (EINTR).
 *
 * Returns number of fetched messages, 0 if none is available (read()
 * failed with EAGAIN) or negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer; a negative
 *         value is rejected as an error
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t res;

    /*
     * A negative count would be converted to a huge unsigned length by
     * the multiplication below, allowing the read to run past the end
     * of the caller's buffer. Refuse it before touching the descriptor.
     */
    if (count < 0) {
        error_report("uffd_read_events() failed: invalid count=%i", count);
        return -1;
    }

    do {
        res = read(uffd_fd, msgs, (size_t) count * sizeof(struct uffd_msg));
    } while (res < 0 && errno == EINTR);

    if (res < 0) {
        if (errno == EAGAIN) {
            /* Nothing is pending on the descriptor */
            return 0;
        }
        error_report("uffd_read_events() failed: errno=%i", errno);
        return -1;
    }

    return (int) ((size_t) res / sizeof(struct uffd_msg));
}
318 | ||
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Waits with poll() for POLLIN on the descriptor, restarting the call
 * when it is interrupted (EINTR). A poll() failure is reported through
 * error_report() and treated as "no events".
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged to poll()
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    /* nready == 0 means the timeout expired with nothing to read */
    return nready > 0 && (pfd.revents & POLLIN) != 0;
}