1 // SPDX-License-Identifier: GPL-2.0-only
3 * COW (Copy On Write) tests.
5 * Copyright 2022, Red Hat, Inc.
18 #include <linux/mman.h>
20 #include <sys/ioctl.h>
22 #include <linux/memfd.h>
24 #include "local_config.h"
25 #ifdef LOCAL_CONFIG_HAVE_LIBURING
27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
29 #include "../../../../mm/gup_test.h"
30 #include "../kselftest.h"
32 #include "thp_settings.h"
34 static size_t pagesize;
35 static int pagemap_fd;
36 static size_t pmdsize;
37 static int nr_thpsizes;
38 static size_t thpsizes[20];
39 static int nr_hugetlbsizes;
40 static size_t hugetlbsizes[10];
42 static bool has_huge_zeropage;
44 static int sz2ord(size_t size)
46 return __builtin_ctzll(size / pagesize);
49 static int detect_thp_sizes(size_t sizes[], int max)
56 /* thp not supported at all. */
60 orders = 1UL << sz2ord(pmdsize);
61 orders |= thp_supported_orders();
63 for (i = 0; orders && count < max; i++) {
64 if (!(orders & (1UL << i)))
66 orders &= ~(1UL << i);
67 kb = (pagesize >> 10) << i;
68 sizes[count++] = kb * 1024;
69 ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
75 static void detect_huge_zeropage(void)
77 int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
86 ret = pread(fd, buf, sizeof(buf), 0);
87 if (ret > 0 && ret < sizeof(buf)) {
90 enabled = strtoul(buf, NULL, 10);
92 has_huge_zeropage = true;
93 ksft_print_msg("[INFO] huge zeropage is enabled\n");
100 static bool range_is_swapped(void *addr, size_t size)
102 for (; size; addr += pagesize, size -= pagesize)
103 if (!pagemap_is_swapped(pagemap_fd, addr))
113 static int setup_comm_pipes(struct comm_pipes *comm_pipes)
115 if (pipe(comm_pipes->child_ready) < 0)
117 if (pipe(comm_pipes->parent_ready) < 0) {
118 close(comm_pipes->child_ready[0]);
119 close(comm_pipes->child_ready[1]);
126 static void close_comm_pipes(struct comm_pipes *comm_pipes)
128 close(comm_pipes->child_ready[0]);
129 close(comm_pipes->child_ready[1]);
130 close(comm_pipes->parent_ready[0]);
131 close(comm_pipes->parent_ready[1]);
134 static int child_memcmp_fn(char *mem, size_t size,
135 struct comm_pipes *comm_pipes)
137 char *old = malloc(size);
140 /* Backup the original content. */
141 memcpy(old, mem, size);
143 /* Wait until the parent modified the page. */
144 write(comm_pipes->child_ready[1], "0", 1);
145 while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
148 /* See if we still read the old values. */
149 return memcmp(old, mem, size);
152 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
153 struct comm_pipes *comm_pipes)
159 ssize_t cur, total, transferred;
167 /* Backup the original content. */
168 memcpy(old, mem, size);
173 /* Trigger a read-only pin. */
174 transferred = vmsplice(fds[1], &iov, 1, 0);
177 if (transferred == 0)
180 /* Unmap it from our page tables. */
181 if (munmap(mem, size) < 0)
184 /* Wait until the parent modified it. */
185 write(comm_pipes->child_ready[1], "0", 1);
186 while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
189 /* See if we still read the old values via the pipe. */
190 for (total = 0; total < transferred; total += cur) {
191 cur = read(fds[0], new + total, transferred - total);
196 return memcmp(old, new, transferred);
199 typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
201 static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
202 child_fn fn, bool xfail)
204 struct comm_pipes comm_pipes;
208 ret = setup_comm_pipes(&comm_pipes);
210 ksft_test_result_fail("pipe() failed\n");
216 ksft_test_result_fail("fork() failed\n");
217 goto close_comm_pipes;
219 exit(fn(mem, size, &comm_pipes));
222 while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
227 * mprotect() optimizations might try avoiding
228 * write-faults by directly mapping pages writable.
230 ret = mprotect(mem, size, PROT_READ);
231 ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
233 ksft_test_result_fail("mprotect() failed\n");
234 write(comm_pipes.parent_ready[1], "0", 1);
236 goto close_comm_pipes;
240 /* Modify the page. */
241 memset(mem, 0xff, size);
242 write(comm_pipes.parent_ready[1], "0", 1);
246 ret = WEXITSTATUS(ret);
251 ksft_test_result_pass("No leak from parent into child\n");
254 * With hugetlb, some vmsplice() tests are currently expected to
255 * fail because (a) harder to fix and (b) nobody really cares.
256 * Flag them as expected failure for now.
258 ksft_test_result_xfail("Leak from parent into child\n");
260 ksft_test_result_fail("Leak from parent into child\n");
263 close_comm_pipes(&comm_pipes);
266 static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
268 do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
271 static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
273 do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
276 static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
278 do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
282 static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
285 do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
289 static void do_test_vmsplice_in_parent(char *mem, size_t size,
290 bool before_fork, bool xfail)
296 ssize_t cur, total, transferred;
297 struct comm_pipes comm_pipes;
305 memcpy(old, mem, size);
307 ret = setup_comm_pipes(&comm_pipes);
309 ksft_test_result_fail("pipe() failed\n");
314 ksft_test_result_fail("pipe() failed\n");
315 goto close_comm_pipes;
319 transferred = vmsplice(fds[1], &iov, 1, 0);
320 if (transferred <= 0) {
321 ksft_test_result_fail("vmsplice() failed\n");
328 ksft_test_result_fail("fork() failed\n");
331 write(comm_pipes.child_ready[1], "0", 1);
332 while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
334 /* Modify page content in the child. */
335 memset(mem, 0xff, size);
340 transferred = vmsplice(fds[1], &iov, 1, 0);
341 if (transferred <= 0) {
342 ksft_test_result_fail("vmsplice() failed\n");
348 while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
350 if (munmap(mem, size) < 0) {
351 ksft_test_result_fail("munmap() failed\n");
354 write(comm_pipes.parent_ready[1], "0", 1);
356 /* Wait until the child is done writing. */
358 if (!WIFEXITED(ret)) {
359 ksft_test_result_fail("wait() failed\n");
363 /* See if we still read the old values. */
364 for (total = 0; total < transferred; total += cur) {
365 cur = read(fds[0], new + total, transferred - total);
367 ksft_test_result_fail("read() failed\n");
372 if (!memcmp(old, new, transferred)) {
373 ksft_test_result_pass("No leak from child into parent\n");
376 * With hugetlb, some vmsplice() tests are currently expected to
377 * fail because (a) harder to fix and (b) nobody really cares.
378 * Flag them as expected failure for now.
380 ksft_test_result_xfail("Leak from child into parent\n");
382 ksft_test_result_fail("Leak from child into parent\n");
388 close_comm_pipes(&comm_pipes);
394 static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
396 do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
399 static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
401 do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
404 #ifdef LOCAL_CONFIG_HAVE_LIBURING
405 static void do_test_iouring(char *mem, size_t size, bool use_fork)
407 struct comm_pipes comm_pipes;
408 struct io_uring_cqe *cqe;
409 struct io_uring_sqe *sqe;
410 struct io_uring ring;
417 ret = setup_comm_pipes(&comm_pipes);
419 ksft_test_result_fail("pipe() failed\n");
425 ksft_test_result_fail("tmpfile() failed\n");
426 goto close_comm_pipes;
433 ksft_test_result_fail("malloc() failed\n");
437 /* Skip on errors, as we might just lack kernel support. */
438 ret = io_uring_queue_init(1, &ring, 0);
440 ksft_test_result_skip("io_uring_queue_init() failed\n");
445 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
446 * | FOLL_LONGTERM the range.
448 * Skip on errors, as we might just lack kernel support or might not
449 * have sufficient MEMLOCK permissions.
453 ret = io_uring_register_buffers(&ring, &iov, 1);
455 ksft_test_result_skip("io_uring_register_buffers() failed\n");
461 * fork() and keep the child alive until we're done. Note that
462 * we expect the pinned page to not get shared with the child.
466 ksft_test_result_fail("fork() failed\n");
467 goto unregister_buffers;
469 write(comm_pipes.child_ready[1], "0", 1);
470 while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
475 while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
479 * Map the page R/O into the page table. Enable softdirty
480 * tracking to stop the page from getting mapped R/W immediately
481 * again by mprotect() optimizations. Note that we don't have an
482 * easy way to test if that worked (the pagemap does not export
483 * if the page is mapped R/O vs. R/W).
485 ret = mprotect(mem, size, PROT_READ);
487 ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
489 ksft_test_result_fail("mprotect() failed\n");
490 goto unregister_buffers;
495 * Modify the page and write page content as observed by the fixed
496 * buffer pin to the file so we can verify it.
498 memset(mem, 0xff, size);
499 sqe = io_uring_get_sqe(&ring);
501 ksft_test_result_fail("io_uring_get_sqe() failed\n");
504 io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
506 ret = io_uring_submit(&ring);
508 ksft_test_result_fail("io_uring_submit() failed\n");
512 ret = io_uring_wait_cqe(&ring, &cqe);
514 ksft_test_result_fail("io_uring_wait_cqe() failed\n");
518 if (cqe->res != size) {
519 ksft_test_result_fail("write_fixed failed\n");
522 io_uring_cqe_seen(&ring, cqe);
524 /* Read back the file content to the temporary buffer. */
526 while (total < size) {
527 cur = pread(fd, tmp + total, size - total, total);
529 ksft_test_result_fail("pread() failed\n");
535 /* Finally, check if we read what we expected. */
536 ksft_test_result(!memcmp(mem, tmp, size),
537 "Longterm R/W pin is reliable\n");
541 write(comm_pipes.parent_ready[1], "0", 1);
545 io_uring_unregister_buffers(&ring);
547 io_uring_queue_exit(&ring);
553 close_comm_pipes(&comm_pipes);
556 static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
558 do_test_iouring(mem, size, false);
561 static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
563 do_test_iouring(mem, size, true);
566 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
571 RO_PIN_TEST_PREVIOUSLY_SHARED,
572 RO_PIN_TEST_RO_EXCLUSIVE,
575 static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
578 struct pin_longterm_test args;
579 struct comm_pipes comm_pipes;
585 ksft_test_result_skip("gup_test not available\n");
591 ksft_test_result_fail("malloc() failed\n");
595 ret = setup_comm_pipes(&comm_pipes);
597 ksft_test_result_fail("pipe() failed\n");
604 case RO_PIN_TEST_SHARED:
605 case RO_PIN_TEST_PREVIOUSLY_SHARED:
607 * Share the pages with our child. As the pages are not pinned,
608 * this should just work.
612 ksft_test_result_fail("fork() failed\n");
613 goto close_comm_pipes;
615 write(comm_pipes.child_ready[1], "0", 1);
616 while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
621 /* Wait until our child is ready. */
622 while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
625 if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
627 * Tell the child to quit now and wait until it quit.
628 * The pages should now be mapped R/O into our page
629 * tables, but they are no longer shared.
631 write(comm_pipes.parent_ready[1], "0", 1);
634 ksft_print_msg("[INFO] wait() failed\n");
637 case RO_PIN_TEST_RO_EXCLUSIVE:
639 * Map the page R/O into the page table. Enable softdirty
640 * tracking to stop the page from getting mapped R/W immediately
641 * again by mprotect() optimizations. Note that we don't have an
642 * easy way to test if that worked (the pagemap does not export
643 * if the page is mapped R/O vs. R/W).
645 ret = mprotect(mem, size, PROT_READ);
647 ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
649 ksft_test_result_fail("mprotect() failed\n");
650 goto close_comm_pipes;
657 /* Take a R/O pin. This should trigger unsharing. */
658 args.addr = (__u64)(uintptr_t)mem;
660 args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
661 ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
664 ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
666 ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
670 /* Modify the page. */
671 memset(mem, 0xff, size);
674 * Read back the content via the pin to the temporary buffer and
675 * test if we observed the modification.
677 tmp_val = (__u64)(uintptr_t)tmp;
678 ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
680 ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
682 ksft_test_result(!memcmp(mem, tmp, size),
683 "Longterm R/O pin is reliable\n");
685 ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
687 ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
690 case RO_PIN_TEST_SHARED:
691 write(comm_pipes.parent_ready[1], "0", 1);
694 ksft_print_msg("[INFO] wait() failed\n");
700 close_comm_pipes(&comm_pipes);
705 static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
707 do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
710 static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
712 do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
715 static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
718 do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
721 static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
724 do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
727 static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
730 do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
733 static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
736 do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
739 typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);
741 static void do_run_with_base_page(test_fn fn, bool swapout)
746 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
747 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
748 if (mem == MAP_FAILED) {
749 ksft_test_result_fail("mmap() failed\n");
753 ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
754 /* Ignore if not around on a kernel. */
755 if (ret && errno != EINVAL) {
756 ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
760 /* Populate a base page. */
761 memset(mem, 0, pagesize);
764 madvise(mem, pagesize, MADV_PAGEOUT);
765 if (!pagemap_is_swapped(pagemap_fd, mem)) {
766 ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
771 fn(mem, pagesize, false);
773 munmap(mem, pagesize);
776 static void run_with_base_page(test_fn fn, const char *desc)
778 ksft_print_msg("[RUN] %s ... with base page\n", desc);
779 do_run_with_base_page(fn, false);
782 static void run_with_base_page_swap(test_fn fn, const char *desc)
784 ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
785 do_run_with_base_page(fn, true);
794 THP_RUN_SINGLE_PTE_SWAPOUT,
795 THP_RUN_PARTIAL_MREMAP,
796 THP_RUN_PARTIAL_SHARED,
799 static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
801 char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
802 size_t size, mmap_size, mremap_size;
805 /* For alignment purposes, we need twice the thp size. */
806 mmap_size = 2 * thpsize;
807 mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
808 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
809 if (mmap_mem == MAP_FAILED) {
810 ksft_test_result_fail("mmap() failed\n");
814 /* We need a THP-aligned memory area. */
815 mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
817 ret = madvise(mem, thpsize, MADV_HUGEPAGE);
819 ksft_test_result_fail("MADV_HUGEPAGE failed\n");
824 * Try to populate a THP. Touch the first sub-page and test if
825 * we get the last sub-page populated automatically.
828 if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
829 ksft_test_result_skip("Did not get a THP populated\n");
832 memset(mem, 0, thpsize);
837 case THP_RUN_PMD_SWAPOUT:
838 assert(thpsize == pmdsize);
841 case THP_RUN_PTE_SWAPOUT:
843 * Trigger PTE-mapping the THP by temporarily mapping a single
844 * subpage R/O. This is a noop if the THP is not pmdsize (and
845 * therefore already PTE-mapped).
847 ret = mprotect(mem + pagesize, pagesize, PROT_READ);
849 ksft_test_result_fail("mprotect() failed\n");
852 ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
854 ksft_test_result_fail("mprotect() failed\n");
858 case THP_RUN_SINGLE_PTE:
859 case THP_RUN_SINGLE_PTE_SWAPOUT:
861 * Discard all but a single subpage of that PTE-mapped THP. What
862 * remains is a single PTE mapping a single subpage.
864 ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
866 ksft_test_result_fail("MADV_DONTNEED failed\n");
871 case THP_RUN_PARTIAL_MREMAP:
873 * Remap half of the THP. We need some new memory location
876 mremap_size = thpsize / 2;
877 mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
878 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
879 if (mem == MAP_FAILED) {
880 ksft_test_result_fail("mmap() failed\n");
883 tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
884 MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
885 if (tmp != mremap_mem) {
886 ksft_test_result_fail("mremap() failed\n");
891 case THP_RUN_PARTIAL_SHARED:
893 * Share the first page of the THP with a child and quit the
894 * child. This will result in some parts of the THP never
897 ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
899 ksft_test_result_fail("MADV_DONTFORK failed\n");
904 ksft_test_result_fail("fork() failed\n");
910 /* Allow for sharing all pages again. */
911 ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
913 ksft_test_result_fail("MADV_DOFORK failed\n");
922 case THP_RUN_PMD_SWAPOUT:
923 case THP_RUN_PTE_SWAPOUT:
924 case THP_RUN_SINGLE_PTE_SWAPOUT:
925 madvise(mem, size, MADV_PAGEOUT);
926 if (!range_is_swapped(mem, size)) {
927 ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
935 fn(mem, size, false);
937 munmap(mmap_mem, mmap_size);
938 if (mremap_mem != MAP_FAILED)
939 munmap(mremap_mem, mremap_size);
942 static void run_with_thp(test_fn fn, const char *desc, size_t size)
944 ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
946 do_run_with_thp(fn, THP_RUN_PMD, size);
949 static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
951 ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
953 do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
956 static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
958 ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
960 do_run_with_thp(fn, THP_RUN_PTE, size);
963 static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
965 ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
967 do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
970 static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
972 ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
974 do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
977 static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
979 ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
981 do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
984 static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
986 ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
988 do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
991 static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
993 ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
995 do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
998 static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
1000 int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
1003 ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
1004 hugetlbsize / 1024);
1006 flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
1008 mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
1009 if (mem == MAP_FAILED) {
1010 ksft_test_result_skip("need more free huge pages\n");
1014 /* Populate a huge page. */
1015 memset(mem, 0, hugetlbsize);
1018 * We need a total of two hugetlb pages to handle COW/unsharing
1019 * properly, otherwise we might get zapped by a SIGBUS.
1021 dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
1022 if (dummy == MAP_FAILED) {
1023 ksft_test_result_skip("need more free huge pages\n");
1026 munmap(dummy, hugetlbsize);
1028 fn(mem, hugetlbsize, true);
1030 munmap(mem, hugetlbsize);
1039 * Test cases that are specific to anonymous pages: pages in private mappings
1040 * that may get shared via COW during fork().
1042 static const struct test_case anon_test_cases[] = {
1044 * Basic COW tests for fork() without any GUP. If we miss to break COW,
1045 * either the child can observe modifications by the parent or the
1049 "Basic COW after fork()",
1053 * Basic test, but do an additional mprotect(PROT_READ)+
1054 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1057 "Basic COW after fork() with mprotect() optimization",
1058 test_cow_in_parent_mprotect,
1061 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
1062 * we miss to break COW, the child observes modifications by the parent.
1063 * This is CVE-2020-29374 reported by Jann Horn.
1066 "vmsplice() + unmap in child",
1067 test_vmsplice_in_child,
1070 * vmsplice() test, but do an additional mprotect(PROT_READ)+
1071 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1074 "vmsplice() + unmap in child with mprotect() optimization",
1075 test_vmsplice_in_child_mprotect,
1078 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
1079 * fork(); modify in the child. If we miss to break COW, the parent
1080 * observes modifications by the child.
1083 "vmsplice() before fork(), unmap in parent after fork()",
1084 test_vmsplice_before_fork,
1087 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
1088 * child. If we miss to break COW, the parent observes modifications by
1092 "vmsplice() + unmap in parent after fork()",
1093 test_vmsplice_after_fork,
1095 #ifdef LOCAL_CONFIG_HAVE_LIBURING
1097 * Take a R/W longterm pin and then map the page R/O into the page
1098 * table to trigger a write fault on next access. When modifying the
1099 * page, the page content must be visible via the pin.
1102 "R/O-mapping a page registered as iouring fixed buffer",
1106 * Take a R/W longterm pin and then fork() a child. When modifying the
1107 * page, the page content must be visible via the pin. We expect the
1108 * pinned page to not get shared with the child.
1111 "fork() with an iouring fixed buffer",
1115 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
1117 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
1118 * When modifying the page via the page table, the page content change
1119 * must be visible via the pin.
1122 "R/O GUP pin on R/O-mapped shared page",
1123 test_ro_pin_on_shared,
1125 /* Same as above, but using GUP-fast. */
1127 "R/O GUP-fast pin on R/O-mapped shared page",
1128 test_ro_fast_pin_on_shared,
1131 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
1132 * was previously shared. When modifying the page via the page table,
1133 * the page content change must be visible via the pin.
1136 "R/O GUP pin on R/O-mapped previously-shared page",
1137 test_ro_pin_on_ro_previously_shared,
1139 /* Same as above, but using GUP-fast. */
1141 "R/O GUP-fast pin on R/O-mapped previously-shared page",
1142 test_ro_fast_pin_on_ro_previously_shared,
1145 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
1146 * When modifying the page via the page table, the page content change
1147 * must be visible via the pin.
1150 "R/O GUP pin on R/O-mapped exclusive page",
1151 test_ro_pin_on_ro_exclusive,
1153 /* Same as above, but using GUP-fast. */
1155 "R/O GUP-fast pin on R/O-mapped exclusive page",
1156 test_ro_fast_pin_on_ro_exclusive,
1160 static void run_anon_test_case(struct test_case const *test_case)
1164 run_with_base_page(test_case->fn, test_case->desc);
1165 run_with_base_page_swap(test_case->fn, test_case->desc);
1166 for (i = 0; i < nr_thpsizes; i++) {
1167 size_t size = thpsizes[i];
1168 struct thp_settings settings = *thp_current_settings();
1170 settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
1171 settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
1172 thp_push_settings(&settings);
1174 if (size == pmdsize) {
1175 run_with_thp(test_case->fn, test_case->desc, size);
1176 run_with_thp_swap(test_case->fn, test_case->desc, size);
1179 run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
1180 run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
1181 run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
1182 run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
1183 run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
1184 run_with_partial_shared_thp(test_case->fn, test_case->desc, size);
1188 for (i = 0; i < nr_hugetlbsizes; i++)
1189 run_with_hugetlb(test_case->fn, test_case->desc,
1193 static void run_anon_test_cases(void)
1197 ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
1199 for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
1200 run_anon_test_case(&anon_test_cases[i]);
1203 static int tests_per_anon_test_case(void)
1205 int tests = 2 + nr_hugetlbsizes;
1207 tests += 6 * nr_thpsizes;
1213 enum anon_thp_collapse_test {
1214 ANON_THP_COLLAPSE_UNSHARED,
1215 ANON_THP_COLLAPSE_FULLY_SHARED,
1216 ANON_THP_COLLAPSE_LOWER_SHARED,
1217 ANON_THP_COLLAPSE_UPPER_SHARED,
1220 static void do_test_anon_thp_collapse(char *mem, size_t size,
1221 enum anon_thp_collapse_test test)
1223 struct comm_pipes comm_pipes;
1227 ret = setup_comm_pipes(&comm_pipes);
1229 ksft_test_result_fail("pipe() failed\n");
1234 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
1235 * R/O, such that we can try collapsing it later.
1237 ret = mprotect(mem + pagesize, pagesize, PROT_READ);
1239 ksft_test_result_fail("mprotect() failed\n");
1240 goto close_comm_pipes;
1242 ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
1244 ksft_test_result_fail("mprotect() failed\n");
1245 goto close_comm_pipes;
1249 case ANON_THP_COLLAPSE_UNSHARED:
1250 /* Collapse before actually COW-sharing the page. */
1251 ret = madvise(mem, size, MADV_COLLAPSE);
1253 ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
1255 goto close_comm_pipes;
1258 case ANON_THP_COLLAPSE_FULLY_SHARED:
1259 /* COW-share the full PTE-mapped THP. */
1261 case ANON_THP_COLLAPSE_LOWER_SHARED:
1262 /* Don't COW-share the upper part of the THP. */
1263 ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
1265 ksft_test_result_fail("MADV_DONTFORK failed\n");
1266 goto close_comm_pipes;
1269 case ANON_THP_COLLAPSE_UPPER_SHARED:
1270 /* Don't COW-share the lower part of the THP. */
1271 ret = madvise(mem, size / 2, MADV_DONTFORK);
1273 ksft_test_result_fail("MADV_DONTFORK failed\n");
1274 goto close_comm_pipes;
1283 ksft_test_result_fail("fork() failed\n");
1284 goto close_comm_pipes;
1287 case ANON_THP_COLLAPSE_UNSHARED:
1288 case ANON_THP_COLLAPSE_FULLY_SHARED:
1289 exit(child_memcmp_fn(mem, size, &comm_pipes));
1291 case ANON_THP_COLLAPSE_LOWER_SHARED:
1292 exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
1294 case ANON_THP_COLLAPSE_UPPER_SHARED:
1295 exit(child_memcmp_fn(mem + size / 2, size / 2,
1303 while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
1307 case ANON_THP_COLLAPSE_UNSHARED:
1309 case ANON_THP_COLLAPSE_UPPER_SHARED:
1310 case ANON_THP_COLLAPSE_LOWER_SHARED:
1312 * Revert MADV_DONTFORK such that we merge the VMAs and are
1313 * able to actually collapse.
1315 ret = madvise(mem, size, MADV_DOFORK);
1317 ksft_test_result_fail("MADV_DOFORK failed\n");
1318 write(comm_pipes.parent_ready[1], "0", 1);
1320 goto close_comm_pipes;
1323 case ANON_THP_COLLAPSE_FULLY_SHARED:
1324 /* Collapse before anyone modified the COW-shared page. */
1325 ret = madvise(mem, size, MADV_COLLAPSE);
1327 ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
1329 write(comm_pipes.parent_ready[1], "0", 1);
1331 goto close_comm_pipes;
1338 /* Modify the page. */
1339 memset(mem, 0xff, size);
1340 write(comm_pipes.parent_ready[1], "0", 1);
1344 ret = WEXITSTATUS(ret);
1348 ksft_test_result(!ret, "No leak from parent into child\n");
1350 close_comm_pipes(&comm_pipes);
1353 static void test_anon_thp_collapse_unshared(char *mem, size_t size,
1356 assert(!is_hugetlb);
1357 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
1360 static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
1363 assert(!is_hugetlb);
1364 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
1367 static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
1370 assert(!is_hugetlb);
1371 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
1374 static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
1377 assert(!is_hugetlb);
1378 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
1382 * Test cases that are specific to anonymous THP: pages in private mappings
1383 * that may get shared via COW during fork().
1385 static const struct test_case anon_thp_test_cases[] = {
1387 * Basic COW test for fork() without any GUP when collapsing a THP
1390 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
1391 * collapse") might easily get COW handling wrong when not collapsing
1392 * exclusivity information properly.
1395 "Basic COW after fork() when collapsing before fork()",
1396 test_anon_thp_collapse_unshared,
1398 /* Basic COW test, but collapse after COW-sharing a full THP. */
1400 "Basic COW after fork() when collapsing after fork() (fully shared)",
1401 test_anon_thp_collapse_fully_shared,
1404 * Basic COW test, but collapse after COW-sharing the lower half of a
1408 "Basic COW after fork() when collapsing after fork() (lower shared)",
1409 test_anon_thp_collapse_lower_shared,
1412 * Basic COW test, but collapse after COW-sharing the upper half of a
1416 "Basic COW after fork() when collapsing after fork() (upper shared)",
1417 test_anon_thp_collapse_upper_shared,
1421 static void run_anon_thp_test_cases(void)
1428 ksft_print_msg("[INFO] Anonymous THP tests\n");
1430 for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1431 struct test_case const *test_case = &anon_thp_test_cases[i];
1433 ksft_print_msg("[RUN] %s\n", test_case->desc);
1434 do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
1438 static int tests_per_anon_thp_test_case(void)
1440 return pmdsize ? 1 : 0;
1443 typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
/*
 * Modify the private mapping (mem) and verify that the second mapping
 * (smem) of the same underlying page still reads the original content,
 * i.e. that COW was broken properly and the sharing did not leak.
 *
 * Fixes vs. excerpt: check the malloc() result before using it, and
 * free the backup buffer (no free() is visible in the original).
 */
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	if (!old) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}
1461 static void test_ro_pin(char *mem, const char *smem, size_t size)
1463 do_test_ro_pin(mem, size, RO_PIN_TEST, false);
1466 static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
1468 do_test_ro_pin(mem, size, RO_PIN_TEST, true);
1471 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1473 char *mem, *smem, tmp;
1475 ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
1477 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1478 MAP_PRIVATE | MAP_ANON, -1, 0);
1479 if (mem == MAP_FAILED) {
1480 ksft_test_result_fail("mmap() failed\n");
1484 smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1485 if (mem == MAP_FAILED) {
1486 ksft_test_result_fail("mmap() failed\n");
1490 /* Read from the page to populate the shared zeropage. */
1492 asm volatile("" : "+r" (tmp));
1494 fn(mem, smem, pagesize);
1496 munmap(mem, pagesize);
1497 if (smem != MAP_FAILED)
1498 munmap(smem, pagesize);
1501 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
1503 char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
1507 ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
1509 if (!has_huge_zeropage) {
1510 ksft_test_result_skip("Huge zeropage not enabled\n");
1514 /* For alignment purposes, we need twice the thp size. */
1515 mmap_size = 2 * pmdsize;
1516 mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1517 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1518 if (mmap_mem == MAP_FAILED) {
1519 ksft_test_result_fail("mmap() failed\n");
1522 mmap_smem = mmap(NULL, mmap_size, PROT_READ,
1523 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1524 if (mmap_smem == MAP_FAILED) {
1525 ksft_test_result_fail("mmap() failed\n");
1529 /* We need a THP-aligned memory area. */
1530 mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
1531 smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
1533 ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
1534 ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
1536 ksft_test_result_fail("MADV_HUGEPAGE failed\n");
1541 * Read from the memory to populate the huge shared zeropage. Read from
1542 * the first sub-page and test if we get another sub-page populated
1546 asm volatile("" : "+r" (tmp));
1547 if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
1548 !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
1549 ksft_test_result_skip("Did not get THPs populated\n");
1553 fn(mem, smem, pmdsize);
1555 munmap(mmap_mem, mmap_size);
1556 if (mmap_smem != MAP_FAILED)
1557 munmap(mmap_smem, mmap_size);
1560 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1562 char *mem, *smem, tmp;
1565 ksft_print_msg("[RUN] %s ... with memfd\n", desc);
1567 fd = memfd_create("test", 0);
1569 ksft_test_result_fail("memfd_create() failed\n");
1573 /* File consists of a single page filled with zeroes. */
1574 if (fallocate(fd, 0, 0, pagesize)) {
1575 ksft_test_result_fail("fallocate() failed\n");
1579 /* Create a private mapping of the memfd. */
1580 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1581 if (mem == MAP_FAILED) {
1582 ksft_test_result_fail("mmap() failed\n");
1585 smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1586 if (mem == MAP_FAILED) {
1587 ksft_test_result_fail("mmap() failed\n");
1591 /* Fault the page in. */
1593 asm volatile("" : "+r" (tmp));
1595 fn(mem, smem, pagesize);
1597 munmap(mem, pagesize);
1598 if (smem != MAP_FAILED)
1599 munmap(smem, pagesize);
1604 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1606 char *mem, *smem, tmp;
1610 ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
1614 ksft_test_result_fail("tmpfile() failed\n");
1620 ksft_test_result_skip("fileno() failed\n");
1624 /* File consists of a single page filled with zeroes. */
1625 if (fallocate(fd, 0, 0, pagesize)) {
1626 ksft_test_result_fail("fallocate() failed\n");
1630 /* Create a private mapping of the memfd. */
1631 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1632 if (mem == MAP_FAILED) {
1633 ksft_test_result_fail("mmap() failed\n");
1636 smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1637 if (mem == MAP_FAILED) {
1638 ksft_test_result_fail("mmap() failed\n");
1642 /* Fault the page in. */
1644 asm volatile("" : "+r" (tmp));
1646 fn(mem, smem, pagesize);
1648 munmap(mem, pagesize);
1649 if (smem != MAP_FAILED)
1650 munmap(smem, pagesize);
1655 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1658 int flags = MFD_HUGETLB;
1659 char *mem, *smem, tmp;
1662 ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
1663 hugetlbsize / 1024);
1665 flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1667 fd = memfd_create("test", flags);
1669 ksft_test_result_skip("memfd_create() failed\n");
1673 /* File consists of a single page filled with zeroes. */
1674 if (fallocate(fd, 0, 0, hugetlbsize)) {
1675 ksft_test_result_skip("need more free huge pages\n");
1679 /* Create a private mapping of the memfd. */
1680 mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1682 if (mem == MAP_FAILED) {
1683 ksft_test_result_skip("need more free huge pages\n");
1686 smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1687 if (mem == MAP_FAILED) {
1688 ksft_test_result_fail("mmap() failed\n");
1692 /* Fault the page in. */
1694 asm volatile("" : "+r" (tmp));
1696 fn(mem, smem, hugetlbsize);
1698 munmap(mem, hugetlbsize);
1699 if (mem != MAP_FAILED)
1700 munmap(smem, hugetlbsize);
1705 struct non_anon_test_case {
1707 non_anon_test_fn fn;
1711 * Test cases that target any pages in private mappings that are not anonymous:
1712 * pages that may get shared via COW ndependent of fork(). This includes
1713 * the shared zeropage(s), pagecache pages, ...
1715 static const struct non_anon_test_case non_anon_test_cases[] = {
1717 * Basic COW test without any GUP. If we miss to break COW, changes are
1718 * visible via other private/shared mappings.
1725 * Take a R/O longterm pin. When modifying the page via the page table,
1726 * the page content change must be visible via the pin.
1729 "R/O longterm GUP pin",
1732 /* Same as above, but using GUP-fast. */
1734 "R/O longterm GUP-fast pin",
1739 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1743 run_with_zeropage(test_case->fn, test_case->desc);
1744 run_with_memfd(test_case->fn, test_case->desc);
1745 run_with_tmpfile(test_case->fn, test_case->desc);
1747 run_with_huge_zeropage(test_case->fn, test_case->desc);
1748 for (i = 0; i < nr_hugetlbsizes; i++)
1749 run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1753 static void run_non_anon_test_cases(void)
1757 ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1759 for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1760 run_non_anon_test_case(&non_anon_test_cases[i]);
1763 static int tests_per_non_anon_test_case(void)
1765 int tests = 3 + nr_hugetlbsizes;
1772 int main(int argc, char **argv)
1775 struct thp_settings default_settings;
1777 ksft_print_header();
1779 pagesize = getpagesize();
1780 pmdsize = read_pmd_pagesize();
1782 /* Only if THP is supported. */
1783 thp_read_settings(&default_settings);
1784 default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
1785 thp_save_settings();
1786 thp_push_settings(&default_settings);
1788 ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
1790 nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
1792 nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
1793 ARRAY_SIZE(hugetlbsizes));
1794 detect_huge_zeropage();
1796 ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
1797 ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
1798 ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
1800 gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
1801 pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
1803 ksft_exit_fail_msg("opening pagemap failed\n");
1805 run_anon_test_cases();
1806 run_anon_thp_test_cases();
1807 run_non_anon_test_cases();
1810 /* Only if THP is supported. */
1811 thp_restore_settings();
1814 err = ksft_get_fail_cnt();
1816 ksft_exit_fail_msg("%d out of %d tests failed\n",
1817 err, ksft_test_num());