mm: split underused THPs
author: Usama Arif <usamaarif642@gmail.com>
Fri, 30 Aug 2024 10:03:39 +0000 (11:03 +0100)
committer: Andrew Morton <akpm@linux-foundation.org>
Mon, 9 Sep 2024 23:39:04 +0000 (16:39 -0700)
This is an attempt to mitigate the issue of running out of memory when THP
is always enabled.  During runtime whenever a THP is being faulted in
(__do_huge_pmd_anonymous_page) or collapsed by khugepaged
(collapse_huge_page), the THP is added to the _deferred_list.  Whenever
memory reclaim happens in Linux, the kernel runs the deferred_split shrinker
which goes through the _deferred_list.

If the folio was partially mapped, the shrinker attempts to split it.  If
the folio is not partially mapped, the shrinker checks whether the THP is
underused, i.e. whether the number of zero-filled base 4K pages in the
entire THP is above a certain threshold (decided by
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).  If it is,
the shrinker will attempt to split that THP.  Then at remap time, the pages
that were zero-filled are mapped to the shared zeropage, hence saving
memory.

Link: https://lkml.kernel.org/r/20240830100438.3623486-6-usamaarif642@gmail.com
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Suggested-by: Rik van Riel <riel@surriel.com>
Co-authored-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Alexander Zhu <alexlzhu@fb.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuang Zhai <zhais@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Shuang Zhai <szhai2@cs.rochester.edu>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/admin-guide/mm/transhuge.rst
include/linux/khugepaged.h
include/linux/vm_event_item.h
mm/huge_memory.c
mm/khugepaged.c
mm/vmstat.c

index 56a086900651509fe318bc106d634d4c0486e3ea..aca0cff852b852b3b40428a29bf3d001fd1002a5 100644 (file)
@@ -471,6 +471,12 @@ thp_deferred_split_page
        splitting it would free up some memory. Pages on split queue are
        going to be split under memory pressure.
 
+thp_underused_split_page
+       is incremented when a huge page on the split queue was split
+       because it was underused. A THP is underused if the number of
+       zero pages in the THP is above a certain threshold
+       (/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
+
 thp_split_pmd
        is incremented every time a PMD split into table of PTEs.
        This can happen, for instance, when application calls mprotect() or
index f68865e19b0b0efad301c72b8749d8deec9d2431..30baae91b2255f708665bc5fbbc9a7cd63ca3be1 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
 
+extern unsigned int khugepaged_max_ptes_none __read_mostly;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
 
index aae5c7c5cfb4e174b065a99577604ea8292d818b..aed952d04132b6bf86b06c0194a4b4b35b211034 100644 (file)
@@ -105,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                THP_SPLIT_PAGE,
                THP_SPLIT_PAGE_FAILED,
                THP_DEFERRED_SPLIT_PAGE,
+               THP_UNDERUSED_SPLIT_PAGE,
                THP_SPLIT_PMD,
                THP_SCAN_EXCEED_NONE_PTE,
                THP_SCAN_EXCEED_SWAP_PTE,
index 9bc435bef2e62f6ea22a55109f91fe95d722bf63..cec5bce046a0d1d7fabb303036e1b889258e7f8d 100644 (file)
@@ -1187,6 +1187,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(vma->vm_mm);
+               deferred_split_folio(folio, false);
                spin_unlock(vmf->ptl);
                count_vm_event(THP_FAULT_ALLOC);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -3608,6 +3609,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
        return READ_ONCE(ds_queue->split_queue_len);
 }
 
+/*
+ * thp_underused - decide whether a folio has too many zero-filled pages.
+ * @folio: the folio to inspect.
+ *
+ * Maps each page of @folio in turn and tests whether the whole page is
+ * zero-filled.  Returns true as soon as the number of zero-filled pages
+ * exceeds khugepaged_max_ptes_none, and false otherwise.  Every page that
+ * is mapped is unmapped again before returning.
+ */
+static bool thp_underused(struct folio *folio)
+{
+       int num_zero_pages = 0, num_filled_pages = 0;
+       void *kaddr;
+       int i;
+
+       /*
+        * With max_ptes_none at HPAGE_PMD_NR - 1 the scan is skipped
+        * entirely and the folio is never reported as underused.
+        */
+       if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
+               return false;
+
+       for (i = 0; i < folio_nr_pages(folio); i++) {
+               kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
+               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
+                       num_zero_pages++;
+                       if (num_zero_pages > khugepaged_max_ptes_none) {
+                               kunmap_local(kaddr);
+                               return true;
+                       }
+               } else {
+                       /*
+                        * Second early exit: once this many pages hold
+                        * data, the zero-filled count can no longer exceed
+                        * khugepaged_max_ptes_none, so stop scanning.
+                        * NOTE(review): this bound is computed from
+                        * HPAGE_PMD_NR while the loop runs to
+                        * folio_nr_pages(); presumably the folio is always
+                        * PMD-sized here - confirm against callers.
+                        */
+                       num_filled_pages++;
+                       if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+                               kunmap_local(kaddr);
+                               return false;
+                       }
+               }
+               kunmap_local(kaddr);
+       }
+       return false;
+}
+
 static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
 {
@@ -3645,13 +3679,35 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
        list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+               bool did_split = false;
+               bool underused = false;
+
+               if (!folio_test_partially_mapped(folio)) {
+                       underused = thp_underused(folio);
+                       if (!underused)
+                               goto next;
+               }
                if (!folio_trylock(folio))
                        goto next;
-               /* split_huge_page() removes page from list on success */
-               if (!split_folio(folio))
+               if (!split_folio(folio)) {
+                       did_split = true;
+                       if (underused)
+                               count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
                        split++;
+               }
                folio_unlock(folio);
 next:
+               /*
+                * split_folio() removes folio from list on success.
+                * Only add back to the queue if folio is partially mapped.
+                * If thp_underused returns false, or if split_folio fails
+                * in the case it was underused, then consider it used and
+                * don't add it back to split_queue.
+                */
+               if (!did_split && !folio_test_partially_mapped(folio)) {
+                       list_del_init(&folio->_deferred_list);
+                       ds_queue->split_queue_len--;
+               }
                folio_put(folio);
        }
 
index ab646018ce25db0decc8a5d50737b3da5b7e32c1..32100041aef3a7b077fa5a0bc4d9f1e66a157544 100644 (file)
@@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  *
  * Note that these are only respected if collapse was initiated by khugepaged.
  */
-static unsigned int khugepaged_max_ptes_none __read_mostly;
+unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
 static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
@@ -1237,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
+       deferred_split_folio(folio, false);
        spin_unlock(pmd_ptl);
 
        folio = NULL;
index aea58e9fce60c1edec8b4da80f1a7913571e1e57..b5a4cea423e1753c2f01cd9a54fd142e8db1cf56 100644 (file)
@@ -1385,6 +1385,7 @@ const char * const vmstat_text[] = {
        "thp_split_page",
        "thp_split_page_failed",
        "thp_deferred_split_page",
+       "thp_underused_split_page",
        "thp_split_pmd",
        "thp_scan_exceed_none_pte",
        "thp_scan_exceed_swap_pte",
This page took 0.121771 seconds and 4 git commands to generate.