]> Git Repo - linux.git/commitdiff
mm/thp: fix deferred split queue not partially_mapped
authorHugh Dickins <[email protected]>
Sun, 27 Oct 2024 19:59:34 +0000 (12:59 -0700)
committerAndrew Morton <[email protected]>
Wed, 6 Nov 2024 00:49:54 +0000 (16:49 -0800)
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without).  The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.

The new unlocked list_del_init() in deferred_split_scan() is buggy.  I
gave bad advice, it looks plausible since that's a local on-stack list,
but the fact is that it can race with a third party freeing or migrating
the preceding folio (properly unqueueing it with refcount 0 while holding
split_queue_lock), thereby corrupting the list linkage.

The obvious answer would be to take split_queue_lock there: but it has a
long history of contention, so I'm reluctant to add to that.  Instead,
make sure that there is always one safe (raised refcount) folio before, by
delaying its folio_put().  (And of course I was wrong to suggest updating
split_queue_len without the lock: leave that until the splice.)

And remove two over-eager partially_mapped checks, restoring those tests
to how they were before: if uncharge_folio() or free_tail_page_prepare()
finds _deferred_list non-empty, it's in trouble whether or not that folio
is partially_mapped (and the flag was already cleared in the latter case).

Link: https://lkml.kernel.org/r/[email protected]
Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <[email protected]>
Acked-by: Usama Arif <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
Reviewed-by: Baolin Wang <[email protected]>
Acked-by: Zi Yan <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Kefeng Wang <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Nhat Pham <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Wei Yang <[email protected]>
Cc: Yang Shi <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
mm/huge_memory.c
mm/memcontrol.c
mm/page_alloc.c

index 2fb328880b509e356315dd814169b91528a57b97..a1d345f1680cb1609b176b20a31cd7e319574e30 100644 (file)
@@ -3718,8 +3718,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
        struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
        unsigned long flags;
        LIST_HEAD(list);
-       struct folio *folio, *next;
-       int split = 0;
+       struct folio *folio, *next, *prev = NULL;
+       int split = 0, removed = 0;
 
 #ifdef CONFIG_MEMCG
        if (sc->memcg)
@@ -3775,15 +3775,28 @@ next:
                 */
                if (!did_split && !folio_test_partially_mapped(folio)) {
                        list_del_init(&folio->_deferred_list);
-                       ds_queue->split_queue_len--;
+                       removed++;
+               } else {
+                       /*
+                        * That unlocked list_del_init() above would be unsafe,
+                        * unless its folio is separated from any earlier folios
+                        * left on the list (which may be concurrently unqueued)
+                        * by one safe folio with refcount still raised.
+                        */
+                       swap(folio, prev);
                }
-               folio_put(folio);
+               if (folio)
+                       folio_put(folio);
        }
 
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        list_splice_tail(&list, &ds_queue->split_queue);
+       ds_queue->split_queue_len -= removed;
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
+       if (prev)
+               folio_put(prev);
+
        /*
         * Stop shrinker if we didn't split any page, but the queue is empty.
         * This can happen if pages were freed under us.
index 7845c64a2c570b2857134ded7b501e5b9fd3b6f4..2703227cce88ba8fc82c4c07b9d9bf4c14f0fdf2 100644 (file)
@@ -4631,8 +4631,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
                        !folio_test_hugetlb(folio) &&
-                       !list_empty(&folio->_deferred_list) &&
-                       folio_test_partially_mapped(folio), folio);
+                       !list_empty(&folio->_deferred_list), folio);
 
        /*
         * Nobody should be changing or seriously looking at
index 94a2ffe2800897f4635afc3d9cbace88b7236ea5..5e108ae755cc94479868dafc48b51f3dca78898e 100644 (file)
@@ -961,9 +961,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
                break;
        case 2:
                /* the second tail page: deferred_list overlaps ->mapping */
-               if (unlikely(!list_empty(&folio->_deferred_list) &&
-                   folio_test_partially_mapped(folio))) {
-                       bad_page(page, "partially mapped folio on deferred list");
+               if (unlikely(!list_empty(&folio->_deferred_list))) {
+                       bad_page(page, "on deferred list");
                        goto out;
                }
                break;
This page took 0.079078 seconds and 4 git commands to generate.