s390: add pte_free_defer() for pgtables sharing page
author    Hugh Dickins <[email protected]>    Wed, 12 Jul 2023 04:38:35 +0000 (21:38 -0700)
committer Andrew Morton <[email protected]>    Fri, 18 Aug 2023 17:12:24 +0000 (10:12 -0700)
Add s390-specific pte_free_defer(), to free table page via call_rcu().
pte_free_defer() will be called inside khugepaged's retract_page_tables()
loop, where allocating extra memory cannot be relied upon.  This precedes
the generic version to avoid build breakage from incompatible pgtable_t.

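For context, the generic pte_free_defer() which this patch precedes is little more
than a call_rcu() of the page's rcu_head.  A minimal sketch of that shape follows;
the _generic names are chosen here for illustration only, and pgtable_t is assumed
to be the struct page pointer as on most architectures:

#include <linux/mm.h>
#include <linux/rcupdate.h>

/* Illustration only: not part of this patch. */
static void pte_free_now_generic(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);

	pgtable_pte_page_dtor(page);	/* undo page-table constructor accounting */
	__free_page(page);
}

void pte_free_defer_generic(struct mm_struct *mm, pgtable_t pgtable)
{
	struct page *page = pgtable;	/* assumes pgtable_t == struct page * */

	/* No allocation needed here: just queue the page for RCU freeing */
	call_rcu(&page->rcu_head, pte_free_now_generic);
}
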
This version is more complicated than others: because s390 fits two 2K
page tables into one 4K page (so page->rcu_head must be shared between
both halves), and already uses page->lru (which page->rcu_head overlays)
to list any free halves; with clever management of page->_refcount bits.

Build upon the existing management, adjusted to follow a new rule: that a
page is never on the free list if pte_free_defer() was used on either half
(marked by PageActive).  And for simplicity, delay calling RCU until both
halves are freed.

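To make the _refcount and PageActive remarks concrete, this is the encoding as it
can be read off the masks in the diff below; an illustration of the scheme, not
authoritative documentation:

/*
 * Illustration, derived from the masks used in the patch below:
 *
 *   upper byte of page->_refcount (bits 31..24):  . . P P . . A A
 *     bit 24 (A): lower 2K half is allocated
 *     bit 25 (A): upper 2K half is allocated
 *     bit 28 (P): lower half freed but not yet safe to reuse
 *     bit 29 (P): upper half freed but not yet safe to reuse
 *
 * page_table_free() for half "bit" (0 = lower, 1 = upper) flips A and P
 * together with atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)),
 * does the pgtable_list handling under mm->context.lock, then clears P;
 * page_table_free_rcu() leaves P set until __tlb_remove_table() clears it.
 * The 4K page itself is only freed once no A or P bit remains set.
 *
 * New rule added by this patch: once pte_free_defer() has marked the page
 * PageActive, neither 2K half is ever put (back) on pgtable_list, and the
 * call_rcu() is issued only when the second half is also freed.
 */
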
Not adding back unallocated fragments to the list in pte_free_defer() can
result in wasting some amount of memory for pagetables, depending on how
long the allocated fragment will stay in use.  In practice, this effect is
expected to be insignificant, and not justify a far more complex approach,
which might allow the fragments to be added back later in __tlb_remove_table(),
where we might not have a stable mm any more.

[[email protected]: Claudio finds warning on mm_has_pgste() more useful than on mm_alloc_pgste()]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Hugh Dickins <[email protected]>
Reviewed-by: Gerald Schaefer <[email protected]>
Tested-by: Alexander Gordeev <[email protected]>
Acked-by: Alexander Gordeev <[email protected]>
Cc: Alistair Popple <[email protected]>
Cc: Aneesh Kumar K.V <[email protected]>
Cc: Anshuman Khandual <[email protected]>
Cc: Axel Rasmussen <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Christophe Leroy <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Claudio Imbrenda <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Huang, Ying <[email protected]>
Cc: Ira Weiny <[email protected]>
Cc: Jann Horn <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Miaohe Lin <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Mike Rapoport (IBM) <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Cc: Pavel Tatashin <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: Russell King <[email protected]>
Cc: SeongJae Park <[email protected]>
Cc: Song Liu <[email protected]>
Cc: Steven Price <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Thomas Hellström <[email protected]>
Cc: Vasily Gorbik <[email protected]>
Cc: Vishal Moola (Oracle) <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Yu Zhao <[email protected]>
Cc: Zack Rusin <[email protected]>
Cc: Zi Yan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
arch/s390/include/asm/pgalloc.h
arch/s390/mm/pgalloc.c

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 17eb618f1348ad6cdea47720f2345f18a52f0b29..89a9d5ef94f866cc069b1c12c52dd09ecb507662 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm,
 #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
 #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
 
+/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
 void vmem_map_init(void);
 void *vmem_crst_alloc(unsigned long val);
 pte_t *vmem_pte_alloc(void);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 66ab68db98428cfeec1e5fb10b6a13f4288230d3..d7374add78209e69b48b322daa82bc4e6377bea4 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -229,6 +229,15 @@ void page_table_free_pgste(struct page *page)
  * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
  * while the PP bits are never used, nor such a page is added to or removed
  * from mm_context_t::pgtable_list.
+ *
+ * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
+ * and prevents both 2K fragments from being reused. pte_free_defer() has to
+ * guarantee that its pgtable cannot be reused before the RCU grace period
+ * has elapsed (which page_table_free_rcu() does not actually guarantee).
+ * But for simplicity, because page->rcu_head overlays page->lru, and because
+ * the RCU callback might not be called before the mm_context_t has been freed,
+ * pte_free_defer() in this implementation prevents both fragments from being
+ * reused, and delays making the call to RCU until both fragments are freed.
  */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
@@ -261,7 +270,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                                        table += PTRS_PER_PTE;
                                atomic_xor_bits(&page->_refcount,
                                                        0x01U << (bit + 24));
-                               list_del(&page->lru);
+                               list_del_init(&page->lru);
                        }
                }
                spin_unlock_bh(&mm->context.lock);
@@ -281,6 +290,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
        table = (unsigned long *) page_to_virt(page);
        if (mm_alloc_pgste(mm)) {
                /* Return 4K page table with PGSTEs */
+               INIT_LIST_HEAD(&page->lru);
                atomic_xor_bits(&page->_refcount, 0x03U << 24);
                memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
                memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
@@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table,
 {
        char msg[128];
 
-       if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+       if (!IS_ENABLED(CONFIG_DEBUG_VM))
+               return;
+       if (!mask && list_empty(&page->lru))
                return;
        snprintf(msg, sizeof(msg),
                 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
@@ -308,6 +320,15 @@ static void page_table_release_check(struct page *page, void *table,
        dump_page(page, msg);
 }
 
+static void pte_free_now(struct rcu_head *head)
+{
+       struct page *page;
+
+       page = container_of(head, struct page, rcu_head);
+       pgtable_pte_page_dtor(page);
+       __free_page(page);
+}
+
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
        unsigned int mask, bit, half;
@@ -325,10 +346,17 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
                 */
                mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
                mask >>= 24;
-               if (mask & 0x03U)
+               if ((mask & 0x03U) && !PageActive(page)) {
+                       /*
+                        * Other half is allocated, and neither half has had
+                        * its free deferred: add page to head of list, to make
+                        * this freed half available for immediate reuse.
+                        */
                        list_add(&page->lru, &mm->context.pgtable_list);
-               else
-                       list_del(&page->lru);
+               } else {
+                       /* If page is on list, now remove it. */
+                       list_del_init(&page->lru);
+               }
                spin_unlock_bh(&mm->context.lock);
                mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
                mask >>= 24;
@@ -342,8 +370,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
        }
 
        page_table_release_check(page, table, half, mask);
-       pgtable_pte_page_dtor(page);
-       __free_page(page);
+       if (TestClearPageActive(page))
+               call_rcu(&page->rcu_head, pte_free_now);
+       else
+               pte_free_now(&page->rcu_head);
 }
 
 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
@@ -370,10 +400,18 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
         */
        mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
        mask >>= 24;
-       if (mask & 0x03U)
+       if ((mask & 0x03U) && !PageActive(page)) {
+               /*
+                * Other half is allocated, and neither half has had
+                * its free deferred: add page to end of list, to make
+                * this freed half available for reuse once its pending
+                * bit has been cleared by __tlb_remove_table().
+                */
                list_add_tail(&page->lru, &mm->context.pgtable_list);
-       else
-               list_del(&page->lru);
+       } else {
+               /* If page is on list, now remove it. */
+               list_del_init(&page->lru);
+       }
        spin_unlock_bh(&mm->context.lock);
        table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
        tlb_remove_table(tlb, table);
@@ -403,9 +441,27 @@ void __tlb_remove_table(void *_table)
        }
 
        page_table_release_check(page, table, half, mask);
-       pgtable_pte_page_dtor(page);
-       __free_page(page);
+       if (TestClearPageActive(page))
+               call_rcu(&page->rcu_head, pte_free_now);
+       else
+               pte_free_now(&page->rcu_head);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+       struct page *page;
+
+       page = virt_to_page(pgtable);
+       SetPageActive(page);
+       page_table_free(mm, (unsigned long *)pgtable);
+       /*
+        * page_table_free() does not do the pgste gmap_unlink() which
+        * page_table_free_rcu() does: warn us if pgste ever reaches here.
+        */
+       WARN_ON_ONCE(mm_has_pgste(mm));
 }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
  * Base infrastructure required to generate basic asces, region, segment,
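
As a usage illustration of the new interface (not the actual khugepaged code; the
function name and locals here are assumptions for the sketch), a caller in the
situation the commit message describes, where extra memory cannot be allocated,
would detach the page table and hand it to RCU:

#include <linux/mm.h>
#include <linux/pgtable.h>
#include <asm/pgalloc.h>

/* Illustration only: roughly how retract_page_tables() is expected to
 * use pte_free_defer() after disconnecting an empty page table. */
static void retract_one_table(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmd)
{
	pmd_t pmdval;

	/* Detach the page-table page from the pmd and flush the TLB */
	pmdval = pmdp_collapse_flush(vma, addr, pmd);
	mm_dec_nr_ptes(mm);

	/* No allocation possible here, so defer the free to an RCU callback */
	pte_free_defer(mm, pmd_pgtable(pmdval));
}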