Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
author Linus Torvalds <[email protected]>
Sat, 21 Sep 2019 17:07:42 +0000 (10:07 -0700)
committer Linus Torvalds <[email protected]>
Sat, 21 Sep 2019 17:07:42 +0000 (10:07 -0700)
Pull hmm updates from Jason Gunthorpe:
 "This is more cleanup and consolidation of the hmm APIs and the very
  strongly related mmu_notifier interfaces. Many places across the tree
  using these interfaces are touched in the process. Beyond that, a
  cleanup to the page walker API and a few memremap-related changes
  round out the series:

   - General improvement of hmm_range_fault() and related APIs, more
     documentation, bug fixes from testing, API simplification &
     consolidation, and unused API removal

   - Simplify the hmm related kconfigs to HMM_MIRROR and DEVICE_PRIVATE,
     and make them internal kconfig selects

   - Hoist a lot of code related to mmu notifier attachment out of
     drivers by using a refcount get/put attachment idiom and remove the
     convoluted mmu_notifier_unregister_no_release() and related APIs.

   - General API improvement for the migrate_vma API and revision of its
     only user in nouveau

   - Annotate mmu_notifiers with lockdep and sleeping region debugging

  Two series unrelated to HMM or mmu_notifiers came along due to
  dependencies:

   - Allow pagemap's memremap_pages family of APIs to work without
     providing a struct device

   - Make walk_page_range() and related use a constant structure for
     function pointers"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits)
  libnvdimm: Enable unit test infrastructure compile checks
  mm, notifier: Catch sleeping/blocking for !blockable
  kernel.h: Add non_block_start/end()
  drm/radeon: guard against calling an unpaired radeon_mn_unregister()
  csky: add missing brackets in a macro for tlb.h
  pagewalk: use lockdep_assert_held for locking validation
  pagewalk: separate function pointers from iterator data
  mm: split out a new pagewalk.h header from mm.h
  mm/mmu_notifiers: annotate with might_sleep()
  mm/mmu_notifiers: prime lockdep
  mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
  mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports
  mm/hmm: hmm_range_fault() infinite loop
  mm/hmm: hmm_range_fault() NULL pointer bug
  mm/hmm: fix hmm_range_fault()'s handling of swapped out pages
  mm/mmu_notifiers: remove unregister_no_release
  RDMA/odp: remove ib_ucontext from ib_umem
  RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
  RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
  RDMA/mlx5: Use ib_umem_start instead of umem.address
  ...

20 files changed:
arch/s390/mm/gmap.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/nouveau/nouveau_drm.c
drivers/gpu/drm/radeon/radeon.h
drivers/gpu/drm/radeon/radeon_device.c
drivers/gpu/drm/radeon/radeon_drv.c
drivers/gpu/drm/radeon/radeon_mn.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c
include/linux/sched.h
kernel/fork.c
kernel/sched/core.c
mm/madvise.c
mm/memcontrol.c
mm/page_alloc.c

diff --combined arch/s390/mm/gmap.c
index cd8e03f04d6daac463af57b15fe26dcc7ddc658d,bd78d504fdade86dcaac497702d9aa69d571a7c8..edcdca97e85eeecbddb4fb650aaa5c0432a60259
@@@ -9,7 -9,7 +9,7 @@@
   */
  
  #include <linux/kernel.h>
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
  #include <linux/swap.h>
  #include <linux/smp.h>
  #include <linux/spinlock.h>
@@@ -67,7 -67,7 +67,7 @@@ static struct gmap *gmap_alloc(unsigne
        INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
        spin_lock_init(&gmap->guest_table_lock);
        spin_lock_init(&gmap->shadow_lock);
 -      atomic_set(&gmap->ref_count, 1);
 +      refcount_set(&gmap->ref_count, 1);
        page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
        if (!page)
                goto out_free;
@@@ -214,7 -214,7 +214,7 @@@ static void gmap_free(struct gmap *gmap
   */
  struct gmap *gmap_get(struct gmap *gmap)
  {
 -      atomic_inc(&gmap->ref_count);
 +      refcount_inc(&gmap->ref_count);
        return gmap;
  }
  EXPORT_SYMBOL_GPL(gmap_get);
   */
  void gmap_put(struct gmap *gmap)
  {
 -      if (atomic_dec_return(&gmap->ref_count) == 0)
 +      if (refcount_dec_and_test(&gmap->ref_count))
                gmap_free(gmap);
  }
  EXPORT_SYMBOL_GPL(gmap_put);
@@@ -1594,7 -1594,7 +1594,7 @@@ static struct gmap *gmap_find_shadow(st
                        continue;
                if (!sg->initialized)
                        return ERR_PTR(-EAGAIN);
 -              atomic_inc(&sg->ref_count);
 +              refcount_inc(&sg->ref_count);
                return sg;
        }
        return NULL;
@@@ -1682,7 -1682,7 +1682,7 @@@ struct gmap *gmap_shadow(struct gmap *p
                        }
                }
        }
 -      atomic_set(&new->ref_count, 2);
 +      refcount_set(&new->ref_count, 2);
        list_add(&new->list, &parent->children);
        if (asce & _ASCE_REAL_SPACE) {
                /* nothing to protect, return right away */
@@@ -2521,13 -2521,9 +2521,9 @@@ static int __zap_zero_pages(pmd_t *pmd
        return 0;
  }
  
- static inline void zap_zero_pages(struct mm_struct *mm)
- {
-       struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
-       walk.mm = mm;
-       walk_page_range(0, TASK_SIZE, &walk);
- }
+ static const struct mm_walk_ops zap_zero_walk_ops = {
+       .pmd_entry      = __zap_zero_pages,
+ };
  
  /*
   * switch on pgstes for its userspace process (for kvm)
@@@ -2546,7 -2542,7 +2542,7 @@@ int s390_enable_sie(void
        mm->context.has_pgste = 1;
        /* split thp mappings and disable thp for future mappings */
        thp_split_mm(mm);
-       zap_zero_pages(mm);
+       walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
        up_write(&mm->mmap_sem);
        return 0;
  }
@@@ -2589,12 -2585,13 +2585,13 @@@ static int __s390_enable_skey_hugetlb(p
        return 0;
  }
  
+ static const struct mm_walk_ops enable_skey_walk_ops = {
+       .hugetlb_entry          = __s390_enable_skey_hugetlb,
+       .pte_entry              = __s390_enable_skey_pte,
+ };
  int s390_enable_skey(void)
  {
-       struct mm_walk walk = {
-               .hugetlb_entry = __s390_enable_skey_hugetlb,
-               .pte_entry = __s390_enable_skey_pte,
-       };
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        int rc = 0;
        }
        mm->def_flags &= ~VM_MERGEABLE;
  
-       walk.mm = mm;
-       walk_page_range(0, TASK_SIZE, &walk);
+       walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
  
  out_up:
        up_write(&mm->mmap_sem);
@@@ -2633,13 -2629,14 +2629,14 @@@ static int __s390_reset_cmma(pte_t *pte
        return 0;
  }
  
+ static const struct mm_walk_ops reset_cmma_walk_ops = {
+       .pte_entry              = __s390_reset_cmma,
+ };
  void s390_reset_cmma(struct mm_struct *mm)
  {
-       struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
        down_write(&mm->mmap_sem);
-       walk.mm = mm;
-       walk_page_range(0, TASK_SIZE, &walk);
+       walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
        up_write(&mm->mmap_sem);
  }
  EXPORT_SYMBOL_GPL(s390_reset_cmma);
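
The three gmap.c conversions above show the reworked page-walk API from the pull message: the callbacks move out of an on-stack struct mm_walk into a file-scope const struct mm_walk_ops, and walk_page_range() now takes the mm, the ops table and a private cookie directly. A minimal sketch of the new calling convention, with hypothetical my_* names and error handling omitted:

#include <linux/pagewalk.h>

static int my_pte_entry(pte_t *pte, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
{
	/* walk->private carries the cookie passed to walk_page_range() */
	/* inspect or modify *pte for [addr, next) here */
	return 0;
}

static const struct mm_walk_ops my_walk_ops = {
	.pte_entry	= my_pte_entry,
};

static void my_walk(struct mm_struct *mm, void *cookie)
{
	down_read(&mm->mmap_sem);	/* the walker now asserts mmap_sem is held */
	walk_page_range(mm, 0, TASK_SIZE, &my_walk_ops, cookie);
	up_read(&mm->mmap_sem);
}
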
index 48a2070e72f2cbc20245ac7e8e7c409b66f0ba49,9a05a37603bdc72f041b1f8e79ed16d73809f039..bdf849da32e42e11d28017e59a18e47d9ca41898
@@@ -35,6 -35,7 +35,7 @@@
  #include <linux/pm_runtime.h>
  #include <linux/vga_switcheroo.h>
  #include <drm/drm_probe_helper.h>
+ #include <linux/mmu_notifier.h>
  
  #include "amdgpu.h"
  #include "amdgpu_irq.h"
   * - 3.31.0 - Add support for per-flip tiling attribute changes with DC
   * - 3.32.0 - Add syncobj timeline support to AMDGPU_CS.
   * - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS.
 + * - 3.34.0 - Non-DC can flip correctly between buffers with different pitches
   */
  #define KMS_DRIVER_MAJOR      3
 -#define KMS_DRIVER_MINOR      33
 +#define KMS_DRIVER_MINOR      34
  #define KMS_DRIVER_PATCHLEVEL 0
  
  #define AMDGPU_MAX_TIMEOUT_PARAM_LENTH        256
@@@ -143,7 -143,7 +144,7 @@@ int amdgpu_async_gfx_ring = 1
  int amdgpu_mcbp = 0;
  int amdgpu_discovery = -1;
  int amdgpu_mes = 0;
 -int amdgpu_noretry;
 +int amdgpu_noretry = 1;
  
  struct amdgpu_mgpu_info mgpu_info = {
        .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@@ -611,7 -611,7 +612,7 @@@ MODULE_PARM_DESC(mes
  module_param_named(mes, amdgpu_mes, int, 0444);
  
  MODULE_PARM_DESC(noretry,
 -      "Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");
 +      "Disable retry faults (0 = retry enabled, 1 = retry disabled (default))");
  module_param_named(noretry, amdgpu_noretry, int, 0644);
  
  #ifdef CONFIG_HSA_AMD
@@@ -997,11 -997,6 +998,11 @@@ static const struct pci_device_id pciid
        /* Raven */
        {0x1002, 0x15dd, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU},
        {0x1002, 0x15d8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU},
 +      /* Arcturus */
 +      {0x1002, 0x738C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
 +      {0x1002, 0x7388, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
 +      {0x1002, 0x738E, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
 +      {0x1002, 0x7390, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
        /* Navi10 */
        {0x1002, 0x7310, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
        {0x1002, 0x7312, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
        {0x1002, 0x731A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
        {0x1002, 0x731B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
        {0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
 +      /* Navi14 */
 +      {0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14},
 +
 +      /* Renoir */
 +      {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT},
  
        {0, 0, 0}
  };
@@@ -1103,21 -1093,21 +1104,21 @@@ amdgpu_pci_shutdown(struct pci_dev *pde
         * unfortunately we can't detect certain
         * hypervisors so just do this all the time.
         */
 +      adev->mp1_state = PP_MP1_STATE_UNLOAD;
        amdgpu_device_ip_suspend(adev);
 +      adev->mp1_state = PP_MP1_STATE_NONE;
  }
  
  static int amdgpu_pmops_suspend(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
        return amdgpu_device_suspend(drm_dev, true, true);
  }
  
  static int amdgpu_pmops_resume(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
        /* GPU comes up enabled by the bios on resume */
        if (amdgpu_device_is_px(drm_dev)) {
  
  static int amdgpu_pmops_freeze(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
        return amdgpu_device_suspend(drm_dev, false, true);
  }
  
  static int amdgpu_pmops_thaw(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
        return amdgpu_device_resume(drm_dev, false, true);
  }
  
  static int amdgpu_pmops_poweroff(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
        return amdgpu_device_suspend(drm_dev, true, true);
  }
  
  static int amdgpu_pmops_restore(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
        return amdgpu_device_resume(drm_dev, false, true);
  }
  
@@@ -1212,7 -1206,8 +1213,7 @@@ static int amdgpu_pmops_runtime_resume(
  
  static int amdgpu_pmops_runtime_idle(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
        struct drm_crtc *crtc;
  
        if (!amdgpu_device_is_px(drm_dev)) {
@@@ -1379,7 -1374,7 +1380,7 @@@ static struct drm_driver kms_driver = 
        .driver_features =
            DRIVER_USE_AGP | DRIVER_ATOMIC |
            DRIVER_GEM |
 -          DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
 +          DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
        .load = amdgpu_driver_load_kms,
        .open = amdgpu_driver_open_kms,
        .postclose = amdgpu_driver_postclose_kms,
        .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
        .gem_prime_export = amdgpu_gem_prime_export,
        .gem_prime_import = amdgpu_gem_prime_import,
 -      .gem_prime_res_obj = amdgpu_gem_prime_res_obj,
        .gem_prime_get_sg_table = amdgpu_gem_prime_get_sg_table,
        .gem_prime_import_sg_table = amdgpu_gem_prime_import_sg_table,
        .gem_prime_vmap = amdgpu_gem_prime_vmap,
@@@ -1469,6 -1465,7 +1470,7 @@@ static void __exit amdgpu_exit(void
        amdgpu_unregister_atpx_handler();
        amdgpu_sync_fini();
        amdgpu_fence_slab_fini();
+       mmu_notifier_synchronize();
  }
  
  module_init(amdgpu_init);
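
The mmu_notifier_synchronize() call added to the exit path above (nouveau and radeon gain the same call later in this diff) pairs with the get/put idiom: a module whose notifier memory is released through free_notifier() has to wait until it is safe for its notifier ops to no longer be called before its code can be unloaded. A sketch with a hypothetical module:

#include <linux/mmu_notifier.h>
#include <linux/module.h>

static void __exit my_driver_exit(void)
{
	/* tear down driver state first ... */

	/*
	 * ... then make sure no free_notifier()/notifier callback can still
	 * run in this module's code after unload.
	 */
	mmu_notifier_synchronize();
}
module_exit(my_driver_exit);
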
index f1f8cdd695d3f6a556c74351c3eb8dc4547e200e,60b9fc9561d7f5467776be71fcde8b60a2d6afbb..31d4deb5d294846aa6fc66c9f7ec8a7963604854
@@@ -179,7 -179,7 +179,7 @@@ static void amdgpu_mn_invalidate_node(s
                if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
                        continue;
  
 -              r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
 +              r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
                        true, false, MAX_SCHEDULE_TIMEOUT);
                if (r <= 0)
                        DRM_ERROR("(%ld) failed to wait for user bo\n", r);
   * Block for operations on BOs to finish and mark pages as accessed and
   * potentially dirty.
   */
- static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
-                       const struct hmm_update *update)
+ static int
+ amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
+                             const struct mmu_notifier_range *update)
  {
        struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
        unsigned long start = update->start;
        unsigned long end = update->end;
-       bool blockable = update->blockable;
+       bool blockable = mmu_notifier_range_blockable(update);
        struct interval_tree_node *it;
  
        /* notification is exclusive, but interval is inclusive */
   * necessitates evicting all user-mode queues of the process. The BOs
   * are restorted in amdgpu_mn_invalidate_range_end_hsa.
   */
- static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
-                       const struct hmm_update *update)
+ static int
+ amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
+                             const struct mmu_notifier_range *update)
  {
        struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
        unsigned long start = update->start;
        unsigned long end = update->end;
-       bool blockable = update->blockable;
+       bool blockable = mmu_notifier_range_blockable(update);
        struct interval_tree_node *it;
  
        /* notification is exclusive, but interval is inclusive */
@@@ -482,6 -484,5 +484,5 @@@ void amdgpu_hmm_init_range(struct hmm_r
                range->flags = hmm_range_flags;
                range->values = hmm_range_values;
                range->pfn_shift = PAGE_SHIFT;
-               INIT_LIST_HEAD(&range->list);
        }
  }
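
Earlier in this file's diff, the hmm_mirror sync_pagetables callbacks switch from struct hmm_update to a const struct mmu_notifier_range and query mmu_notifier_range_blockable() instead of a blockable field. A rough sketch of how such a callback might honor the non-blocking case, using a hypothetical my_mirror type and a plain mutex (the real amdgpu code uses its own locking):

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_mirror {
	struct hmm_mirror	mirror;
	struct mutex		lock;
};

static int my_sync_pagetables(struct hmm_mirror *mirror,
			      const struct mmu_notifier_range *update)
{
	struct my_mirror *m = container_of(mirror, struct my_mirror, mirror);

	if (mmu_notifier_range_blockable(update))
		mutex_lock(&m->lock);
	else if (!mutex_trylock(&m->lock))
		return -EAGAIN;	/* caller retries from a blockable context */

	/* ... invalidate device mappings covering [update->start, update->end) ... */

	mutex_unlock(&m->lock);
	return 0;
}
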
index 13b144c8f67d68c2c261d16efbfea8772ff0a95b,8bf79288c4e29b240c45c23eef024b720be7f49c..dff41d0a85fe969b7bc96323ba66d235d210fed3
@@@ -227,7 -227,7 +227,7 @@@ static int amdgpu_verify_access(struct 
  
        if (amdgpu_ttm_tt_get_usermm(bo->ttm))
                return -EPERM;
 -      return drm_vma_node_verify_access(&abo->gem_base.vma_node,
 +      return drm_vma_node_verify_access(&abo->tbo.base.vma_node,
                                          filp->private_data);
  }
  
@@@ -303,7 -303,7 +303,7 @@@ int amdgpu_ttm_copy_mem_to_mem(struct a
                               struct amdgpu_copy_mem *src,
                               struct amdgpu_copy_mem *dst,
                               uint64_t size,
 -                             struct reservation_object *resv,
 +                             struct dma_resv *resv,
                               struct dma_fence **f)
  {
        struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
@@@ -440,26 -440,10 +440,26 @@@ static int amdgpu_move_blit(struct ttm_
  
        r = amdgpu_ttm_copy_mem_to_mem(adev, &src, &dst,
                                       new_mem->num_pages << PAGE_SHIFT,
 -                                     bo->resv, &fence);
 +                                     bo->base.resv, &fence);
        if (r)
                goto error;
  
 +      /* clear the space being freed */
 +      if (old_mem->mem_type == TTM_PL_VRAM &&
 +          (ttm_to_amdgpu_bo(bo)->flags &
 +           AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
 +              struct dma_fence *wipe_fence = NULL;
 +
 +              r = amdgpu_fill_buffer(ttm_to_amdgpu_bo(bo), AMDGPU_POISON,
 +                                     NULL, &wipe_fence);
 +              if (r) {
 +                      goto error;
 +              } else if (wipe_fence) {
 +                      dma_fence_put(fence);
 +                      fence = wipe_fence;
 +              }
 +      }
 +
        /* Always block for VM page tables before committing the new location */
        if (bo->type == ttm_bo_type_kernel)
                r = ttm_bo_move_accel_cleanup(bo, fence, true, new_mem);
@@@ -794,7 -778,6 +794,6 @@@ int amdgpu_ttm_tt_get_user_pages(struc
        struct hmm_range *range;
        unsigned long i;
        uint64_t *pfns;
-       int retry = 0;
        int r = 0;
  
        if (!mm) /* Happens during process shutdown */
                                0 : range->flags[HMM_PFN_WRITE];
        range->pfn_flags_mask = 0;
        range->pfns = pfns;
-       hmm_range_register(range, mirror, start,
-                          start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT);
+       range->start = start;
+       range->end = start + ttm->num_pages * PAGE_SIZE;
+       hmm_range_register(range, mirror);
  
- retry:
        /*
         * Just wait for range to be valid, safe to ignore return value as we
         * will use the return value of hmm_range_fault() below under the
        hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
  
        down_read(&mm->mmap_sem);
-       r = hmm_range_fault(range, true);
-       if (unlikely(r < 0)) {
-               if (likely(r == -EAGAIN)) {
-                       /*
-                        * return -EAGAIN, mmap_sem is dropped
-                        */
-                       if (retry++ < MAX_RETRY_HMM_RANGE_FAULT)
-                               goto retry;
-                       else
-                               pr_err("Retry hmm fault too many times\n");
-               }
-               goto out_up_read;
-       }
+       r = hmm_range_fault(range, 0);
        up_read(&mm->mmap_sem);
  
+       if (unlikely(r < 0))
+               goto out_free_pfns;
        for (i = 0; i < ttm->num_pages; i++) {
                pages[i] = hmm_device_entry_to_page(range, pfns[i]);
                if (unlikely(!pages[i])) {
  
        return 0;
  
- out_up_read:
-       if (likely(r != -EAGAIN))
-               up_read(&mm->mmap_sem);
  out_free_pfns:
        hmm_range_unregister(range);
        kvfree(pfns);
@@@ -1486,7 -1455,7 +1471,7 @@@ static bool amdgpu_ttm_bo_eviction_valu
  {
        unsigned long num_pages = bo->mem.num_pages;
        struct drm_mm_node *node = bo->mem.mm_node;
 -      struct reservation_object_list *flist;
 +      struct dma_resv_list *flist;
        struct dma_fence *f;
        int i;
  
         * cleanly handle page faults.
         */
        if (bo->type == ttm_bo_type_kernel &&
 -          !reservation_object_test_signaled_rcu(bo->resv, true))
 +          !dma_resv_test_signaled_rcu(bo->base.resv, true))
                return false;
  
        /* If bo is a KFD BO, check if the bo belongs to the current process.
         * If true, then return false as any KFD process needs all its BOs to
         * be resident to run successfully
         */
 -      flist = reservation_object_get_list(bo->resv);
 +      flist = dma_resv_get_list(bo->base.resv);
        if (flist) {
                for (i = 0; i < flist->shared_count; ++i) {
                        f = rcu_dereference_protected(flist->shared[i],
 -                              reservation_object_held(bo->resv));
 +                              dma_resv_held(bo->base.resv));
                        if (amdkfd_fence_check_mm(f, current->mm))
                                return false;
                }
@@@ -1615,7 -1584,6 +1600,7 @@@ static struct ttm_bo_driver amdgpu_bo_d
        .move = &amdgpu_bo_move,
        .verify_access = &amdgpu_verify_access,
        .move_notify = &amdgpu_bo_move_notify,
 +      .release_notify = &amdgpu_bo_release_notify,
        .fault_reserve_notify = &amdgpu_bo_fault_reserve_notify,
        .io_mem_reserve = &amdgpu_ttm_io_mem_reserve,
        .io_mem_free = &amdgpu_ttm_io_mem_free,
@@@ -1738,7 -1706,6 +1723,7 @@@ int amdgpu_ttm_init(struct amdgpu_devic
        uint64_t gtt_size;
        int r;
        u64 vis_vram_limit;
 +      void *stolen_vga_buf;
  
        mutex_init(&adev->mman.gtt_window_lock);
  
        r = ttm_bo_device_init(&adev->mman.bdev,
                               &amdgpu_bo_driver,
                               adev->ddev->anon_inode->i_mapping,
 -                             adev->need_dma32);
 +                             dma_addressing_limited(adev->dev));
        if (r) {
                DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
                return r;
        r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE,
                                    AMDGPU_GEM_DOMAIN_VRAM,
                                    &adev->stolen_vga_memory,
 -                                  NULL, NULL);
 +                                  NULL, &stolen_vga_buf);
        if (r)
                return r;
        DRM_INFO("amdgpu: %uM of VRAM memory ready\n",
   */
  void amdgpu_ttm_late_init(struct amdgpu_device *adev)
  {
 +      void *stolen_vga_buf;
        /* return the VGA stolen memory (if any) back to VRAM */
 -      amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
 +      amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
  }
  
  /**
@@@ -2011,7 -1977,7 +1996,7 @@@ error_free
  
  int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
                       uint64_t dst_offset, uint32_t byte_count,
 -                     struct reservation_object *resv,
 +                     struct dma_resv *resv,
                       struct dma_fence **fence, bool direct_submit,
                       bool vm_needs_flush)
  {
@@@ -2085,7 -2051,7 +2070,7 @@@ error_free
  
  int amdgpu_fill_buffer(struct amdgpu_bo *bo,
                       uint32_t src_data,
 -                     struct reservation_object *resv,
 +                     struct dma_resv *resv,
                       struct dma_fence **fence)
  {
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
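
The amdgpu_ttm.c hunks above show the consolidated hmm_range_fault() flow: the range now carries its own start/end, hmm_range_register() takes just the mirror, the driver-side -EAGAIN retry loop is gone, and hmm_range_fault() takes a flags argument (0 here). A stripped-down sketch of that flow with a hypothetical caller and minimal error handling:

#include <linux/hmm.h>

static long my_fault_range(struct hmm_mirror *mirror, struct hmm_range *range,
			   struct mm_struct *mm)
{
	long ret;

	/* range->start, range->end, range->pfns etc. are filled in by the caller */
	hmm_range_register(range, mirror);

	/* wait for any concurrent invalidation to finish before faulting */
	hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, 0);	/* negative on error, checked as in the hunk above */
	up_read(&mm->mmap_sem);

	if (ret < 0)
		hmm_range_unregister(range);
	return ret;
}
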
index 3bb75d11a6627339bde0a9769b78b0bb9cf94b2a,9450e20d17093bd4670422c4356c09fbbe1f1dd6..c89326125d71170509531a8ffc434db3101f1908
@@@ -195,7 -195,6 +195,7 @@@ struct kfd_event_interrupt_class 
  
  struct kfd_device_info {
        enum amd_asic_type asic_family;
 +      const char *asic_name;
        const struct kfd_event_interrupt_class *event_interrupt_class;
        unsigned int max_pasid_bits;
        unsigned int max_no_of_hqd;
@@@ -687,9 -686,6 +687,6 @@@ struct kfd_process 
        /* We want to receive a notification when the mm_struct is destroyed */
        struct mmu_notifier mmu_notifier;
  
-       /* Use for delayed freeing of kfd_process structure */
-       struct rcu_head rcu;
        unsigned int pasid;
        unsigned int doorbell_index;
  
index 0c6ac043ae3cc54cad8284e3eda06eb027361d22,e5e326f2f2675edc88dce8eede0a93a33d2cc4ae..40e3fc0c6942120b83e724038a537ed11f756e9f
@@@ -62,8 -62,8 +62,8 @@@ static struct workqueue_struct *kfd_res
  
  static struct kfd_process *find_process(const struct task_struct *thread);
  static void kfd_process_ref_release(struct kref *ref);
- static struct kfd_process *create_process(const struct task_struct *thread,
-                                       struct file *filep);
+ static struct kfd_process *create_process(const struct task_struct *thread);
+ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
  
  static void evict_process_worker(struct work_struct *work);
  static void restore_process_worker(struct work_struct *work);
@@@ -289,7 -289,15 +289,15 @@@ struct kfd_process *kfd_create_process(
        if (process) {
                pr_debug("Process already found\n");
        } else {
-               process = create_process(thread, filep);
+               process = create_process(thread);
+               if (IS_ERR(process))
+                       goto out;
+               ret = kfd_process_init_cwsr_apu(process, filep);
+               if (ret) {
+                       process = ERR_PTR(ret);
+                       goto out;
+               }
  
                if (!procfs.kobj)
                        goto out;
@@@ -478,11 -486,9 +486,9 @@@ static void kfd_process_ref_release(str
        queue_work(kfd_process_wq, &p->release_work);
  }
  
- static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+ static void kfd_process_free_notifier(struct mmu_notifier *mn)
  {
-       struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
-       kfd_unref_process(p);
+       kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
  }
  
  static void kfd_process_notifier_release(struct mmu_notifier *mn,
  
        mutex_unlock(&p->mutex);
  
-       mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
-       mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
+       mmu_notifier_put(&p->mmu_notifier);
  }
  
  static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
        .release = kfd_process_notifier_release,
+       .free_notifier = kfd_process_free_notifier,
  };
  
  static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
@@@ -609,81 -615,69 +615,69 @@@ static int kfd_process_device_init_cwsr
        return 0;
  }
  
- static struct kfd_process *create_process(const struct task_struct *thread,
-                                       struct file *filep)
+ /*
+  * On return the kfd_process is fully operational and will be freed when the
+  * mm is released
+  */
+ static struct kfd_process *create_process(const struct task_struct *thread)
  {
        struct kfd_process *process;
        int err = -ENOMEM;
  
        process = kzalloc(sizeof(*process), GFP_KERNEL);
        if (!process)
                goto err_alloc_process;
  
-       process->pasid = kfd_pasid_alloc();
-       if (process->pasid == 0)
-               goto err_alloc_pasid;
-       if (kfd_alloc_process_doorbells(process) < 0)
-               goto err_alloc_doorbells;
        kref_init(&process->ref);
        mutex_init(&process->mutex);
        process->mm = thread->mm;
-       /* register notifier */
-       process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
-       err = mmu_notifier_register(&process->mmu_notifier, process->mm);
-       if (err)
-               goto err_mmu_notifier;
-       hash_add_rcu(kfd_processes_table, &process->kfd_processes,
-                       (uintptr_t)process->mm);
        process->lead_thread = thread->group_leader;
-       get_task_struct(process->lead_thread);
        INIT_LIST_HEAD(&process->per_device_data);
+       INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
+       INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
+       process->last_restore_timestamp = get_jiffies_64();
        kfd_event_init_process(process);
+       process->is_32bit_user_mode = in_compat_syscall();
+       process->pasid = kfd_pasid_alloc();
+       if (process->pasid == 0)
+               goto err_alloc_pasid;
+       if (kfd_alloc_process_doorbells(process) < 0)
+               goto err_alloc_doorbells;
  
        err = pqm_init(&process->pqm, process);
        if (err != 0)
                goto err_process_pqm_init;
  
        /* init process apertures*/
-       process->is_32bit_user_mode = in_compat_syscall();
        err = kfd_init_apertures(process);
        if (err != 0)
                goto err_init_apertures;
  
-       INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
-       INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
-       process->last_restore_timestamp = get_jiffies_64();
-       err = kfd_process_init_cwsr_apu(process, filep);
+       /* Must be last, have to use release destruction after this */
+       process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
+       err = mmu_notifier_register(&process->mmu_notifier, process->mm);
        if (err)
-               goto err_init_cwsr;
+               goto err_register_notifier;
+       get_task_struct(process->lead_thread);
+       hash_add_rcu(kfd_processes_table, &process->kfd_processes,
+                       (uintptr_t)process->mm);
  
        return process;
  
- err_init_cwsr:
+ err_register_notifier:
        kfd_process_free_outstanding_kfd_bos(process);
        kfd_process_destroy_pdds(process);
  err_init_apertures:
        pqm_uninit(&process->pqm);
  err_process_pqm_init:
-       hash_del_rcu(&process->kfd_processes);
-       synchronize_rcu();
-       mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
- err_mmu_notifier:
-       mutex_destroy(&process->mutex);
        kfd_free_process_doorbells(process);
  err_alloc_doorbells:
        kfd_pasid_free(process->pasid);
  err_alloc_pasid:
+       mutex_destroy(&process->mutex);
        kfree(process);
  err_alloc_process:
        return ERR_PTR(err);
@@@ -801,8 -795,6 +795,8 @@@ int kfd_process_device_init_vm(struct k
                return ret;
        }
  
 +      amdgpu_vm_set_task_info(pdd->vm);
 +
        ret = kfd_process_device_reserve_ib_mem(pdd);
        if (ret)
                goto err_reserve_ib_mem;
@@@ -1044,6 -1036,7 +1038,6 @@@ static void restore_process_worker(stru
  {
        struct delayed_work *dwork;
        struct kfd_process *p;
 -      struct kfd_process_device *pdd;
        int ret = 0;
  
        dwork = to_delayed_work(work);
         * lifetime of this thread, kfd_process p will be valid
         */
        p = container_of(dwork, struct kfd_process, restore_work);
 -
 -      /* Call restore_process_bos on the first KGD device. This function
 -       * takes care of restoring the whole process including other devices.
 -       * Restore can fail if enough memory is not available. If so,
 -       * reschedule again.
 -       */
 -      pdd = list_first_entry(&p->per_device_data,
 -                             struct kfd_process_device,
 -                             per_device_list);
 -
        pr_debug("Started restoring pasid %d\n", p->pasid);
  
        /* Setting last_restore_timestamp before successful restoration.
index bdc948352467a97fa702b522992f9027b8c8e714,a0e48a482452d70a6592d27b06ffb57667beeb6c..2cd83849600f34b2ff22ca9990af042e4ad8a16b
  #include <linux/pci.h>
  #include <linux/pm_runtime.h>
  #include <linux/vga_switcheroo.h>
+ #include <linux/mmu_notifier.h>
  
 -#include <drm/drmP.h>
  #include <drm/drm_crtc_helper.h>
 +#include <drm/drm_ioctl.h>
 +#include <drm/drm_vblank.h>
  
  #include <core/gpuobj.h>
  #include <core/option.h>
@@@ -1047,20 -1047,20 +1048,20 @@@ nouveau_drm_postclose(struct drm_devic
  
  static const struct drm_ioctl_desc
  nouveau_ioctls[] = {
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GETPARAM, nouveau_abi16_ioctl_getparam, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_SETPARAM, nouveau_abi16_ioctl_setparam, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_ALLOC, nouveau_abi16_ioctl_channel_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_FREE, nouveau_abi16_ioctl_channel_free, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_AUTH|DRM_RENDER_ALLOW),
 -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_AUTH|DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GETPARAM, nouveau_abi16_ioctl_getparam, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_SETPARAM, drm_invalid_op, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_ALLOC, nouveau_abi16_ioctl_channel_alloc, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_FREE, nouveau_abi16_ioctl_channel_free, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_RENDER_ALLOW),
 +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_RENDER_ALLOW),
  };
  
  long
@@@ -1106,7 -1106,7 +1107,7 @@@ nouveau_driver_fops = 
  static struct drm_driver
  driver_stub = {
        .driver_features =
 -              DRIVER_GEM | DRIVER_MODESET | DRIVER_PRIME | DRIVER_RENDER
 +              DRIVER_GEM | DRIVER_MODESET | DRIVER_RENDER
  #if defined(CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT)
                | DRIVER_KMS_LEGACY_CONTEXT
  #endif
  
        .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
        .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 -      .gem_prime_export = drm_gem_prime_export,
 -      .gem_prime_import = drm_gem_prime_import,
        .gem_prime_pin = nouveau_gem_prime_pin,
 -      .gem_prime_res_obj = nouveau_gem_prime_res_obj,
        .gem_prime_unpin = nouveau_gem_prime_unpin,
        .gem_prime_get_sg_table = nouveau_gem_prime_get_sg_table,
        .gem_prime_import_sg_table = nouveau_gem_prime_import_sg_table,
@@@ -1290,6 -1293,8 +1291,8 @@@ nouveau_drm_exit(void
  #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER
        platform_driver_unregister(&nouveau_platform_driver);
  #endif
+       if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM))
+               mmu_notifier_synchronize();
  }
  
  module_init(nouveau_drm_init);
index 05b88491ccb9962ccbfb251ba9b8cdc8ed4faf4f,918164f90b114acbfe36256382ed9d8ae4ee209e..d59b004f6695831b4e6c9e71a180bde7a65988eb
@@@ -505,6 -505,7 +505,6 @@@ struct radeon_bo 
        struct list_head                va;
        /* Constant after initialization */
        struct radeon_device            *rdev;
 -      struct drm_gem_object           gem_base;
  
        struct ttm_bo_kmap_obj          dma_buf_vmap;
        pid_t                           pid;
        struct radeon_mn                *mn;
        struct list_head                mn_list;
  };
 -#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, gem_base)
 +#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, tbo.base)
  
  int radeon_gem_debugfs_init(struct radeon_device *rdev);
  
@@@ -619,7 -620,7 +619,7 @@@ void radeon_sync_fence(struct radeon_sy
                       struct radeon_fence *fence);
  int radeon_sync_resv(struct radeon_device *rdev,
                     struct radeon_sync *sync,
 -                   struct reservation_object *resv,
 +                   struct dma_resv *resv,
                     bool shared);
  int radeon_sync_rings(struct radeon_device *rdev,
                      struct radeon_sync *sync,
@@@ -1912,20 -1913,20 +1912,20 @@@ struct radeon_asic 
                                             uint64_t src_offset,
                                             uint64_t dst_offset,
                                             unsigned num_gpu_pages,
 -                                           struct reservation_object *resv);
 +                                           struct dma_resv *resv);
                u32 blit_ring_index;
                struct radeon_fence *(*dma)(struct radeon_device *rdev,
                                            uint64_t src_offset,
                                            uint64_t dst_offset,
                                            unsigned num_gpu_pages,
 -                                          struct reservation_object *resv);
 +                                          struct dma_resv *resv);
                u32 dma_ring_index;
                /* method used for bo copy */
                struct radeon_fence *(*copy)(struct radeon_device *rdev,
                                             uint64_t src_offset,
                                             uint64_t dst_offset,
                                             unsigned num_gpu_pages,
 -                                           struct reservation_object *resv);
 +                                           struct dma_resv *resv);
                /* ring used for bo copies */
                u32 copy_ring_index;
        } copy;
@@@ -2386,6 -2387,7 +2386,6 @@@ struct radeon_device 
        struct radeon_wb                wb;
        struct radeon_dummy_page        dummy_page;
        bool                            shutdown;
 -      bool                            need_dma32;
        bool                            need_swiotlb;
        bool                            accel_working;
        bool                            fastfb_working; /* IGP feature*/
        /* tracking pinned memory */
        u64 vram_pin_size;
        u64 gart_pin_size;
-       struct mutex    mn_lock;
-       DECLARE_HASHTABLE(mn_hash, 7);
  };
  
  bool radeon_is_px(struct drm_device *dev);
index 88eb7cb522bb7dedbad56a28f2e6176d72e31e93,788b1d8a80e660392f7b55502e21700fad618b53..5d017f0aec665c9996435b2a211e83596e065757
@@@ -1325,8 -1325,6 +1325,6 @@@ int radeon_device_init(struct radeon_de
        init_rwsem(&rdev->pm.mclk_lock);
        init_rwsem(&rdev->exclusive_lock);
        init_waitqueue_head(&rdev->irq.vblank_queue);
-       mutex_init(&rdev->mn_lock);
-       hash_init(rdev->mn_hash);
        r = radeon_gem_init(rdev);
        if (r)
                return r;
        else
                rdev->mc.mc_mask = 0xffffffffULL; /* 32 bit MC */
  
 -      /* set DMA mask + need_dma32 flags.
 +      /* set DMA mask.
         * PCIE - can handle 40-bits.
         * IGP - can handle 40-bits
         * AGP - generally dma32 is safest
         * PCI - dma32 for legacy pci gart, 40 bits on newer asics
         */
 -      rdev->need_dma32 = false;
 +      dma_bits = 40;
        if (rdev->flags & RADEON_IS_AGP)
 -              rdev->need_dma32 = true;
 +              dma_bits = 32;
        if ((rdev->flags & RADEON_IS_PCI) &&
            (rdev->family <= CHIP_RS740))
 -              rdev->need_dma32 = true;
 +              dma_bits = 32;
  #ifdef CONFIG_PPC64
        if (rdev->family == CHIP_CEDAR)
 -              rdev->need_dma32 = true;
 +              dma_bits = 32;
  #endif
  
 -      dma_bits = rdev->need_dma32 ? 32 : 40;
 -      r = pci_set_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
 +      r = dma_set_mask_and_coherent(&rdev->pdev->dev, DMA_BIT_MASK(dma_bits));
        if (r) {
 -              rdev->need_dma32 = true;
 -              dma_bits = 32;
                pr_warn("radeon: No suitable DMA available\n");
 -      }
 -      r = pci_set_consistent_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
 -      if (r) {
 -              pci_set_consistent_dma_mask(rdev->pdev, DMA_BIT_MASK(32));
 -              pr_warn("radeon: No coherent DMA available\n");
 +              return r;
        }
        rdev->need_swiotlb = drm_need_swiotlb(dma_bits);
  
index 5838162f687fe33a8fdeb2611398e55800ec8f3d,b6535ac91fdb7458ef1f8cef6a6288a7481dbfa2..431e6b64b77db6914dedbe24f122e72eaddf9de9
@@@ -35,6 -35,7 +35,7 @@@
  #include <linux/module.h>
  #include <linux/pm_runtime.h>
  #include <linux/vga_switcheroo.h>
+ #include <linux/mmu_notifier.h>
  
  #include <drm/drm_crtc_helper.h>
  #include <drm/drm_drv.h>
@@@ -130,7 -131,8 +131,7 @@@ int radeon_gem_object_open(struct drm_g
                                struct drm_file *file_priv);
  void radeon_gem_object_close(struct drm_gem_object *obj,
                                struct drm_file *file_priv);
 -struct dma_buf *radeon_gem_prime_export(struct drm_device *dev,
 -                                      struct drm_gem_object *gobj,
 +struct dma_buf *radeon_gem_prime_export(struct drm_gem_object *gobj,
                                        int flags);
  extern int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int crtc,
                                      unsigned int flags, int *vpos, int *hpos,
@@@ -152,6 -154,7 +153,6 @@@ struct drm_gem_object *radeon_gem_prime
                                                        struct sg_table *sg);
  int radeon_gem_prime_pin(struct drm_gem_object *obj);
  void radeon_gem_prime_unpin(struct drm_gem_object *obj);
 -struct reservation_object *radeon_gem_prime_res_obj(struct drm_gem_object *);
  void *radeon_gem_prime_vmap(struct drm_gem_object *obj);
  void radeon_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr);
  
@@@ -347,30 -350,24 +348,30 @@@ radeon_pci_remove(struct pci_dev *pdev
  static void
  radeon_pci_shutdown(struct pci_dev *pdev)
  {
 +      struct drm_device *ddev = pci_get_drvdata(pdev);
 +
        /* if we are running in a VM, make sure the device
         * torn down properly on reboot/shutdown
         */
        if (radeon_device_is_virtual())
                radeon_pci_remove(pdev);
 +
 +      /* Some adapters need to be suspended before a
 +      * shutdown occurs in order to prevent an error
 +      * during kexec.
 +      */
 +      radeon_suspend_kms(ddev, true, true, false);
  }
  
  static int radeon_pmops_suspend(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
        return radeon_suspend_kms(drm_dev, true, true, false);
  }
  
  static int radeon_pmops_resume(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
  
        /* GPU comes up enabled by the bios on resume */
        if (radeon_is_px(drm_dev)) {
  
  static int radeon_pmops_freeze(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
        return radeon_suspend_kms(drm_dev, false, true, true);
  }
  
  static int radeon_pmops_thaw(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
        return radeon_resume_kms(drm_dev, false, true);
  }
  
@@@ -449,7 -448,8 +450,7 @@@ static int radeon_pmops_runtime_resume(
  
  static int radeon_pmops_runtime_idle(struct device *dev)
  {
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
 +      struct drm_device *drm_dev = dev_get_drvdata(dev);
        struct drm_crtc *crtc;
  
        if (!radeon_is_px(drm_dev)) {
@@@ -540,7 -540,7 +541,7 @@@ radeon_get_crtc_scanout_position(struc
  
  static struct drm_driver kms_driver = {
        .driver_features =
 -          DRIVER_USE_AGP | DRIVER_GEM | DRIVER_PRIME | DRIVER_RENDER,
 +          DRIVER_USE_AGP | DRIVER_GEM | DRIVER_RENDER,
        .load = radeon_driver_load_kms,
        .open = radeon_driver_open_kms,
        .postclose = radeon_driver_postclose_kms,
        .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
        .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
        .gem_prime_export = radeon_gem_prime_export,
 -      .gem_prime_import = drm_gem_prime_import,
        .gem_prime_pin = radeon_gem_prime_pin,
        .gem_prime_unpin = radeon_gem_prime_unpin,
 -      .gem_prime_res_obj = radeon_gem_prime_res_obj,
        .gem_prime_get_sg_table = radeon_gem_prime_get_sg_table,
        .gem_prime_import_sg_table = radeon_gem_prime_import_sg_table,
        .gem_prime_vmap = radeon_gem_prime_vmap,
@@@ -623,6 -625,7 +624,7 @@@ static void __exit radeon_exit(void
  {
        pci_unregister_driver(pdriver);
        radeon_unregister_atpx_handler();
+       mmu_notifier_synchronize();
  }
  
  module_init(radeon_init);
index 6902f998ede9068acbe859aa1d5552abac64a294,1ee20d528a7c2465f9a4c3f4aa523ad5433b78d0..dbab9a3a969b9e3a49bffa49b2bd127df7d34b5e
  #include "radeon.h"
  
  struct radeon_mn {
-       /* constant after initialisation */
-       struct radeon_device    *rdev;
-       struct mm_struct        *mm;
        struct mmu_notifier     mn;
  
-       /* only used on destruction */
-       struct work_struct      work;
-       /* protected by rdev->mn_lock */
-       struct hlist_node       node;
        /* objects protected by lock */
        struct mutex            lock;
        struct rb_root_cached   objects;
@@@ -58,55 -49,6 +49,6 @@@ struct radeon_mn_node 
        struct list_head                bos;
  };
  
- /**
-  * radeon_mn_destroy - destroy the rmn
-  *
-  * @work: previously sheduled work item
-  *
-  * Lazy destroys the notifier from a work item
-  */
- static void radeon_mn_destroy(struct work_struct *work)
- {
-       struct radeon_mn *rmn = container_of(work, struct radeon_mn, work);
-       struct radeon_device *rdev = rmn->rdev;
-       struct radeon_mn_node *node, *next_node;
-       struct radeon_bo *bo, *next_bo;
-       mutex_lock(&rdev->mn_lock);
-       mutex_lock(&rmn->lock);
-       hash_del(&rmn->node);
-       rbtree_postorder_for_each_entry_safe(node, next_node,
-                                            &rmn->objects.rb_root, it.rb) {
-               interval_tree_remove(&node->it, &rmn->objects);
-               list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
-                       bo->mn = NULL;
-                       list_del_init(&bo->mn_list);
-               }
-               kfree(node);
-       }
-       mutex_unlock(&rmn->lock);
-       mutex_unlock(&rdev->mn_lock);
-       mmu_notifier_unregister(&rmn->mn, rmn->mm);
-       kfree(rmn);
- }
- /**
-  * radeon_mn_release - callback to notify about mm destruction
-  *
-  * @mn: our notifier
-  * @mn: the mm this callback is about
-  *
-  * Shedule a work item to lazy destroy our notifier.
-  */
- static void radeon_mn_release(struct mmu_notifier *mn,
-                             struct mm_struct *mm)
- {
-       struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
-       INIT_WORK(&rmn->work, radeon_mn_destroy);
-       schedule_work(&rmn->work);
- }
  /**
   * radeon_mn_invalidate_range_start - callback to notify about mm change
   *
@@@ -163,7 -105,7 +105,7 @@@ static int radeon_mn_invalidate_range_s
                                continue;
                        }
  
 -                      r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
 +                      r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
                                true, false, MAX_SCHEDULE_TIMEOUT);
                        if (r <= 0)
                                DRM_ERROR("(%ld) failed to wait for user bo\n", r);
@@@ -183,65 -125,44 +125,44 @@@ out_unlock
        return ret;
  }
  
- static const struct mmu_notifier_ops radeon_mn_ops = {
-       .release = radeon_mn_release,
-       .invalidate_range_start = radeon_mn_invalidate_range_start,
- };
+ static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+       struct mmu_notifier_range range = {
+               .mm = mm,
+               .start = 0,
+               .end = ULONG_MAX,
+               .flags = 0,
+               .event = MMU_NOTIFY_UNMAP,
+       };
+       radeon_mn_invalidate_range_start(mn, &range);
+ }
  
- /**
-  * radeon_mn_get - create notifier context
-  *
-  * @rdev: radeon device pointer
-  *
-  * Creates a notifier context for current->mm.
-  */
- static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
+ static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
  {
-       struct mm_struct *mm = current->mm;
        struct radeon_mn *rmn;
-       int r;
-       if (down_write_killable(&mm->mmap_sem))
-               return ERR_PTR(-EINTR);
-       mutex_lock(&rdev->mn_lock);
-       hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm)
-               if (rmn->mm == mm)
-                       goto release_locks;
  
        rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
-       if (!rmn) {
-               rmn = ERR_PTR(-ENOMEM);
-               goto release_locks;
-       }
+       if (!rmn)
+               return ERR_PTR(-ENOMEM);
  
-       rmn->rdev = rdev;
-       rmn->mm = mm;
-       rmn->mn.ops = &radeon_mn_ops;
        mutex_init(&rmn->lock);
        rmn->objects = RB_ROOT_CACHED;
-       
-       r = __mmu_notifier_register(&rmn->mn, mm);
-       if (r)
-               goto free_rmn;
-       hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm);
- release_locks:
-       mutex_unlock(&rdev->mn_lock);
-       up_write(&mm->mmap_sem);
-       return rmn;
- free_rmn:
-       mutex_unlock(&rdev->mn_lock);
-       up_write(&mm->mmap_sem);
-       kfree(rmn);
+       return &rmn->mn;
+ }
  
-       return ERR_PTR(r);
+ static void radeon_mn_free_notifier(struct mmu_notifier *mn)
+ {
+       kfree(container_of(mn, struct radeon_mn, mn));
  }
  
+ static const struct mmu_notifier_ops radeon_mn_ops = {
+       .release = radeon_mn_release,
+       .invalidate_range_start = radeon_mn_invalidate_range_start,
+       .alloc_notifier = radeon_mn_alloc_notifier,
+       .free_notifier = radeon_mn_free_notifier,
+ };
  /**
   * radeon_mn_register - register a BO for notifier updates
   *
  int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
  {
        unsigned long end = addr + radeon_bo_size(bo) - 1;
-       struct radeon_device *rdev = bo->rdev;
+       struct mmu_notifier *mn;
        struct radeon_mn *rmn;
        struct radeon_mn_node *node = NULL;
        struct list_head bos;
        struct interval_tree_node *it;
  
-       rmn = radeon_mn_get(rdev);
-       if (IS_ERR(rmn))
-               return PTR_ERR(rmn);
+       mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
+       if (IS_ERR(mn))
+               return PTR_ERR(mn);
+       rmn = container_of(mn, struct radeon_mn, mn);
  
        INIT_LIST_HEAD(&bos);
  
   */
  void radeon_mn_unregister(struct radeon_bo *bo)
  {
-       struct radeon_device *rdev = bo->rdev;
-       struct radeon_mn *rmn;
+       struct radeon_mn *rmn = bo->mn;
        struct list_head *head;
  
-       mutex_lock(&rdev->mn_lock);
-       rmn = bo->mn;
-       if (rmn == NULL) {
-               mutex_unlock(&rdev->mn_lock);
+       if (!rmn)
                return;
-       }
  
        mutex_lock(&rmn->lock);
        /* save the next list entry for later */
        head = bo->mn_list.next;
  
-       bo->mn = NULL;
        list_del(&bo->mn_list);
  
        if (list_empty(head)) {
        }
  
        mutex_unlock(&rmn->lock);
-       mutex_unlock(&rdev->mn_lock);
+       mmu_notifier_put(&rmn->mn);
+       bo->mn = NULL;
  }
index 4e9f1507ffd9409dd42701ec4962cd0cb417ee62,4ba73a95475a98791f97e6c6087d80eba456cd42..bface798ee590705f342e9046d85c2b526c15e52
@@@ -1023,7 -1023,7 +1023,7 @@@ static int mlx5_ib_query_device(struct 
        props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
  
        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 -              if (MLX5_CAP_GEN(mdev, pg))
 +              if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
                        props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
                props->odp_caps = dev->odp_caps;
        }
@@@ -1867,10 -1867,6 +1867,6 @@@ static int mlx5_ib_alloc_ucontext(struc
        if (err)
                goto out_sys_pages;
  
-       if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
-               context->ibucontext.invalidate_range =
-                       &mlx5_ib_invalidate_range;
        if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
                err = mlx5_ib_devx_create(dev, true);
                if (err < 0)
@@@ -1999,11 -1995,6 +1995,6 @@@ static void mlx5_ib_dealloc_ucontext(st
        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
        struct mlx5_bfreg_info *bfregi;
  
-       /* All umem's must be destroyed before destroying the ucontext. */
-       mutex_lock(&ibcontext->per_mm_list_lock);
-       WARN_ON(!list_empty(&ibcontext->per_mm_list));
-       mutex_unlock(&ibcontext->per_mm_list_lock);
        bfregi = &context->bfregi;
        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
  
@@@ -2280,7 -2271,6 +2271,7 @@@ static inline int check_dm_type_support
                        return -EOPNOTSUPP;
                break;
        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 +      case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
                if (!capable(CAP_SYS_RAWIO) ||
                    !capable(CAP_NET_RAW))
                        return -EPERM;
@@@ -2345,20 -2335,20 +2336,20 @@@ static int handle_alloc_dm_sw_icm(struc
                                  struct uverbs_attr_bundle *attrs,
                                  int type)
  {
 -      struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
 +      struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
        u64 act_size;
        int err;
  
        /* Allocation size must a multiple of the basic block size
         * and a power of 2.
         */
 -      act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev));
 +      act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
        act_size = roundup_pow_of_two(act_size);
  
        dm->size = act_size;
 -      err = mlx5_cmd_alloc_sw_icm(dm_db, type, act_size,
 -                                  to_mucontext(ctx)->devx_uid, &dm->dev_addr,
 -                                  &dm->icm_dm.obj_id);
 +      err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
 +                                 to_mucontext(ctx)->devx_uid, &dm->dev_addr,
 +                                 &dm->icm_dm.obj_id);
        if (err)
                return err;
  
                             MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
                             &dm->dev_addr, sizeof(dm->dev_addr));
        if (err)
 -              mlx5_cmd_dealloc_sw_icm(dm_db, type, dm->size,
 -                                      to_mucontext(ctx)->devx_uid,
 -                                      dm->dev_addr, dm->icm_dm.obj_id);
 +              mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
 +                                     to_mucontext(ctx)->devx_uid, dm->dev_addr,
 +                                     dm->icm_dm.obj_id);
  
        return err;
  }
@@@ -2408,14 -2398,8 +2399,14 @@@ struct ib_dm *mlx5_ib_alloc_dm(struct i
                                            attrs);
                break;
        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 +              err = handle_alloc_dm_sw_icm(context, dm,
 +                                           attr, attrs,
 +                                           MLX5_SW_ICM_TYPE_STEERING);
 +              break;
        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
 -              err = handle_alloc_dm_sw_icm(context, dm, attr, attrs, type);
 +              err = handle_alloc_dm_sw_icm(context, dm,
 +                                           attr, attrs,
 +                                           MLX5_SW_ICM_TYPE_HEADER_MODIFY);
                break;
        default:
                err = -EOPNOTSUPP;
@@@ -2435,7 -2419,6 +2426,7 @@@ int mlx5_ib_dealloc_dm(struct ib_dm *ib
  {
        struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
                &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 +      struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
        struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
        struct mlx5_ib_dm *dm = to_mdm(ibdm);
        u32 page_idx;
                if (ret)
                        return ret;
  
 -              page_idx = (dm->dev_addr -
 -                          pci_resource_start(dm_db->dev->pdev, 0) -
 -                          MLX5_CAP64_DEV_MEM(dm_db->dev,
 -                                             memic_bar_start_addr)) >>
 -                         PAGE_SHIFT;
 +              page_idx = (dm->dev_addr - pci_resource_start(dev->pdev, 0) -
 +                          MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr)) >>
 +                          PAGE_SHIFT;
                bitmap_clear(ctx->dm_pages, page_idx,
                             DIV_ROUND_UP(dm->size, PAGE_SIZE));
                break;
        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 +              ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
 +                                           dm->size, ctx->devx_uid, dm->dev_addr,
 +                                           dm->icm_dm.obj_id);
 +              if (ret)
 +                      return ret;
 +              break;
        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
 -              ret = mlx5_cmd_dealloc_sw_icm(dm_db, dm->type, dm->size,
 -                                            ctx->devx_uid, dm->dev_addr,
 -                                            dm->icm_dm.obj_id);
 +              ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
 +                                           dm->size, ctx->devx_uid, dm->dev_addr,
 +                                           dm->icm_dm.obj_id);
                if (ret)
                        return ret;
                break;
@@@ -2658,8 -2637,7 +2649,8 @@@ int parse_flow_flow_action(struct mlx5_
                        if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
                                return -EINVAL;
                        action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
 -                      action->modify_id = maction->flow_action_raw.action_id;
 +                      action->modify_hdr =
 +                              maction->flow_action_raw.modify_hdr;
                        return 0;
                }
                if (maction->flow_action_raw.sub_type ==
                                return -EINVAL;
                        action->action |=
                                MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
 -                      action->reformat_id =
 -                              maction->flow_action_raw.action_id;
 +                      action->pkt_reformat =
 +                              maction->flow_action_raw.pkt_reformat;
                        return 0;
                }
                /* fall through */
@@@ -6109,6 -6087,8 +6100,6 @@@ static struct ib_counters *mlx5_ib_crea
  
  static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
  {
 -      struct mlx5_core_dev *mdev = dev->mdev;
 -
        mlx5_ib_cleanup_multiport_master(dev);
        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                srcu_barrier(&dev->mr_srcu);
        }
  
        WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
 -
 -      WARN_ON(dev->dm.steering_sw_icm_alloc_blocks &&
 -              !bitmap_empty(
 -                      dev->dm.steering_sw_icm_alloc_blocks,
 -                      BIT(MLX5_CAP_DEV_MEM(mdev, log_steering_sw_icm_size) -
 -                          MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
 -
 -      kfree(dev->dm.steering_sw_icm_alloc_blocks);
 -
 -      WARN_ON(dev->dm.header_modify_sw_icm_alloc_blocks &&
 -              !bitmap_empty(dev->dm.header_modify_sw_icm_alloc_blocks,
 -                            BIT(MLX5_CAP_DEV_MEM(
 -                                        mdev, log_header_modify_sw_icm_size) -
 -                                MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
 -
 -      kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
  }
  
  static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
  {
        struct mlx5_core_dev *mdev = dev->mdev;
 -      u64 header_modify_icm_blocks = 0;
 -      u64 steering_icm_blocks = 0;
        int err;
        int i;
  
                dev->port[i].roce.last_port_state = IB_PORT_DOWN;
        }
  
 +      mlx5_ib_internal_fill_odp_caps(dev);
 +
        err = mlx5_ib_init_multiport_master(dev);
        if (err)
                return err;
        INIT_LIST_HEAD(&dev->qp_list);
        spin_lock_init(&dev->reset_flow_resource_lock);
  
 -      if (MLX5_CAP_GEN_64(mdev, general_obj_types) &
 -          MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) {
 -              if (MLX5_CAP64_DEV_MEM(mdev, steering_sw_icm_start_address)) {
 -                      steering_icm_blocks =
 -                              BIT(MLX5_CAP_DEV_MEM(mdev,
 -                                                   log_steering_sw_icm_size) -
 -                                  MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
 -
 -                      dev->dm.steering_sw_icm_alloc_blocks =
 -                              kcalloc(BITS_TO_LONGS(steering_icm_blocks),
 -                                      sizeof(unsigned long), GFP_KERNEL);
 -                      if (!dev->dm.steering_sw_icm_alloc_blocks)
 -                              goto err_mp;
 -              }
 -
 -              if (MLX5_CAP64_DEV_MEM(mdev,
 -                                     header_modify_sw_icm_start_address)) {
 -                      header_modify_icm_blocks = BIT(
 -                              MLX5_CAP_DEV_MEM(
 -                                      mdev, log_header_modify_sw_icm_size) -
 -                              MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
 -
 -                      dev->dm.header_modify_sw_icm_alloc_blocks =
 -                              kcalloc(BITS_TO_LONGS(header_modify_icm_blocks),
 -                                      sizeof(unsigned long), GFP_KERNEL);
 -                      if (!dev->dm.header_modify_sw_icm_alloc_blocks)
 -                              goto err_dm;
 -              }
 -      }
 -
        spin_lock_init(&dev->dm.lock);
        dev->dm.dev = mdev;
  
        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                err = init_srcu_struct(&dev->mr_srcu);
                if (err)
 -                      goto err_dm;
 +                      goto err_mp;
        }
  
        return 0;
  
 -err_dm:
 -      kfree(dev->dm.steering_sw_icm_alloc_blocks);
 -      kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
 -
  err_mp:
        mlx5_ib_cleanup_multiport_master(dev);
  
@@@ -6524,6 -6554,8 +6515,6 @@@ static void mlx5_ib_stage_dev_res_clean
  
  static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
  {
 -      mlx5_ib_internal_fill_odp_caps(dev);
 -
        return mlx5_ib_odp_init_one(dev);
  }
  
index 3401f5f6792e6bb79e472f6411eade5c2d1e2dab,b7da619614e4511f3d6ec43dd9213d95530178b3..1eff031ef04842f06ab4d088b04a610b9388aa1b
@@@ -784,19 -784,37 +784,37 @@@ static int mr_umem_get(struct mlx5_ib_d
                       int *ncont, int *order)
  {
        struct ib_umem *u;
-       int err;
  
        *umem = NULL;
  
-       u = ib_umem_get(udata, start, length, access_flags, 0);
-       err = PTR_ERR_OR_ZERO(u);
-       if (err) {
-               mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
-               return err;
+       if (access_flags & IB_ACCESS_ON_DEMAND) {
+               struct ib_umem_odp *odp;
+               odp = ib_umem_odp_get(udata, start, length, access_flags);
+               if (IS_ERR(odp)) {
+                       mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
+                                   PTR_ERR(odp));
+                       return PTR_ERR(odp);
+               }
+               u = &odp->umem;
+               *page_shift = odp->page_shift;
+               *ncont = ib_umem_odp_num_pages(odp);
+               *npages = *ncont << (*page_shift - PAGE_SHIFT);
+               if (order)
+                       *order = ilog2(roundup_pow_of_two(*ncont));
+       } else {
+               u = ib_umem_get(udata, start, length, access_flags, 0);
+               if (IS_ERR(u)) {
+                       mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
+                       return PTR_ERR(u);
+               }
+               mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
+                                  page_shift, ncont, order);
        }
  
-       mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
-                          page_shift, ncont, order);
        if (!*npages) {
                mlx5_ib_warn(dev, "avoid zero region\n");
                ib_umem_release(u);
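
To make the ODP branch of mr_umem_get() above concrete with illustrative numbers (assuming 4 KiB system pages, i.e. PAGE_SHIFT = 12): if ib_umem_odp_get() returns an ODP umem with odp->page_shift = 21 (2 MiB pages) and ib_umem_odp_num_pages(odp) = 4, then *ncont = 4, *npages = 4 << (21 - 12) = 2048 system pages, and *order = ilog2(roundup_pow_of_two(4)) = 2.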
@@@ -1293,7 -1311,9 +1311,7 @@@ struct ib_mr *mlx5_ib_reg_user_mr(struc
        if (err < 0)
                return ERR_PTR(err);
  
 -      use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) &&
 -                (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) ||
 -                 !MLX5_CAP_GEN(dev->mdev, atomic));
 +      use_umr = mlx5_ib_can_use_umr(dev, true);
  
        if (order <= mr_cache_max_order(dev) && use_umr) {
                mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
@@@ -1446,8 -1466,7 +1464,8 @@@ int mlx5_ib_rereg_user_mr(struct ib_mr 
                        goto err;
        }
  
 -      if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
 +      if (!mlx5_ib_can_use_umr(dev, true) ||
 +          (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
                /*
                 * UMR can't be used - MKey needs to be replaced.
                 */
@@@ -1599,7 -1618,7 +1617,7 @@@ static void dereg_mr(struct mlx5_ib_de
                /* Wait for all running page-fault handlers to finish. */
                synchronize_srcu(&dev->mr_srcu);
                /* Destroy all page mappings */
-               if (umem_odp->page_list)
+               if (!umem_odp->is_implicit_odp)
                        mlx5_ib_invalidate_range(umem_odp,
                                                 ib_umem_start(umem_odp),
                                                 ib_umem_end(umem_odp));
                 * so that there will not be any invalidations in
                 * flight, looking at the *mr struct.
                 */
-               ib_umem_release(umem);
+               ib_umem_odp_release(umem_odp);
                atomic_sub(npages, &dev->mdev->priv.reg_pages);
  
                /* Avoid double-freeing the umem. */
index 0a59912a4cef640067472319da83d7123f6eb5d1,762038ab83e290b2860cb45b324bdf0f40e6e0b8..dd26e7acb37e4b331890d7ab929389b0d76e3fe7
@@@ -184,7 -184,7 +184,7 @@@ void mlx5_odp_populate_klm(struct mlx5_
        for (i = 0; i < nentries; i++, pklm++) {
                pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
                va = (offset + i) * MLX5_IMR_MTT_SIZE;
-               if (odp && odp->umem.address == va) {
+               if (odp && ib_umem_start(odp) == va) {
                        struct mlx5_ib_mr *mtt = odp->private;
  
                        pklm->key = cpu_to_be32(mtt->ibmr.lkey);
@@@ -206,7 -206,7 +206,7 @@@ static void mr_leaf_free_action(struct 
        mr->parent = NULL;
        synchronize_srcu(&mr->dev->mr_srcu);
  
-       ib_umem_release(&odp->umem);
+       ib_umem_odp_release(odp);
        if (imr->live)
                mlx5_ib_update_xlt(imr, idx, 1, 0,
                                   MLX5_IB_UPD_XLT_INDIRECT |
@@@ -301,8 -301,7 +301,8 @@@ void mlx5_ib_internal_fill_odp_caps(str
  
        memset(caps, 0, sizeof(*caps));
  
 -      if (!MLX5_CAP_GEN(dev->mdev, pg))
 +      if (!MLX5_CAP_GEN(dev->mdev, pg) ||
 +          !mlx5_ib_can_use_umr(dev, true))
                return;
  
        caps->general_caps = IB_ODP_SUPPORT;
  
        if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
            MLX5_CAP_GEN(dev->mdev, null_mkey) &&
 -          MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 +          MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
 +          !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
                caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
  
        return;
@@@ -386,7 -384,7 +386,7 @@@ static void mlx5_ib_page_fault_resume(s
  }
  
  static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
-                                           struct ib_umem *umem,
+                                           struct ib_umem_odp *umem_odp,
                                            bool ksm, int access_flags)
  {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        mr->dev = dev;
        mr->access_flags = access_flags;
        mr->mmkey.iova = 0;
-       mr->umem = umem;
+       mr->umem = &umem_odp->umem;
  
        if (ksm) {
                err = mlx5_ib_update_xlt(mr, 0,
@@@ -464,18 -462,17 +464,17 @@@ next_mr
                if (nentries)
                        nentries++;
        } else {
-               odp = ib_alloc_odp_umem(odp_mr, addr,
-                                       MLX5_IMR_MTT_SIZE);
+               odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
                if (IS_ERR(odp)) {
                        mutex_unlock(&odp_mr->umem_mutex);
                        return ERR_CAST(odp);
                }
  
-               mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
+               mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
                                        mr->access_flags);
                if (IS_ERR(mtt)) {
                        mutex_unlock(&odp_mr->umem_mutex);
-                       ib_umem_release(&odp->umem);
+                       ib_umem_odp_release(odp);
                        return ERR_CAST(mtt);
                }
  
        addr += MLX5_IMR_MTT_SIZE;
        if (unlikely(addr < io_virt + bcnt)) {
                odp = odp_next(odp);
-               if (odp && odp->umem.address != addr)
+               if (odp && ib_umem_start(odp) != addr)
                        odp = NULL;
                goto next_mr;
        }
@@@ -521,19 -518,19 +520,19 @@@ struct mlx5_ib_mr *mlx5_ib_alloc_implic
                                             int access_flags)
  {
        struct mlx5_ib_mr *imr;
-       struct ib_umem *umem;
+       struct ib_umem_odp *umem_odp;
  
-       umem = ib_umem_get(udata, 0, 0, access_flags, 0);
-       if (IS_ERR(umem))
-               return ERR_CAST(umem);
+       umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+       if (IS_ERR(umem_odp))
+               return ERR_CAST(umem_odp);
  
-       imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+       imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
        if (IS_ERR(imr)) {
-               ib_umem_release(umem);
+               ib_umem_odp_release(umem_odp);
                return ERR_CAST(imr);
        }
  
-       imr->umem = umem;
+       imr->umem = &umem_odp->umem;
        init_waitqueue_head(&imr->q_leaf_free);
        atomic_set(&imr->num_leaf_free, 0);
        atomic_set(&imr->num_pending_prefetch, 0);
        return imr;
  }
  
- static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
-                       void *cookie)
+ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
  {
-       struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
-       if (mr->parent != imr)
-               return 0;
-       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-                                   ib_umem_end(umem_odp));
+       struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+       struct rb_node *node;
  
-       if (umem_odp->dying)
-               return 0;
+       down_read(&per_mm->umem_rwsem);
+       for (node = rb_first_cached(&per_mm->umem_tree); node;
+            node = rb_next(node)) {
+               struct ib_umem_odp *umem_odp =
+                       rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+               struct mlx5_ib_mr *mr = umem_odp->private;
  
-       WRITE_ONCE(umem_odp->dying, 1);
-       atomic_inc(&imr->num_leaf_free);
-       schedule_work(&umem_odp->work);
+               if (mr->parent != imr)
+                       continue;
  
-       return 0;
- }
+               ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+                                           ib_umem_end(umem_odp));
  
- void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
- {
-       struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+               if (umem_odp->dying)
+                       continue;
  
-       down_read(&per_mm->umem_rwsem);
-       rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
-                                     mr_leaf_free, true, imr);
+               WRITE_ONCE(umem_odp->dying, 1);
+               atomic_inc(&imr->num_leaf_free);
+               schedule_work(&umem_odp->work);
+       }
        up_read(&per_mm->umem_rwsem);
  
        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
@@@ -589,7 -583,7 +585,7 @@@ static int pagefault_mr(struct mlx5_ib_
        struct ib_umem_odp *odp;
        size_t size;
  
-       if (!odp_mr->page_list) {
+       if (odp_mr->is_implicit_odp) {
                odp = implicit_mr_get_data(mr, io_virt, bcnt);
  
                if (IS_ERR(odp))
@@@ -607,7 -601,7 +603,7 @@@ next_mr
        start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
        access_mask = ODP_READ_ALLOWED_BIT;
  
-       if (prefetch && !downgrade && !mr->umem->writable) {
+       if (prefetch && !downgrade && !odp->umem.writable) {
                /* prefetch with write-access must
                 * be supported by the MR
                 */
                goto out;
        }
  
-       if (mr->umem->writable && !downgrade)
+       if (odp->umem.writable && !downgrade)
                access_mask |= ODP_WRITE_ALLOWED_BIT;
  
        current_seq = READ_ONCE(odp->notifiers_seq);
         */
        smp_rmb();
  
-       ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
-                                       access_mask, current_seq);
+       ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
+                                       current_seq);
  
        if (ret < 0)
                goto out;
        np = ret;
  
        mutex_lock(&odp->umem_mutex);
-       if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
-                                       current_seq)) {
+       if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
                /*
                 * No need to check whether the MTTs really belong to
                 * this MR, since ib_umem_odp_map_dma_pages already
  
                io_virt += size;
                next = odp_next(odp);
-               if (unlikely(!next || next->umem.address != io_virt)) {
+               if (unlikely(!next || ib_umem_start(next) != io_virt)) {
                        mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
                                    io_virt, next);
                        return -EAGAIN;
@@@ -1618,16 -1611,15 +1613,17 @@@ void mlx5_odp_init_mr_cache_entry(struc
  
  static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
        .advise_mr = mlx5_ib_advise_mr,
+       .invalidate_range = mlx5_ib_invalidate_range,
  };
  
  int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
  {
        int ret = 0;
  
 -      if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
 -              ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
 +      if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
 +              return ret;
 +
 +      ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
  
        if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
                ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
                }
        }
  
 -      if (!MLX5_CAP_GEN(dev->mdev, pg))
 -              return ret;
 -
        ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
  
        return ret;
  
  void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
  {
 -      if (!MLX5_CAP_GEN(dev->mdev, pg))
 +      if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
                return;
  
        mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
diff --combined include/linux/sched.h
index b75b282870053e083a1f79ae9b55cba6a95caa74,c5630f3dca1fafac7cfde1ed7051f0b61314ff7e..70db597d6fd4f2ceb53b1a71c0db243e681d7070
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
 +#include <linux/sched/types.h>
  #include <linux/signal_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
 +#include <linux/posix-timers.h>
  #include <linux/rseq.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -246,6 -244,27 +246,6 @@@ struct prev_cputime 
  #endif
  };
  
 -/**
 - * struct task_cputime - collected CPU time counts
 - * @utime:            time spent in user mode, in nanoseconds
 - * @stime:            time spent in kernel mode, in nanoseconds
 - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 - *
 - * This structure groups together three kinds of CPU time that are tracked for
 - * threads and thread groups.  Most things considering CPU time want to group
 - * these counts together and treat all three of them in parallel.
 - */
 -struct task_cputime {
 -      u64                             utime;
 -      u64                             stime;
 -      unsigned long long              sum_exec_runtime;
 -};
 -
 -/* Alternate field names when used on cache expirations: */
 -#define virt_exp                      utime
 -#define prof_exp                      stime
 -#define sched_exp                     sum_exec_runtime
 -
  enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
@@@ -276,11 -295,6 +276,11 @@@ enum uclamp_id 
        UCLAMP_CNT
  };
  
 +#ifdef CONFIG_SMP
 +extern struct root_domain def_root_domain;
 +extern struct mutex sched_domains_mutex;
 +#endif
 +
  struct sched_info {
  #ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */
@@@ -862,8 -876,10 +862,8 @@@ struct task_struct 
        unsigned long                   min_flt;
        unsigned long                   maj_flt;
  
 -#ifdef CONFIG_POSIX_TIMERS
 -      struct task_cputime             cputime_expires;
 -      struct list_head                cpu_timers[3];
 -#endif
 +      /* Empty if CONFIG_POSIX_CPUTIMERS=n */
 +      struct posix_cputimers          posix_cputimers;
  
        /* Process credentials: */
  
        struct mutex_waiter             *blocked_on;
  #endif
  
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+       int                             non_block_count;
+ #endif
  #ifdef CONFIG_TRACE_IRQFLAGS
        unsigned int                    irq_events;
        unsigned long                   hardirq_enable_ip;
@@@ -1751,7 -1771,7 +1755,7 @@@ static inline int test_tsk_need_resched
   * value indicates whether a reschedule was done in fact.
   * cond_resched_lock() will drop the spinlock before scheduling,
   */
 -#ifndef CONFIG_PREEMPT
 +#ifndef CONFIG_PREEMPTION
  extern int _cond_resched(void);
  #else
  static inline int _cond_resched(void) { return 0; }
@@@ -1780,12 -1800,12 +1784,12 @@@ static inline void cond_resched_rcu(voi
  
  /*
   * Does a critical section need to be broken due to another
 - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
 + * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
   * but a general need for low latency)
   */
  static inline int spin_needbreak(spinlock_t *lock)
  {
 -#ifdef CONFIG_PREEMPT
 +#ifdef CONFIG_PREEMPTION
        return spin_is_contended(lock);
  #else
        return 0;
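
The spin_needbreak() comment above describes breaking a long critical section when another task is waiting for the lock. A typical, purely illustrative lock-break loop pairs it with cond_resched_lock(); the demo_* names are made up for the sketch:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct demo_entry {			/* hypothetical payload */
	struct list_head node;
	int value;
};

static DEFINE_SPINLOCK(demo_lock);
static LIST_HEAD(demo_list);

static void demo_scan(void)
{
	struct demo_entry *entry;

	spin_lock(&demo_lock);
	list_for_each_entry(entry, &demo_list, node) {
		entry->value++;		/* non-sleeping work */

		if (need_resched() || spin_needbreak(&demo_lock)) {
			/* Drops demo_lock, may schedule, then re-acquires it.
			 * Real code must revalidate its cursor afterwards,
			 * since the list may change while the lock is dropped.
			 */
			cond_resched_lock(&demo_lock);
		}
	}
	spin_unlock(&demo_lock);
}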
diff --combined kernel/fork.c
index 53e780748fe3367973182edd594c3e27fc9108fd,92c8559d9745b10d00a790125ca3144198281650..5a0fd518e04e44dd6ec3080f6134fdecc952d653
@@@ -768,7 -768,6 +768,7 @@@ static void set_max_threads(unsigned in
  int arch_task_struct_size __read_mostly;
  #endif
  
 +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
  {
        /* Fetch thread_struct whitelist for the architecture. */
        else
                *offset += offsetof(struct task_struct, thread);
  }
 +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
  
  void __init fork_init(void)
  {
@@@ -1009,7 -1007,6 +1009,6 @@@ static struct mm_struct *mm_init(struc
        mm_init_owner(mm, p);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_mm_init(mm);
-       hmm_mm_init(mm);
        init_tlb_flush_pending(mm);
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
@@@ -1519,17 -1516,28 +1518,17 @@@ void __cleanup_sighand(struct sighand_s
        }
  }
  
 -#ifdef CONFIG_POSIX_TIMERS
  /*
   * Initialize POSIX timer handling for a thread group.
   */
  static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
 +      struct posix_cputimers *pct = &sig->posix_cputimers;
        unsigned long cpu_limit;
  
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 -      if (cpu_limit != RLIM_INFINITY) {
 -              sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
 -              sig->cputimer.running = true;
 -      }
 -
 -      /* The timer lists. */
 -      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[2]);
 +      posix_cputimers_group_init(pct, cpu_limit);
  }
 -#else
 -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
 -#endif
  
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
@@@ -1631,6 -1639,23 +1630,6 @@@ static void rt_mutex_init_task(struct t
  #endif
  }
  
 -#ifdef CONFIG_POSIX_TIMERS
 -/*
 - * Initialize POSIX timer handling for a single task.
 - */
 -static void posix_cpu_timers_init(struct task_struct *tsk)
 -{
 -      tsk->cputime_expires.prof_exp = 0;
 -      tsk->cputime_expires.virt_exp = 0;
 -      tsk->cputime_expires.sched_exp = 0;
 -      INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 -      INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 -      INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 -}
 -#else
 -static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
 -#endif
 -
  static inline void init_task_pid_links(struct task_struct *task)
  {
        enum pid_type type;
@@@ -1664,14 -1689,6 +1663,14 @@@ static inline void rcu_copy_process(str
  #endif /* #ifdef CONFIG_TASKS_RCU */
  }
  
 +struct pid *pidfd_pid(const struct file *file)
 +{
 +      if (file->f_op == &pidfd_fops)
 +              return file->private_data;
 +
 +      return ERR_PTR(-EBADF);
 +}
 +
  static int pidfd_release(struct inode *inode, struct file *file)
  {
        struct pid *pid = file->private_data;
@@@ -1917,7 -1934,7 +1916,7 @@@ static __latent_entropy struct task_str
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
 -      posix_cpu_timers_init(p);
 +      posix_cputimers_init(&p->posix_cputimers);
  
        p->io_context = NULL;
        audit_set_context(p, NULL);
@@@ -2320,8 -2337,6 +2319,8 @@@ struct mm_struct *copy_init_mm(void
   *
   * It copies the process, and if successful kick-starts
   * it and waits for it to finish using the VM if required.
 + *
 + * args->exit_signal is expected to be checked for sanity by the caller.
   */
  long _do_fork(struct kernel_clone_args *args)
  {
@@@ -2546,14 -2561,6 +2545,14 @@@ noinline static int copy_clone_args_fro
        if (copy_from_user(&args, uargs, size))
                return -EFAULT;
  
 +      /*
 +       * Verify that higher 32bits of exit_signal are unset and that
 +       * it is a valid signal
 +       */
 +      if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
 +                   !valid_signal(args.exit_signal)))
 +              return -EINVAL;
 +
        *kargs = (struct kernel_clone_args){
                .flags          = args.flags,
                .pidfd          = u64_to_user_ptr(args.pidfd),
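
As a concrete illustration of the exit_signal validation added above (CSIGNAL is 0x000000ff and valid_signal() accepts values up to _NSIG, typically 64): clone3() with args.exit_signal = SIGCHLD (17) passes, a value such as (1ULL << 32) | SIGCHLD fails the high-bits test, and a value like 200 clears the mask check but fails valid_signal(); either failing case returns -EINVAL before any clone work is done.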
diff --combined kernel/sched/core.c
index 5e8387bdd09c65c9b804534afba93a654d39d8a3,57245770d6cc23ef2ab76b86e743a3d6dafe4746..f9a1346a5fa9502be6ca45ecb1bd822bf706725e
@@@ -255,7 -255,7 +255,7 @@@ static void __hrtick_restart(struct rq 
  {
        struct hrtimer *timer = &rq->hrtick_timer;
  
 -      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 +      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -314,7 -314,7 +314,7 @@@ void hrtick_start(struct rq *rq, u64 de
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 -                    HRTIMER_MODE_REL_PINNED);
 +                    HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
@@@ -328,7 -328,7 +328,7 @@@ static void hrtick_rq_init(struct rq *r
        rq->hrtick_csd.info = rq;
  #endif
  
 -      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 +      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
  #else /* CONFIG_SCHED_HRTICK */
@@@ -773,18 -773,6 +773,18 @@@ static void set_load_weight(struct task
  }
  
  #ifdef CONFIG_UCLAMP_TASK
 +/*
 + * Serializes updates of utilization clamp values
 + *
 + * The (slow-path) user-space triggers utilization clamp value updates which
 + * can require updates on (fast-path) scheduler's data structures used to
 + * support enqueue/dequeue operations.
 + * While the per-CPU rq lock protects fast-path update operations, user-space
 + * requests are serialized using a mutex to reduce the risk of conflicting
 + * updates or API abuses.
 + */
 +static DEFINE_MUTEX(uclamp_mutex);
 +
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@@ -810,7 -798,7 +810,7 @@@ static inline unsigned int uclamp_bucke
        return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
 -static inline unsigned int uclamp_none(int clamp_id)
 +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
        if (clamp_id == UCLAMP_MIN)
                return 0;
@@@ -826,7 -814,7 +826,7 @@@ static inline void uclamp_se_set(struc
  }
  
  static inline unsigned int
 -uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
 +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                  unsigned int clamp_value)
  {
        /*
        return uclamp_none(UCLAMP_MIN);
  }
  
 -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
 +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                     unsigned int clamp_value)
  {
        /* Reset max-clamp retention only on idle exit */
  }
  
  static inline
 -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 -                               unsigned int clamp_value)
 +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
 +                                 unsigned int clamp_value)
  {
        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
        int bucket_id = UCLAMP_BUCKETS - 1;
        return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
 +static inline struct uclamp_se
 +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 +{
 +      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      struct uclamp_se uc_max;
 +
 +      /*
 +       * Tasks in autogroups or root task group will be
 +       * restricted by system defaults.
 +       */
 +      if (task_group_is_autogroup(task_group(p)))
 +              return uc_req;
 +      if (task_group(p) == &root_task_group)
 +              return uc_req;
 +
 +      uc_max = task_group(p)->uclamp[clamp_id];
 +      if (uc_req.value > uc_max.value || !uc_req.user_defined)
 +              return uc_max;
 +#endif
 +
 +      return uc_req;
 +}
 +
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
 + * - the task group effective clamp value, for tasks not either in the root
 + *   group or in an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
 -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
 +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
 -      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 +      struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
        struct uclamp_se uc_max = uclamp_default[clamp_id];
  
        /* System default restrictions always apply */
        return uc_req;
  }
  
 -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
 +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
        struct uclamp_se uc_eff;
  
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
 -                                  unsigned int clamp_id)
 +                                  enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 -                                  unsigned int clamp_id)
 +                                  enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
 +static inline void
 +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 +{
 +      struct rq_flags rf;
 +      struct rq *rq;
 +
 +      /*
 +       * Lock the task and the rq where the task is (or was) queued.
 +       *
 +       * We might lock the (previous) rq of a !RUNNABLE task, but that's the
 +       * price to pay to safely serialize util_{min,max} updates with
 +       * enqueues, dequeues and migration operations.
 +       * This is the same locking schema used by __set_cpus_allowed_ptr().
 +       */
 +      rq = task_rq_lock(p, &rf);
 +
 +      /*
 +       * Setting the clamp bucket is serialized by task_rq_lock().
 +       * If the task is not yet RUNNABLE and its task_struct is not
 +       * affecting a valid clamp bucket, the next time it's enqueued,
 +       * it will already see the updated clamp bucket value.
 +       */
 +      if (p->uclamp[clamp_id].active) {
 +              uclamp_rq_dec_id(rq, p, clamp_id);
 +              uclamp_rq_inc_id(rq, p, clamp_id);
 +      }
 +
 +      task_rq_unlock(rq, p, &rf);
 +}
 +
 +static inline void
 +uclamp_update_active_tasks(struct cgroup_subsys_state *css,
 +                         unsigned int clamps)
 +{
 +      enum uclamp_id clamp_id;
 +      struct css_task_iter it;
 +      struct task_struct *p;
 +
 +      css_task_iter_start(css, 0, &it);
 +      while ((p = css_task_iter_next(&it))) {
 +              for_each_clamp_id(clamp_id) {
 +                      if ((0x1 << clamp_id) & clamps)
 +                              uclamp_update_active(p, clamp_id);
 +              }
 +      }
 +      css_task_iter_end(&it);
 +}
 +
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 +static void uclamp_update_root_tg(void)
 +{
 +      struct task_group *tg = &root_task_group;
 +
 +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
 +                    sysctl_sched_uclamp_util_min, false);
 +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
 +                    sysctl_sched_uclamp_util_max, false);
 +
 +      rcu_read_lock();
 +      cpu_util_update_eff(&root_task_group.css);
 +      rcu_read_unlock();
 +}
 +#else
 +static void uclamp_update_root_tg(void) { }
 +#endif
 +
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
  {
 +      bool update_root_tg = false;
        int old_min, old_max;
 -      static DEFINE_MUTEX(mutex);
        int result;
  
 -      mutex_lock(&mutex);
 +      mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;
  
        if (old_min != sysctl_sched_uclamp_util_min) {
                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                              sysctl_sched_uclamp_util_min, false);
 +              update_root_tg = true;
        }
        if (old_max != sysctl_sched_uclamp_util_max) {
                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                              sysctl_sched_uclamp_util_max, false);
 +              update_root_tg = true;
        }
  
 +      if (update_root_tg)
 +              uclamp_update_root_tg();
 +
        /*
 -       * Updating all the RUNNABLE task is expensive, keep it simple and do
 -       * just a lazy update at each next enqueue time.
 +       * We update all RUNNABLE tasks only when task groups are in use.
 +       * Otherwise, keep it simple and do just a lazy update at each next
 +       * task enqueue time.
         */
 +
        goto done;
  
  undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
  done:
 -      mutex_unlock(&mutex);
 +      mutex_unlock(&uclamp_mutex);
  
        return result;
  }
@@@ -1187,7 -1075,7 +1187,7 @@@ static int uclamp_validate(struct task_
  static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        /*
         * On scheduling class change, reset to default clamps for tasks
  
  static void uclamp_fork(struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;
  static void __init init_uclamp(void)
  {
        struct uclamp_se uc_max = {};
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
        int cpu;
  
 +      mutex_init(&uclamp_mutex);
 +
        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                cpu_rq(cpu)->uclamp_flags = 0;
  
        /* System defaults allow max clamp values for both indexes */
        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
 -      for_each_clamp_id(clamp_id)
 +      for_each_clamp_id(clamp_id) {
                uclamp_default[clamp_id] = uc_max;
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +              root_task_group.uclamp_req[clamp_id] = uc_max;
 +              root_task_group.uclamp[clamp_id] = uc_max;
 +#endif
 +      }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@@ -1613,7 -1494,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  }
  
  /*
@@@ -3333,8 -3214,12 +3333,8 @@@ static __always_inline struct rq 
  context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
  {
 -      struct mm_struct *mm, *oldmm;
 -
        prepare_task_switch(rq, prev, next);
  
 -      mm = next->mm;
 -      oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
        arch_start_context_switch(prev);
  
        /*
 -       * If mm is non-NULL, we pass through switch_mm(). If mm is
 -       * NULL, we will pass through mmdrop() in finish_task_switch().
 -       * Both of these contain the full memory barrier required by
 -       * membarrier after storing to rq->curr, before returning to
 -       * user-space.
 +       * kernel -> kernel   lazy + transfer active
 +       *   user -> kernel   lazy + mmgrab() active
 +       *
 +       * kernel ->   user   switch + mmdrop() active
 +       *   user ->   user   switch
         */
 -      if (!mm) {
 -              next->active_mm = oldmm;
 -              mmgrab(oldmm);
 -              enter_lazy_tlb(oldmm, next);
 -      } else
 -              switch_mm_irqs_off(oldmm, mm, next);
 +      if (!next->mm) {                                // to kernel
 +              enter_lazy_tlb(prev->active_mm, next);
 +
 +              next->active_mm = prev->active_mm;
 +              if (prev->mm)                           // from user
 +                      mmgrab(prev->active_mm);
 +              else
 +                      prev->active_mm = NULL;
 +      } else {                                        // to user
 +              /*
 +               * sys_membarrier() requires an smp_mb() between setting
 +               * rq->curr and returning to userspace.
 +               *
 +               * The below provides this either through switch_mm(), or in
 +               * case 'prev->active_mm == next->mm' through
 +               * finish_task_switch()'s mmdrop().
 +               */
  
 -      if (!prev->mm) {
 -              prev->active_mm = NULL;
 -              rq->prev_mm = oldmm;
 +              switch_mm_irqs_off(prev->active_mm, next->mm, next);
 +
 +              if (!prev->mm) {                        // from kernel
 +                      /* will mmdrop() in finish_task_switch(). */
 +                      rq->prev_mm = prev->active_mm;
 +                      prev->active_mm = NULL;
 +              }
        }
  
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@@ -3616,36 -3486,8 +3616,36 @@@ void scheduler_tick(void
  
  struct tick_work {
        int                     cpu;
 +      atomic_t                state;
        struct delayed_work     work;
  };
 +/* Values for ->state, see diagram below. */
 +#define TICK_SCHED_REMOTE_OFFLINE     0
 +#define TICK_SCHED_REMOTE_OFFLINING   1
 +#define TICK_SCHED_REMOTE_RUNNING     2
 +
 +/*
 + * State diagram for ->state:
 + *
 + *
 + *          TICK_SCHED_REMOTE_OFFLINE
 + *                    |   ^
 + *                    |   |
 + *                    |   | sched_tick_remote()
 + *                    |   |
 + *                    |   |
 + *                    +--TICK_SCHED_REMOTE_OFFLINING
 + *                    |   ^
 + *                    |   |
 + * sched_tick_start() |   | sched_tick_stop()
 + *                    |   |
 + *                    V   |
 + *          TICK_SCHED_REMOTE_RUNNING
 + *
 + *
 + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
 + * and sched_tick_start() are happy to leave the state in RUNNING.
 + */
  
  static struct tick_work __percpu *tick_work_cpu;
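
To spell out one arc of the state diagram above: after sched_tick_stop() moves the state from RUNNING to OFFLINING, the next sched_tick_remote() invocation (shown further below) calls atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING). Because the state is not RUNNING, the -1 is applied and OFFLINING (1) decays to OFFLINE (0); the returned old value is OFFLINING rather than RUNNING, so the delayed work is not re-queued. That is precisely the OFFLINING to OFFLINE transition labelled sched_tick_remote() in the diagram.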
  
@@@ -3658,7 -3500,6 +3658,7 @@@ static void sched_tick_remote(struct wo
        struct task_struct *curr;
        struct rq_flags rf;
        u64 delta;
 +      int os;
  
        /*
         * Handle the tick only if it appears the remote CPU is running in full
  
        rq_lock_irq(rq, &rf);
        curr = rq->curr;
 -      if (is_idle_task(curr))
 +      if (is_idle_task(curr) || cpu_is_offline(cpu))
                goto out_unlock;
  
        update_rq_clock(rq);
@@@ -3692,18 -3533,13 +3692,18 @@@ out_requeue
        /*
         * Run the remote tick once per second (1Hz). This arbitrary
         * frequency is large enough to avoid overload but short enough
 -       * to keep scheduler internal stats reasonably up to date.
 +       * to keep scheduler internal stats reasonably up to date.  But
 +       * first update state to reflect hotplug activity if required.
         */
 -      queue_delayed_work(system_unbound_wq, dwork, HZ);
 +      os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
 +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
 +      if (os == TICK_SCHED_REMOTE_RUNNING)
 +              queue_delayed_work(system_unbound_wq, dwork, HZ);
  }
  
  static void sched_tick_start(int cpu)
  {
 +      int os;
        struct tick_work *twork;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
        WARN_ON_ONCE(!tick_work_cpu);
  
        twork = per_cpu_ptr(tick_work_cpu, cpu);
 -      twork->cpu = cpu;
 -      INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
 -      queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
 +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
 +      if (os == TICK_SCHED_REMOTE_OFFLINE) {
 +              twork->cpu = cpu;
 +              INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
 +              queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 +      }
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
  static void sched_tick_stop(int cpu)
  {
        struct tick_work *twork;
 +      int os;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                return;
        WARN_ON_ONCE(!tick_work_cpu);
  
        twork = per_cpu_ptr(tick_work_cpu, cpu);
 -      cancel_delayed_work_sync(&twork->work);
 +      /* There cannot be competing actions, but don't rely on stop-machine. */
 +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
 +      WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
 +      /* Don't cancel, as this would mess up the state machine. */
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@@ -3744,6 -3572,7 +3744,6 @@@ int __init sched_tick_offload_init(void
  {
        tick_work_cpu = alloc_percpu(struct tick_work);
        BUG_ON(!tick_work_cpu);
 -
        return 0;
  }
  
@@@ -3752,7 -3581,7 +3752,7 @@@ static inline void sched_tick_start(in
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
 -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@@ -3871,13 -3700,22 +3871,22 @@@ static noinline void __schedule_bug(str
  /*
   * Various schedule()-time debugging checks and statistics:
   */
- static inline void schedule_debug(struct task_struct *prev)
+ static inline void schedule_debug(struct task_struct *prev, bool preempt)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
        if (task_stack_end_corrupted(prev))
                panic("corrupted stack end detected inside scheduler\n");
  #endif
  
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+       if (!preempt && prev->state && prev->non_block_count) {
+               printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+                       prev->comm, prev->pid, prev->non_block_count);
+               dump_stack();
+               add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+       }
+ #endif
        if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
                preempt_count_set(PREEMPT_DISABLED);
@@@ -3910,7 -3748,7 +3919,7 @@@ pick_next_task(struct rq *rq, struct ta
  
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
 -                      goto again;
 +                      goto restart;
  
                /* Assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))
                return p;
        }
  
 -again:
 +restart:
 +      /*
 +       * Ensure that we put DL/RT tasks before the pick loop, such that they
 +       * can PULL higher prio tasks when we lower the RQ 'priority'.
 +       */
 +      prev->sched_class->put_prev_task(rq, prev, rf);
 +      if (!rq->nr_running)
 +              newidle_balance(rq, rf);
 +
        for_each_class(class) {
 -              p = class->pick_next_task(rq, prev, rf);
 -              if (p) {
 -                      if (unlikely(p == RETRY_TASK))
 -                              goto again;
 +              p = class->pick_next_task(rq, NULL, NULL);
 +              if (p)
                        return p;
 -              }
        }
  
        /* The idle class should always have a runnable task: */
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
 - *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 + *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
 - *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 + *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@@ -3989,7 -3822,7 +3998,7 @@@ static void __sched notrace __schedule(
        rq = cpu_rq(cpu);
        prev = rq->curr;
  
-       schedule_debug(prev);
+       schedule_debug(prev, preempt);
  
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
@@@ -4080,7 -3913,7 +4089,7 @@@ void __noreturn do_task_dead(void
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
 -      if (!tsk->state || tsk_is_pi_blocked(tsk))
 +      if (!tsk->state)
                return;
  
        /*
                preempt_enable_no_resched();
        }
  
 +      if (tsk_is_pi_blocked(tsk))
 +              return;
 +
        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
@@@ -4212,7 -4042,7 +4221,7 @@@ static void __sched notrace preempt_sch
        } while (need_resched());
  }
  
 -#ifdef CONFIG_PREEMPT
 +#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4284,7 -4114,7 +4293,7 @@@ asmlinkage __visible void __sched notra
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
 -#endif /* CONFIG_PREEMPT */
 +#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@@ -4452,7 -4282,7 +4461,7 @@@ void rt_mutex_setprio(struct task_struc
        if (queued)
                enqueue_task(rq, p, queue_flag);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@@ -4519,7 -4349,7 +4528,7 @@@ void set_user_nice(struct task_struct *
                        resched_curr(rq);
        }
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  out_unlock:
        task_rq_unlock(rq, p, &rf);
  }
@@@ -4836,9 -4666,6 +4845,9 @@@ recheck
                        return retval;
        }
  
 +      if (pi)
 +              cpuset_read_lock();
 +
        /*
         * Make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
         * Changing the policy of the stop threads is a very bad idea:
         */
        if (p == rq->stop) {
 -              task_rq_unlock(rq, p, &rf);
 -              return -EINVAL;
 +              retval = -EINVAL;
 +              goto unlock;
        }
  
        /*
                        goto change;
  
                p->sched_reset_on_fork = reset_on_fork;
 -              task_rq_unlock(rq, p, &rf);
 -              return 0;
 +              retval = 0;
 +              goto unlock;
        }
  change:
  
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
 -                      task_rq_unlock(rq, p, &rf);
 -                      return -EPERM;
 +                      retval = -EPERM;
 +                      goto unlock;
                }
  #endif
  #ifdef CONFIG_SMP
                         */
                        if (!cpumask_subset(span, p->cpus_ptr) ||
                            rq->rd->dl_bw.bw == 0) {
 -                              task_rq_unlock(rq, p, &rf);
 -                              return -EPERM;
 +                              retval = -EPERM;
 +                              goto unlock;
                        }
                }
  #endif
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                task_rq_unlock(rq, p, &rf);
 +              if (pi)
 +                      cpuset_read_unlock();
                goto recheck;
        }
  
         * is available.
         */
        if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
 -              task_rq_unlock(rq, p, &rf);
 -              return -EBUSY;
 +              retval = -EBUSY;
 +              goto unlock;
        }
  
        p->sched_reset_on_fork = reset_on_fork;
                enqueue_task(rq, p, queue_flags);
        }
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  
        preempt_disable();
        task_rq_unlock(rq, p, &rf);
  
 -      if (pi)
 +      if (pi) {
 +              cpuset_read_unlock();
                rt_mutex_adjust_pi(p);
 +      }
  
        /* Run balance callbacks after we've adjusted the PI chain: */
        balance_callback(rq);
        preempt_enable();
  
        return 0;
 +
 +unlock:
 +      task_rq_unlock(rq, p, &rf);
 +      if (pi)
 +              cpuset_read_unlock();
 +      return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -5074,15 -4891,10 +5083,15 @@@ do_sched_setscheduler(pid_t pid, int po
        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
 -      if (p != NULL)
 -              retval = sched_setscheduler(p, policy, &lparam);
 +      if (likely(p))
 +              get_task_struct(p);
        rcu_read_unlock();
  
 +      if (likely(p)) {
 +              retval = sched_setscheduler(p, policy, &lparam);
 +              put_task_struct(p);
 +      }
 +
        return retval;
  }
  
@@@ -5299,40 -5111,37 +5308,40 @@@ out_unlock
        return retval;
  }
  
 -static int sched_read_attr(struct sched_attr __user *uattr,
 -                         struct sched_attr *attr,
 -                         unsigned int usize)
 +/*
 + * Copy the kernel size attribute structure (which might be larger
 + * than what user-space knows about) to user-space.
 + *
 + * Note that all cases are valid: user-space buffer can be larger or
 + * smaller than the kernel-space buffer. The usual case is that both
 + * have the same size.
 + */
 +static int
 +sched_attr_copy_to_user(struct sched_attr __user *uattr,
 +                      struct sched_attr *kattr,
 +                      unsigned int usize)
  {
 -      int ret;
 +      unsigned int ksize = sizeof(*kattr);
  
        if (!access_ok(uattr, usize))
                return -EFAULT;
  
        /*
 -       * If we're handed a smaller struct than we know of,
 -       * ensure all the unknown bits are 0 - i.e. old
 -       * user-space does not get uncomplete information.
 +       * sched_getattr() ABI forwards and backwards compatibility:
 +       *
 +       * If usize == ksize then we just copy everything to user-space and all is good.
 +       *
 +       * If usize < ksize then we only copy as much as user-space has space for,
 +       * this keeps ABI compatibility as well. We skip the rest.
 +       *
 +       * If usize > ksize then user-space is using a newer version of the ABI,
  +       * with parts the kernel doesn't know about. Just ignore it - tooling can
 +       * detect the kernel's knowledge of attributes from the attr->size value
 +       * which is set to ksize in this case.
         */
 -      if (usize < sizeof(*attr)) {
 -              unsigned char *addr;
 -              unsigned char *end;
 -
 -              addr = (void *)attr + usize;
 -              end  = (void *)attr + sizeof(*attr);
 -
 -              for (; addr < end; addr++) {
 -                      if (*addr)
 -                              return -EFBIG;
 -              }
 -
 -              attr->size = usize;
 -      }
 +      kattr->size = min(usize, ksize);
  
 -      ret = copy_to_user(uattr, attr, attr->size);
 -      if (ret)
 +      if (copy_to_user(uattr, kattr, kattr->size))
                return -EFAULT;
  
        return 0;
   * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   * @pid: the pid in question.
   * @uattr: structure containing the extended parameters.
 - * @size: sizeof(attr) for fwd/bwd comp.
 + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
   * @flags: for future extension.
   */
  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 -              unsigned int, size, unsigned int, flags)
 +              unsigned int, usize, unsigned int, flags)
  {
 -      struct sched_attr attr = {
 -              .size = sizeof(struct sched_attr),
 -      };
 +      struct sched_attr kattr = { };
        struct task_struct *p;
        int retval;
  
 -      if (!uattr || pid < 0 || size > PAGE_SIZE ||
 -          size < SCHED_ATTR_SIZE_VER0 || flags)
 +      if (!uattr || pid < 0 || usize > PAGE_SIZE ||
 +          usize < SCHED_ATTR_SIZE_VER0 || flags)
                return -EINVAL;
  
        rcu_read_lock();
        if (retval)
                goto out_unlock;
  
 -      attr.sched_policy = p->policy;
 +      kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
 -              attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 +              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
        if (task_has_dl_policy(p))
 -              __getparam_dl(p, &attr);
 +              __getparam_dl(p, &kattr);
        else if (task_has_rt_policy(p))
 -              attr.sched_priority = p->rt_priority;
 +              kattr.sched_priority = p->rt_priority;
        else
 -              attr.sched_nice = task_nice(p);
 +              kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
 -      attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 -      attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 +      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 +      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
  
        rcu_read_unlock();
  
 -      retval = sched_read_attr(uattr, &attr, size);
 -      return retval;
 +      return sched_attr_copy_to_user(uattr, &kattr, usize);
  
  out_unlock:
        rcu_read_unlock();
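
With sched_attr_copy_to_user() above, sched_getattr() accepts any buffer of at least SCHED_ATTR_SIZE_VER0 bytes: the kernel copies min(usize, ksize) bytes and records the copied size in attr.size, instead of failing with -EFBIG whenever it holds non-zero state beyond what an older, smaller structure can carry. A hedged user-space check of that behaviour; the VER0 layout is spelled out locally and the syscall is invoked directly since glibc provides no wrapper:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* SCHED_ATTR_SIZE_VER0 layout (48 bytes); newer kernels append fields. */
struct sched_attr_v0 {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr_v0 attr;

        memset(&attr, 0, sizeof(attr));
        /* Pass only the 48-byte VER0 view: the kernel copies at most
         * sizeof(attr) bytes and sets attr.size to what it copied. */
        if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0)) {
                perror("sched_getattr");
                return 1;
        }
        printf("policy=%u nice=%d reported size=%u\n",
               attr.sched_policy, attr.sched_nice, attr.size);
        return 0;
}
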
@@@ -5613,7 -5425,7 +5622,7 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
 -#ifndef CONFIG_PREEMPT
 +#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
        if (should_resched(0)) {
@@@ -5630,7 -5442,7 +5639,7 @@@ EXPORT_SYMBOL(_cond_resched)
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
 - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@@ -6169,7 -5981,7 +6178,7 @@@ void sched_setnuma(struct task_struct *
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
        task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -6209,22 -6021,21 +6218,22 @@@ static void calc_load_migrate(struct r
                atomic_long_add(delta, &calc_load_tasks);
  }
  
 -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
 +static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
 -}
 +      const struct sched_class *class;
 +      struct task_struct *next;
  
 -static const struct sched_class fake_sched_class = {
 -      .put_prev_task = put_prev_task_fake,
 -};
 +      for_each_class(class) {
 +              next = class->pick_next_task(rq, NULL, NULL);
 +              if (next) {
 +                      next->sched_class->put_prev_task(rq, next, NULL);
 +                      return next;
 +              }
 +      }
  
 -static struct task_struct fake_task = {
 -      /*
 -       * Avoid pull_{rt,dl}_task()
 -       */
 -      .prio = MAX_PRIO + 1,
 -      .sched_class = &fake_sched_class,
 -};
 +      /* The idle class should always have a runnable task */
 +      BUG();
 +}
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6267,7 -6078,12 +6276,7 @@@ static void migrate_tasks(struct rq *de
                if (rq->nr_running == 1)
                        break;
  
 -              /*
 -               * pick_next_task() assumes pinned rq->lock:
 -               */
 -              next = pick_next_task(rq, &fake_task, rf);
 -              BUG_ON(!next);
 -              put_prev_task(rq, next);
 +              next = __pick_migrate_task(rq);
  
                /*
                 * Rules for changing task_struct::cpus_mask are holding
@@@ -6564,19 -6380,19 +6573,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
  
  void __init sched_init(void)
  {
 -      unsigned long alloc_size = 0, ptr;
 +      unsigned long ptr = 0;
        int i;
  
        wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 +      ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
 -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 +      ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
 -      if (alloc_size) {
 -              ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 +      if (ptr) {
 +              ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
@@@ -6763,7 -6579,7 +6772,7 @@@ void ___might_sleep(const char *file, i
        rcu_sleep_check();
  
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-            !is_idle_task(current)) ||
+            !is_idle_task(current) && !current->non_block_count) ||
            system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
            oops_in_progress)
                return;
                "BUG: sleeping function called from invalid context at %s:%d\n",
                        file, line);
        printk(KERN_ERR
-               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-                       in_atomic(), irqs_disabled(),
+               "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+                       in_atomic(), irqs_disabled(), current->non_block_count,
                        current->pid, current->comm);
  
        if (task_stack_end_corrupted(current))
@@@ -6895,7 -6711,7 +6904,7 @@@ struct task_struct *curr_task(int cpu
  
  #ifdef CONFIG_IA64
  /**
 - * set_curr_task - set the current task for a given CPU.
 + * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@@ -6920,20 -6736,6 +6929,20 @@@ void ia64_set_curr_task(int cpu, struc
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +static inline void alloc_uclamp_sched_group(struct task_group *tg,
 +                                          struct task_group *parent)
 +{
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      enum uclamp_id clamp_id;
 +
 +      for_each_clamp_id(clamp_id) {
 +              uclamp_se_set(&tg->uclamp_req[clamp_id],
 +                            uclamp_none(clamp_id), false);
 +              tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
 +      }
 +#endif
 +}
 +
  static void sched_free_group(struct task_group *tg)
  {
        free_fair_sched_group(tg);
@@@ -6957,8 -6759,6 +6966,8 @@@ struct task_group *sched_create_group(s
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
  
 +      alloc_uclamp_sched_group(tg, parent);
 +
        return tg;
  
  err:
@@@ -7062,7 -6862,7 +7071,7 @@@ void sched_move_task(struct task_struc
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
        if (running)
 -              set_curr_task(rq, tsk);
 +              set_next_task(rq, tsk);
  
        task_rq_unlock(rq, tsk, &rf);
  }
@@@ -7145,6 -6945,10 +7154,6 @@@ static int cpu_cgroup_can_attach(struc
  #ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
 -#else
 -              /* We don't support RT-tasks being in separate groups */
 -              if (task->sched_class != &fair_sched_class)
 -                      return -EINVAL;
  #endif
                /*
                 * Serialize against wake_up_new_task() such that if its
@@@ -7175,178 -6979,6 +7184,178 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +static void cpu_util_update_eff(struct cgroup_subsys_state *css)
 +{
 +      struct cgroup_subsys_state *top_css = css;
 +      struct uclamp_se *uc_parent = NULL;
 +      struct uclamp_se *uc_se = NULL;
 +      unsigned int eff[UCLAMP_CNT];
 +      enum uclamp_id clamp_id;
 +      unsigned int clamps;
 +
 +      css_for_each_descendant_pre(css, top_css) {
 +              uc_parent = css_tg(css)->parent
 +                      ? css_tg(css)->parent->uclamp : NULL;
 +
 +              for_each_clamp_id(clamp_id) {
  +                      /* Assume effective clamps match requested clamps */
 +                      eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
 +                      /* Cap effective clamps with parent's effective clamps */
 +                      if (uc_parent &&
 +                          eff[clamp_id] > uc_parent[clamp_id].value) {
 +                              eff[clamp_id] = uc_parent[clamp_id].value;
 +                      }
 +              }
 +              /* Ensure protection is always capped by limit */
 +              eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
 +
 +              /* Propagate most restrictive effective clamps */
 +              clamps = 0x0;
 +              uc_se = css_tg(css)->uclamp;
 +              for_each_clamp_id(clamp_id) {
 +                      if (eff[clamp_id] == uc_se[clamp_id].value)
 +                              continue;
 +                      uc_se[clamp_id].value = eff[clamp_id];
 +                      uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
 +                      clamps |= (0x1 << clamp_id);
 +              }
 +              if (!clamps) {
 +                      css = css_rightmost_descendant(css);
 +                      continue;
 +              }
 +
 +              /* Immediately update descendants RUNNABLE tasks */
 +              uclamp_update_active_tasks(css, clamps);
 +      }
 +}
 +
 +/*
 + * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
 + * C expression. Since there is no way to convert a macro argument (N) into a
 + * character constant, use two levels of macros.
 + */
 +#define _POW10(exp) ((unsigned int)1e##exp)
 +#define POW10(exp) _POW10(exp)
 +
 +struct uclamp_request {
 +#define UCLAMP_PERCENT_SHIFT  2
 +#define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
 +      s64 percent;
 +      u64 util;
 +      int ret;
 +};
 +
 +static inline struct uclamp_request
 +capacity_from_percent(char *buf)
 +{
 +      struct uclamp_request req = {
 +              .percent = UCLAMP_PERCENT_SCALE,
 +              .util = SCHED_CAPACITY_SCALE,
 +              .ret = 0,
 +      };
 +
 +      buf = strim(buf);
 +      if (strcmp(buf, "max")) {
 +              req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
 +                                           &req.percent);
 +              if (req.ret)
 +                      return req;
 +              if (req.percent > UCLAMP_PERCENT_SCALE) {
 +                      req.ret = -ERANGE;
 +                      return req;
 +              }
 +
 +              req.util = req.percent << SCHED_CAPACITY_SHIFT;
 +              req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
 +      }
 +
 +      return req;
 +}
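
capacity_from_percent() accepts either "max" or a percentage with two decimal places and scales it onto the 0..SCHED_CAPACITY_SCALE (1024) utilization range used by uclamp. The arithmetic in isolation, as a small stand-alone sketch; sscanf stands in for cgroup_parse_float() and the parsing assumes exactly two fractional digits:

#include <stdio.h>
#include <string.h>

#define SCHED_CAPACITY_SHIFT   10
#define SCHED_CAPACITY_SCALE   (1 << SCHED_CAPACITY_SHIFT)   /* 1024 */
#define UCLAMP_PERCENT_SCALE   10000                          /* 100 * 10^2 */

/* Convert "12.34"-style input (or "max") to a capacity value, mirroring
 * the rounding done by DIV_ROUND_CLOSEST_ULL in the kernel helper. */
static long long capacity_from_percent_sketch(const char *buf)
{
        unsigned int whole = 0, frac = 0;
        long long percent, util;

        if (!strcmp(buf, "max"))
                return SCHED_CAPACITY_SCALE;

        if (sscanf(buf, "%u.%2u", &whole, &frac) < 1)
                return -1;
        percent = (long long)whole * 100 + frac;   /* fixed point, two decimals */
        if (percent > UCLAMP_PERCENT_SCALE)
                return -1;

        util = percent << SCHED_CAPACITY_SHIFT;
        return (util + UCLAMP_PERCENT_SCALE / 2) / UCLAMP_PERCENT_SCALE;
}

int main(void)
{
        printf("50.00%% -> %lld\n", capacity_from_percent_sketch("50.00")); /* 512 */
        printf("max    -> %lld\n", capacity_from_percent_sketch("max"));    /* 1024 */
        return 0;
}
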
 +
 +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 +                              size_t nbytes, loff_t off,
 +                              enum uclamp_id clamp_id)
 +{
 +      struct uclamp_request req;
 +      struct task_group *tg;
 +
 +      req = capacity_from_percent(buf);
 +      if (req.ret)
 +              return req.ret;
 +
 +      mutex_lock(&uclamp_mutex);
 +      rcu_read_lock();
 +
 +      tg = css_tg(of_css(of));
 +      if (tg->uclamp_req[clamp_id].value != req.util)
 +              uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
 +
 +      /*
  +       * Because the conversion rounding is not recoverable, we keep track
  +       * of the exact requested value
 +       */
 +      tg->uclamp_pct[clamp_id] = req.percent;
 +
 +      /* Update effective clamps to track the most restrictive value */
 +      cpu_util_update_eff(of_css(of));
 +
 +      rcu_read_unlock();
 +      mutex_unlock(&uclamp_mutex);
 +
 +      return nbytes;
 +}
 +
 +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes,
 +                                  loff_t off)
 +{
 +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
 +}
 +
 +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes,
 +                                  loff_t off)
 +{
 +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
 +}
 +
 +static inline void cpu_uclamp_print(struct seq_file *sf,
 +                                  enum uclamp_id clamp_id)
 +{
 +      struct task_group *tg;
 +      u64 util_clamp;
 +      u64 percent;
 +      u32 rem;
 +
 +      rcu_read_lock();
 +      tg = css_tg(seq_css(sf));
 +      util_clamp = tg->uclamp_req[clamp_id].value;
 +      rcu_read_unlock();
 +
 +      if (util_clamp == SCHED_CAPACITY_SCALE) {
 +              seq_puts(sf, "max\n");
 +              return;
 +      }
 +
 +      percent = tg->uclamp_pct[clamp_id];
 +      percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
 +      seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
 +}
 +
 +static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
 +{
 +      cpu_uclamp_print(sf, UCLAMP_MIN);
 +      return 0;
 +}
 +
 +static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
 +{
 +      cpu_uclamp_print(sf, UCLAMP_MAX);
 +      return 0;
 +}
 +#endif /* CONFIG_UCLAMP_TASK_GROUP */
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -7691,20 -7323,6 +7700,20 @@@ static struct cftype cpu_legacy_files[
                .read_u64 = cpu_rt_period_read_uint,
                .write_u64 = cpu_rt_period_write_uint,
        },
 +#endif
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      {
 +              .name = "uclamp.min",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_min_show,
 +              .write = cpu_uclamp_min_write,
 +      },
 +      {
 +              .name = "uclamp.max",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_max_show,
 +              .write = cpu_uclamp_max_write,
 +      },
  #endif
        { }     /* Terminate */
  };
@@@ -7872,20 -7490,6 +7881,20 @@@ static struct cftype cpu_files[] = 
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
 +#endif
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      {
 +              .name = "uclamp.min",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_min_show,
 +              .write = cpu_uclamp_min_write,
 +      },
 +      {
 +              .name = "uclamp.max",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_max_show,
 +              .write = cpu_uclamp_max_write,
 +      },
  #endif
        { }     /* terminate */
  };
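
The new cpu.uclamp.min / cpu.uclamp.max files above appear in both the legacy and the default cpu controller hierarchies when CONFIG_UCLAMP_TASK_GROUP is set. A hedged usage sketch that caps one group's requests at 50% of capacity; the cgroup mount point and group name are assumptions about the local setup, and the writes need the cpu controller enabled for that group:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Assumed cgroup v2 path; adjust for the local mount point / group name. */
#define GRP "/sys/fs/cgroup/demo"

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                fprintf(stderr, "%s: %s\n", path, strerror(errno));
                return -1;
        }
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        /* Requests above 50% of capacity are clamped for tasks in this group;
         * writing "max" would restore the default (no clamping). */
        write_str(GRP "/cpu.uclamp.max", "50.00");
        write_str(GRP "/cpu.uclamp.min", "0.00");
        return 0;
}
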
diff --combined mm/madvise.c
index bac973b9f2cc71a11a5d010f38cc672adbdc1772,afe2b015ea58a3a9be6356b97775aa3674bc1234..88babcc384b9d4362b929d1771921dda30a80b67
  #include <linux/userfaultfd_k.h>
  #include <linux/hugetlb.h>
  #include <linux/falloc.h>
 +#include <linux/fadvise.h>
  #include <linux/sched.h>
  #include <linux/ksm.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
+ #include <linux/pagewalk.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/shmem_fs.h>
@@@ -226,19 -226,9 +227,9 @@@ static int swapin_walk_pmd_entry(pmd_t 
        return 0;
  }
  
- static void force_swapin_readahead(struct vm_area_struct *vma,
-               unsigned long start, unsigned long end)
- {
-       struct mm_walk walk = {
-               .mm = vma->vm_mm,
-               .pmd_entry = swapin_walk_pmd_entry,
-               .private = vma,
-       };
-       walk_page_range(start, end, &walk);
-       lru_add_drain();        /* Push any new pages onto the LRU now */
- }
+ static const struct mm_walk_ops swapin_walk_ops = {
+       .pmd_entry              = swapin_walk_pmd_entry,
+ };
  
  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
@@@ -276,12 -266,12 +267,13 @@@ static long madvise_willneed(struct vm_
                             unsigned long start, unsigned long end)
  {
        struct file *file = vma->vm_file;
 +      loff_t offset;
  
        *prev = vma;
  #ifdef CONFIG_SWAP
        if (!file) {
-               force_swapin_readahead(vma, start, end);
+               walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+               lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }
  
                return 0;
        }
  
 -      start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 -      if (end > vma->vm_end)
 -              end = vma->vm_end;
 -      end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 -
 -      force_page_cache_readahead(file->f_mapping, file, start, end - start);
 +      /*
 +       * Filesystem's fadvise may need to take various locks.  We need to
 +       * explicitly grab a reference because the vma (and hence the
 +       * vma's reference to the file) can go away as soon as we drop
 +       * mmap_sem.
 +       */
 +      *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
 +      get_file(file);
 +      up_read(&current->mm->mmap_sem);
 +      offset = (loff_t)(start - vma->vm_start)
 +                      + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 +      vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
 +      fput(file);
 +      down_read(&current->mm->mmap_sem);
        return 0;
  }
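
madvise_willneed() above now drops mmap_sem, pins the file and forwards the hint through vfs_fadvise(), so the filesystem's own fadvise handler sees POSIX_FADV_WILLNEED rather than a raw page-cache readahead. Nothing changes in the user-space calling convention; a minimal sketch of the file-backed MADV_WILLNEED case, where the path is only an example of a small readable file:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/etc/hostname";
        int fd = open(path, O_RDONLY);
        struct stat st;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                return 1;

        void *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        /* Hint that the whole mapping will be needed soon; with this series
         * the kernel forwards the hint to the filesystem's fadvise op. */
        if (madvise(p, st.st_size, MADV_WILLNEED))
                perror("madvise");

        printf("first byte: %c\n", *(char *)p);
        munmap(p, st.st_size);
        close(fd);
        return 0;
}
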
  
@@@ -450,20 -432,9 +442,9 @@@ next
        return 0;
  }
  
- static void madvise_free_page_range(struct mmu_gather *tlb,
-                            struct vm_area_struct *vma,
-                            unsigned long addr, unsigned long end)
- {
-       struct mm_walk free_walk = {
-               .pmd_entry = madvise_free_pte_range,
-               .mm = vma->vm_mm,
-               .private = tlb,
-       };
-       tlb_start_vma(tlb, vma);
-       walk_page_range(addr, end, &free_walk);
-       tlb_end_vma(tlb, vma);
- }
+ static const struct mm_walk_ops madvise_free_walk_ops = {
+       .pmd_entry              = madvise_free_pte_range,
+ };
  
  static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
        update_hiwater_rss(mm);
  
        mmu_notifier_invalidate_range_start(&range);
-       madvise_free_page_range(&tlb, vma, range.start, range.end);
+       tlb_start_vma(&tlb, vma);
+       walk_page_range(vma->vm_mm, range.start, range.end,
+                       &madvise_free_walk_ops, &tlb);
+       tlb_end_vma(&tlb, vma);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb, range.start, range.end);
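
The pagewalk hunks in this file (and the memcontrol.c ones below) all follow the new convention from this series: callbacks live in a const struct mm_walk_ops while per-walk state travels as the private argument of walk_page_range(). A kernel-style sketch of a walker under that convention, illustrative only and not usable outside a kernel tree; the pmd-counting callback itself is made up for the example:

#include <linux/mm.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>

/* Count populated PMD entries in a range (example walker, not in the tree). */
static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
                           unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;   /* per-walk state, kept out of the ops */

        if (!pmd_none(*pmd))
                (*count)++;
        return 0;
}

static const struct mm_walk_ops count_walk_ops = {
        .pmd_entry = count_pmd_entry,
};

static unsigned long count_pmds(struct mm_struct *mm,
                                unsigned long start, unsigned long end)
{
        unsigned long count = 0;

        down_read(&mm->mmap_sem);   /* walk_page_range() now asserts mmap_sem is held */
        walk_page_range(mm, start, end, &count_walk_ops, &count);
        up_read(&mm->mmap_sem);
        return count;
}
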
  
diff --combined mm/memcontrol.c
index 597d5810187256a6dd681948266de8a362855d69,9b2516a76be234475a0b6beb4f341060e95ec419..f3c15bb07cce4be6dc9eb6143da2625828c56c4a
@@@ -25,7 -25,7 +25,7 @@@
  #include <linux/page_counter.h>
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
  #include <linux/sched/mm.h>
  #include <linux/shmem_fs.h>
  #include <linux/hugetlb.h>
@@@ -87,10 -87,6 +87,10 @@@ int do_swap_account __read_mostly
  #define do_swap_account               0
  #endif
  
 +#ifdef CONFIG_CGROUP_WRITEBACK
 +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 +#endif
 +
  /* Whether legacy memory+swap accounting is active */
  static bool do_memsw_account(void)
  {
@@@ -756,13 -752,15 +756,13 @@@ void __mod_lruvec_state(struct lruvec *
        /* Update memcg */
        __mod_memcg_state(memcg, idx, val);
  
 +      /* Update lruvec */
 +      __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 +
        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
                struct mem_cgroup_per_node *pi;
  
 -              /*
 -               * Batch local counters to keep them in sync with
 -               * the hierarchical ones.
 -               */
 -              __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
                        atomic_long_add(x, &pi->lruvec_stat[idx]);
                x = 0;
@@@ -3262,72 -3260,6 +3262,72 @@@ static u64 mem_cgroup_read_u64(struct c
        }
  }
  
 +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
 +{
 +      unsigned long stat[MEMCG_NR_STAT];
 +      struct mem_cgroup *mi;
 +      int node, cpu, i;
 +      int min_idx, max_idx;
 +
 +      if (slab_only) {
 +              min_idx = NR_SLAB_RECLAIMABLE;
 +              max_idx = NR_SLAB_UNRECLAIMABLE;
 +      } else {
 +              min_idx = 0;
 +              max_idx = MEMCG_NR_STAT;
 +      }
 +
 +      for (i = min_idx; i < max_idx; i++)
 +              stat[i] = 0;
 +
 +      for_each_online_cpu(cpu)
 +              for (i = min_idx; i < max_idx; i++)
 +                      stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
 +
 +      for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 +              for (i = min_idx; i < max_idx; i++)
 +                      atomic_long_add(stat[i], &mi->vmstats[i]);
 +
 +      if (!slab_only)
 +              max_idx = NR_VM_NODE_STAT_ITEMS;
 +
 +      for_each_node(node) {
 +              struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
 +              struct mem_cgroup_per_node *pi;
 +
 +              for (i = min_idx; i < max_idx; i++)
 +                      stat[i] = 0;
 +
 +              for_each_online_cpu(cpu)
 +                      for (i = min_idx; i < max_idx; i++)
 +                              stat[i] += per_cpu(
 +                                      pn->lruvec_stat_cpu->count[i], cpu);
 +
 +              for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
 +                      for (i = min_idx; i < max_idx; i++)
 +                              atomic_long_add(stat[i], &pi->lruvec_stat[i]);
 +      }
 +}
 +
 +static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
 +{
 +      unsigned long events[NR_VM_EVENT_ITEMS];
 +      struct mem_cgroup *mi;
 +      int cpu, i;
 +
 +      for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
 +              events[i] = 0;
 +
 +      for_each_online_cpu(cpu)
 +              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
 +                      events[i] += per_cpu(memcg->vmstats_percpu->events[i],
 +                                           cpu);
 +
 +      for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
 +              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
 +                      atomic_long_add(events[i], &mi->vmevents[i]);
 +}
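
memcg_flush_percpu_vmstats() and memcg_flush_percpu_vmevents() above fold a cgroup's per-CPU counters into the atomic totals of the cgroup itself and every ancestor, so the hierarchy stays exact once the cgroup's percpu area is freed. The aggregation pattern reduced to plain C, with fixed arrays standing in for the percpu allocations and the parent pointer chain:

#include <stdio.h>

#define NR_CPUS   4
#define NR_ITEMS  3

struct group {
        struct group *parent;
        long percpu[NR_CPUS][NR_ITEMS];   /* stand-in for vmstats_percpu */
        long total[NR_ITEMS];             /* stand-in for the atomic vmstats */
};

/* Sum this group's per-CPU deltas and add them to every ancestor's totals. */
static void flush_percpu(struct group *g)
{
        long sum[NR_ITEMS] = { 0 };

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                for (int i = 0; i < NR_ITEMS; i++)
                        sum[i] += g->percpu[cpu][i];

        for (struct group *anc = g; anc; anc = anc->parent)
                for (int i = 0; i < NR_ITEMS; i++)
                        anc->total[i] += sum[i];
}

int main(void)
{
        struct group root = { 0 }, child = { .parent = &root };

        child.percpu[0][1] = 5;
        child.percpu[3][1] = 2;
        flush_percpu(&child);
        printf("child=%ld root=%ld\n", child.total[1], root.total[1]); /* 7 7 */
        return 0;
}
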
 +
  #ifdef CONFIG_MEMCG_KMEM
  static int memcg_online_kmem(struct mem_cgroup *memcg)
  {
@@@ -3377,14 -3309,7 +3377,14 @@@ static void memcg_offline_kmem(struct m
        if (!parent)
                parent = root_mem_cgroup;
  
 +      /*
 +       * Deactivate and reparent kmem_caches. Then flush percpu
 +       * slab statistics to have precise values at the parent and
 +       * all ancestor levels. It's required to keep slab stats
 +       * accurate after the reparenting of kmem_caches.
 +       */
        memcg_deactivate_kmem_caches(memcg, parent);
 +      memcg_flush_percpu_vmstats(memcg, true);
  
        kmemcg_id = memcg->kmemcg_id;
        BUG_ON(kmemcg_id < 0);
@@@ -4176,8 -4101,6 +4176,8 @@@ static int mem_cgroup_oom_control_write
  
  #ifdef CONFIG_CGROUP_WRITEBACK
  
 +#include <trace/events/writeback.h>
 +
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
  {
        return wb_domain_init(&memcg->cgwb_domain, gfp);
@@@ -4261,130 -4184,6 +4261,130 @@@ void mem_cgroup_wb_stats(struct bdi_wri
        }
  }
  
 +/*
 + * Foreign dirty flushing
 + *
 + * There's an inherent mismatch between memcg and writeback.  The former
  + * tracks ownership per-page while the latter per-inode.  This was a
 + * deliberate design decision because honoring per-page ownership in the
 + * writeback path is complicated, may lead to higher CPU and IO overheads
 + * and deemed unnecessary given that write-sharing an inode across
 + * different cgroups isn't a common use-case.
 + *
 + * Combined with inode majority-writer ownership switching, this works well
 + * enough in most cases but there are some pathological cases.  For
 + * example, let's say there are two cgroups A and B which keep writing to
 + * different but confined parts of the same inode.  B owns the inode and
 + * A's memory is limited far below B's.  A's dirty ratio can rise enough to
 + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
 + * triggering background writeback.  A will be slowed down without a way to
 + * make writeback of the dirty pages happen.
 + *
  + * Conditions like the above can lead to a cgroup getting repeatedly and
  + * severely throttled after making some progress after each
  + * dirty_expire_interval while the underlying IO device is almost
 + * completely idle.
 + *
 + * Solving this problem completely requires matching the ownership tracking
 + * granularities between memcg and writeback in either direction.  However,
 + * the more egregious behaviors can be avoided by simply remembering the
 + * most recent foreign dirtying events and initiating remote flushes on
 + * them when local writeback isn't enough to keep the memory clean enough.
 + *
 + * The following two functions implement such mechanism.  When a foreign
 + * page - a page whose memcg and writeback ownerships don't match - is
 + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
 + * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
 + * decides that the memcg needs to sleep due to high dirty ratio, it calls
 + * mem_cgroup_flush_foreign() which queues writeback on the recorded
 + * foreign bdi_writebacks which haven't expired.  Both the numbers of
 + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
 + * limited to MEMCG_CGWB_FRN_CNT.
 + *
 + * The mechanism only remembers IDs and doesn't hold any object references.
 + * As being wrong occasionally doesn't matter, updates and accesses to the
 + * records are lockless and racy.
 + */
 +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
 +                                           struct bdi_writeback *wb)
 +{
 +      struct mem_cgroup *memcg = page->mem_cgroup;
 +      struct memcg_cgwb_frn *frn;
 +      u64 now = get_jiffies_64();
 +      u64 oldest_at = now;
 +      int oldest = -1;
 +      int i;
 +
 +      trace_track_foreign_dirty(page, wb);
 +
 +      /*
 +       * Pick the slot to use.  If there is already a slot for @wb, keep
 +       * using it.  If not replace the oldest one which isn't being
 +       * written out.
 +       */
 +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
 +              frn = &memcg->cgwb_frn[i];
 +              if (frn->bdi_id == wb->bdi->id &&
 +                  frn->memcg_id == wb->memcg_css->id)
 +                      break;
 +              if (time_before64(frn->at, oldest_at) &&
 +                  atomic_read(&frn->done.cnt) == 1) {
 +                      oldest = i;
 +                      oldest_at = frn->at;
 +              }
 +      }
 +
 +      if (i < MEMCG_CGWB_FRN_CNT) {
 +              /*
 +               * Re-using an existing one.  Update timestamp lazily to
 +               * avoid making the cacheline hot.  We want them to be
 +               * reasonably up-to-date and significantly shorter than
 +               * dirty_expire_interval as that's what expires the record.
 +               * Use the shorter of 1s and dirty_expire_interval / 8.
 +               */
 +              unsigned long update_intv =
 +                      min_t(unsigned long, HZ,
 +                            msecs_to_jiffies(dirty_expire_interval * 10) / 8);
 +
 +              if (time_before64(frn->at, now - update_intv))
 +                      frn->at = now;
 +      } else if (oldest >= 0) {
 +              /* replace the oldest free one */
 +              frn = &memcg->cgwb_frn[oldest];
 +              frn->bdi_id = wb->bdi->id;
 +              frn->memcg_id = wb->memcg_css->id;
 +              frn->at = now;
 +      }
 +}
 +
 +/* issue foreign writeback flushes for recorded foreign dirtying events */
 +void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
 +{
 +      struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 +      unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
 +      u64 now = jiffies_64;
 +      int i;
 +
 +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
 +              struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
 +
 +              /*
 +               * If the record is older than dirty_expire_interval,
 +               * writeback on it has already started.  No need to kick it
 +               * off again.  Also, don't start a new one if there's
 +               * already one in flight.
 +               */
 +              if (time_after64(frn->at, now - intv) &&
 +                  atomic_read(&frn->done.cnt) == 1) {
 +                      frn->at = 0;
 +                      trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
 +                      cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
 +                                             WB_REASON_FOREIGN_FLUSH,
 +                                             &frn->done);
 +              }
 +      }
 +}
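
The recording side above keeps at most MEMCG_CGWB_FRN_CNT (bdi_id, memcg_id, timestamp) records per memcg: a slot already matching the foreign writeback domain is refreshed, otherwise the oldest slot with no flush in flight is recycled. The slot-selection policy on its own, as a runnable sketch; the struct mirrors memcg_cgwb_frn loosely, the in_flight flag stands in for done.cnt and the lazy-timestamp throttling is left out:

#include <stdint.h>
#include <stdio.h>

#define FRN_CNT 4

struct frn {               /* simplified memcg_cgwb_frn */
        int bdi_id;
        int memcg_id;
        uint64_t at;       /* jiffies-like timestamp, 0 = unused/expired */
        int in_flight;     /* stands in for done.cnt != 1 */
};

static struct frn slots[FRN_CNT];

static void track_foreign_dirty(int bdi_id, int memcg_id, uint64_t now)
{
        int oldest = -1, i;
        uint64_t oldest_at = now;

        for (i = 0; i < FRN_CNT; i++) {
                struct frn *frn = &slots[i];

                if (frn->bdi_id == bdi_id && frn->memcg_id == memcg_id)
                        break;                          /* reuse the existing slot */
                if (frn->at < oldest_at && !frn->in_flight) {
                        oldest = i;                     /* oldest idle candidate */
                        oldest_at = frn->at;
                }
        }

        if (i < FRN_CNT)
                slots[i].at = now;                      /* refresh the timestamp */
        else if (oldest >= 0)
                slots[oldest] = (struct frn){ bdi_id, memcg_id, now, 0 };
        /* else: every slot has a flush in flight; the event is simply dropped */
}

int main(void)
{
        track_foreign_dirty(10, 1, 100);
        track_foreign_dirty(11, 2, 200);
        track_foreign_dirty(10, 1, 300);   /* refreshes the first slot */
        for (int i = 0; i < FRN_CNT; i++)
                printf("slot%d bdi=%d at=%llu\n", i, slots[i].bdi_id,
                       (unsigned long long)slots[i].at);
        return 0;
}
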
 +
  #else /* CONFIG_CGROUP_WRITEBACK */
  
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@@ -4883,12 -4682,6 +4883,12 @@@ static void __mem_cgroup_free(struct me
  {
        int node;
  
 +      /*
 +       * Flush percpu vmstats and vmevents to guarantee the value correctness
 +       * on parent's and all ancestor levels.
 +       */
 +      memcg_flush_percpu_vmstats(memcg, false);
 +      memcg_flush_percpu_vmevents(memcg);
        for_each_node(node)
                free_mem_cgroup_per_node_info(memcg, node);
        free_percpu(memcg->vmstats_percpu);
@@@ -4907,7 -4700,6 +4907,7 @@@ static struct mem_cgroup *mem_cgroup_al
        struct mem_cgroup *memcg;
        unsigned int size;
        int node;
 +      int __maybe_unused i;
  
        size = sizeof(struct mem_cgroup);
        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
  #endif
  #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&memcg->cgwb_list);
 +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
 +              memcg->cgwb_frn[i].done =
 +                      __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
  #endif
        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
        return memcg;
@@@ -5083,12 -4872,7 +5083,12 @@@ static void mem_cgroup_css_released(str
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 +      int __maybe_unused i;
  
 +#ifdef CONFIG_CGROUP_WRITEBACK
 +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
 +              wb_wait_for_completion(&memcg->cgwb_frn[i].done);
 +#endif
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                static_branch_dec(&memcg_sockets_enabled_key);
  
@@@ -5499,17 -5283,16 +5499,16 @@@ static int mem_cgroup_count_precharge_p
        return 0;
  }
  
+ static const struct mm_walk_ops precharge_walk_ops = {
+       .pmd_entry      = mem_cgroup_count_precharge_pte_range,
+ };
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
        unsigned long precharge;
  
-       struct mm_walk mem_cgroup_count_precharge_walk = {
-               .pmd_entry = mem_cgroup_count_precharge_pte_range,
-               .mm = mm,
-       };
        down_read(&mm->mmap_sem);
-       walk_page_range(0, mm->highest_vm_end,
-                       &mem_cgroup_count_precharge_walk);
+       walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
        up_read(&mm->mmap_sem);
  
        precharge = mc.precharge;
@@@ -5778,13 -5561,12 +5777,12 @@@ put:                 /* get_mctgt_type() gets the pag
        return ret;
  }
  
+ static const struct mm_walk_ops charge_walk_ops = {
+       .pmd_entry      = mem_cgroup_move_charge_pte_range,
+ };
  static void mem_cgroup_move_charge(void)
  {
-       struct mm_walk mem_cgroup_move_charge_walk = {
-               .pmd_entry = mem_cgroup_move_charge_pte_range,
-               .mm = mc.mm,
-       };
        lru_add_drain_all();
        /*
         * Signal lock_page_memcg() to take the memcg's move_lock
@@@ -5810,7 -5592,8 +5808,8 @@@ retry
         * When we have consumed all precharges and failed in doing
         * additional charge, the page walk just aborts.
         */
-       walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+       walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+                       NULL);
  
        up_read(&mc.mm->mmap_sem);
        atomic_dec(&mc.from->moving_account);
diff --combined mm/page_alloc.c
index 6991ccec9c322ffb843110bb69cf2326d64b266c,b39baa2b1fafcaf0674e28c6ae63c2c2a1d1485f..ff5484fdbdf9908a9064129f2990a13f4cfc2247
@@@ -2238,12 -2238,27 +2238,12 @@@ static int move_freepages(struct zone *
        unsigned int order;
        int pages_moved = 0;
  
 -#ifndef CONFIG_HOLES_IN_ZONE
 -      /*
 -       * page_zone is not safe to call in this context when
 -       * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
 -       * anyway as we check zone boundaries in move_freepages_block().
 -       * Remove at a later date when no bug reports exist related to
 -       * grouping pages by mobility
 -       */
 -      VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
 -                pfn_valid(page_to_pfn(end_page)) &&
 -                page_zone(start_page) != page_zone(end_page));
 -#endif
        for (page = start_page; page <= end_page;) {
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
                        continue;
                }
  
 -              /* Make sure we are not inadvertently changing nodes */
 -              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 -
                if (!PageBuddy(page)) {
                        /*
                         * We assume that pages that could be isolated for
                        continue;
                }
  
 +              /* Make sure we are not inadvertently changing nodes */
 +              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 +              VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 +
                order = page_order(page);
                move_to_free_area(page, &zone->free_area[order], migratetype);
                page += 1 << order;
@@@ -3511,7 -3522,7 +3511,7 @@@ bool zone_watermark_ok_safe(struct zon
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
 -                              RECLAIM_DISTANCE;
 +                              node_reclaim_distance;
  }
  #else /* CONFIG_NUMA */
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@@ -5971,7 -5982,7 +5971,7 @@@ void __ref memmap_init_zone_device(stru
                }
        }
  
-       pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+       pr_info("%s initialised %lu pages in %ums\n", __func__,
                size, jiffies_to_msecs(jiffies - start));
  }
  