Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 Sep 2019 17:07:42 +0000 (10:07 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 Sep 2019 17:07:42 +0000 (10:07 -0700)
diff --combined arch/s390/mm/gmap.c

index cd8e03f04d6daac463af57b15fe26dcc7ddc658d,bd78d504fdade86dcaac497702d9aa69d571a7c8..edcdca97e85eeecbddb4fb650aaa5c0432a60259
--- 1/arch/s390/mm/gmap.c
--- 2/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@@ -9,7 -9,7 +9,7 @@@
    */
   
   #include <linux/kernel.h>
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
   #include <linux/swap.h>
   #include <linux/smp.h>
   #include <linux/spinlock.h>
@@@ -67,7 -67,7 +67,7 @@@ static struct gmap *gmap_alloc(unsigne
         INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
         spin_lock_init(&gmap->guest_table_lock);
         spin_lock_init(&gmap->shadow_lock);
- -      atomic_set(&gmap->ref_count, 1);
+ +      refcount_set(&gmap->ref_count, 1);
         page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
         if (!page)
                 goto out_free;
@@@ -214,7 -214,7 +214,7 @@@ static void gmap_free(struct gmap *gmap
    */
   struct gmap *gmap_get(struct gmap *gmap)
   {
- -      atomic_inc(&gmap->ref_count);
+ +      refcount_inc(&gmap->ref_count);
         return gmap;
   }
   EXPORT_SYMBOL_GPL(gmap_get);
@@@ -227,7 -227,7 +227,7 @@@
    */
   void gmap_put(struct gmap *gmap)
   {
- -      if (atomic_dec_return(&gmap->ref_count) == 0)
+ +      if (refcount_dec_and_test(&gmap->ref_count))
                 gmap_free(gmap);
   }
   EXPORT_SYMBOL_GPL(gmap_put);
@@@ -1594,7 -1594,7 +1594,7 @@@ static struct gmap *gmap_find_shadow(st
                         continue;
                 if (!sg->initialized)
                         return ERR_PTR(-EAGAIN);
- -              atomic_inc(&sg->ref_count);
+ +              refcount_inc(&sg->ref_count);
                 return sg;
         }
         return NULL;
@@@ -1682,7 -1682,7 +1682,7 @@@ struct gmap *gmap_shadow(struct gmap *p
                         }
                 }
         }
- -      atomic_set(&new->ref_count, 2);
+ +      refcount_set(&new->ref_count, 2);
         list_add(&new->list, &parent->children);
         if (asce & _ASCE_REAL_SPACE) {
                 /* nothing to protect, return right away */
@@@ -2521,13 -2521,9 +2521,9 @@@ static int __zap_zero_pages(pmd_t *pmd
         return 0;
   }
   
- static inline void zap_zero_pages(struct mm_struct *mm)
- {
-       struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
- 
-       walk.mm = mm;
-       walk_page_range(0, TASK_SIZE, &walk);
- }
+ static const struct mm_walk_ops zap_zero_walk_ops = {
+       .pmd_entry      = __zap_zero_pages,
+ };
   
   /*
    * switch on pgstes for its userspace process (for kvm)
@@@ -2546,7 -2542,7 +2542,7 @@@ int s390_enable_sie(void
         mm->context.has_pgste = 1;
         /* split thp mappings and disable thp for future mappings */
         thp_split_mm(mm);
-       zap_zero_pages(mm);
+       walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
         up_write(&mm->mmap_sem);
         return 0;
   }
@@@ -2589,12 -2585,13 +2585,13 @@@ static int __s390_enable_skey_hugetlb(p
         return 0;
   }
   
+ static const struct mm_walk_ops enable_skey_walk_ops = {
+       .hugetlb_entry          = __s390_enable_skey_hugetlb,
+       .pte_entry              = __s390_enable_skey_pte,
+ };
+ 
   int s390_enable_skey(void)
   {
-       struct mm_walk walk = {
-               .hugetlb_entry = __s390_enable_skey_hugetlb,
-               .pte_entry = __s390_enable_skey_pte,
-       };
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma;
         int rc = 0;
@@@ -2614,8 -2611,7 +2611,7 @@@
         }
         mm->def_flags &= ~VM_MERGEABLE;
   
-       walk.mm = mm;
-       walk_page_range(0, TASK_SIZE, &walk);
+       walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
   
   out_up:
         up_write(&mm->mmap_sem);
@@@ -2633,13 -2629,14 +2629,14 @@@ static int __s390_reset_cmma(pte_t *pte
         return 0;
   }
   
+ static const struct mm_walk_ops reset_cmma_walk_ops = {
+       .pte_entry              = __s390_reset_cmma,
+ };
+ 
   void s390_reset_cmma(struct mm_struct *mm)
   {
-       struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
- 
         down_write(&mm->mmap_sem);
-       walk.mm = mm;
-       walk_page_range(0, TASK_SIZE, &walk);
+       walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
         up_write(&mm->mmap_sem);
   }
   EXPORT_SYMBOL_GPL(s390_reset_cmma);
diff --combined drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index 48a2070e72f2cbc20245ac7e8e7c409b66f0ba49,9a05a37603bdc72f041b1f8e79ed16d73809f039..bdf849da32e42e11d28017e59a18e47d9ca41898
--- 1/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
--- 2/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@@ -35,6 -35,7 +35,7 @@@
   #include <linux/pm_runtime.h>
   #include <linux/vga_switcheroo.h>
   #include <drm/drm_probe_helper.h>
+ #include <linux/mmu_notifier.h>
   
   #include "amdgpu.h"
   #include "amdgpu_irq.h"
@@@ -79,10 -80,9 +80,10 @@@
    * - 3.31.0 - Add support for per-flip tiling attribute changes with DC
    * - 3.32.0 - Add syncobj timeline support to AMDGPU_CS.
    * - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS.
+ + * - 3.34.0 - Non-DC can flip correctly between buffers with different pitches
    */
   #define KMS_DRIVER_MAJOR      3
- -#define KMS_DRIVER_MINOR      33
+ +#define KMS_DRIVER_MINOR      34
   #define KMS_DRIVER_PATCHLEVEL 0
   
   #define AMDGPU_MAX_TIMEOUT_PARAM_LENTH        256
@@@ -143,7 -143,7 +144,7 @@@ int amdgpu_async_gfx_ring = 1
   int amdgpu_mcbp = 0;
   int amdgpu_discovery = -1;
   int amdgpu_mes = 0;
- -int amdgpu_noretry;
+ +int amdgpu_noretry = 1;
   
   struct amdgpu_mgpu_info mgpu_info = {
         .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@@ -611,7 -611,7 +612,7 @@@ MODULE_PARM_DESC(mes
   module_param_named(mes, amdgpu_mes, int, 0444);
   
   MODULE_PARM_DESC(noretry,
- -      "Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");
+ +      "Disable retry faults (0 = retry enabled, 1 = retry disabled (default))");
   module_param_named(noretry, amdgpu_noretry, int, 0644);
   
   #ifdef CONFIG_HSA_AMD
@@@ -997,11 -997,6 +998,11 @@@ static const struct pci_device_id pciid
         /* Raven */
         {0x1002, 0x15dd, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU},
         {0x1002, 0x15d8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU},
+ +      /* Arcturus */
+ +      {0x1002, 0x738C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
+ +      {0x1002, 0x7388, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
+ +      {0x1002, 0x738E, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
+ +      {0x1002, 0x7390, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
         /* Navi10 */
         {0x1002, 0x7310, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
         {0x1002, 0x7312, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
@@@ -1010,11 -1005,6 +1011,11 @@@
         {0x1002, 0x731A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
         {0x1002, 0x731B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
         {0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
+ +      /* Navi14 */
+ +      {0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14},
+ +
+ +      /* Renoir */
+ +      {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT},
   
         {0, 0, 0}
   };
@@@ -1103,21 -1093,21 +1104,21 @@@ amdgpu_pci_shutdown(struct pci_dev *pde
          * unfortunately we can't detect certain
          * hypervisors so just do this all the time.
          */
+ +      adev->mp1_state = PP_MP1_STATE_UNLOAD;
         amdgpu_device_ip_suspend(adev);
+ +      adev->mp1_state = PP_MP1_STATE_NONE;
   }
   
   static int amdgpu_pmops_suspend(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
         return amdgpu_device_suspend(drm_dev, true, true);
   }
   
   static int amdgpu_pmops_resume(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
         /* GPU comes up enabled by the bios on resume */
         if (amdgpu_device_is_px(drm_dev)) {
@@@ -1131,29 -1121,33 +1132,29 @@@
   
   static int amdgpu_pmops_freeze(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
         return amdgpu_device_suspend(drm_dev, false, true);
   }
   
   static int amdgpu_pmops_thaw(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
         return amdgpu_device_resume(drm_dev, false, true);
   }
   
   static int amdgpu_pmops_poweroff(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
         return amdgpu_device_suspend(drm_dev, true, true);
   }
   
   static int amdgpu_pmops_restore(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
         return amdgpu_device_resume(drm_dev, false, true);
   }
   
@@@ -1212,7 -1206,8 +1213,7 @@@ static int amdgpu_pmops_runtime_resume(
   
   static int amdgpu_pmops_runtime_idle(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
         struct drm_crtc *crtc;
   
         if (!amdgpu_device_is_px(drm_dev)) {
@@@ -1379,7 -1374,7 +1380,7 @@@ static struct drm_driver kms_driver = 
         .driver_features =
             DRIVER_USE_AGP | DRIVER_ATOMIC |
             DRIVER_GEM |
- -          DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
+ +          DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
         .load = amdgpu_driver_load_kms,
         .open = amdgpu_driver_open_kms,
         .postclose = amdgpu_driver_postclose_kms,
@@@ -1403,6 -1398,7 +1404,6 @@@
         .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
         .gem_prime_export = amdgpu_gem_prime_export,
         .gem_prime_import = amdgpu_gem_prime_import,
- -      .gem_prime_res_obj = amdgpu_gem_prime_res_obj,
         .gem_prime_get_sg_table = amdgpu_gem_prime_get_sg_table,
         .gem_prime_import_sg_table = amdgpu_gem_prime_import_sg_table,
         .gem_prime_vmap = amdgpu_gem_prime_vmap,
@@@ -1469,6 -1465,7 +1470,7 @@@ static void __exit amdgpu_exit(void
         amdgpu_unregister_atpx_handler();
         amdgpu_sync_fini();
         amdgpu_fence_slab_fini();
+       mmu_notifier_synchronize();
   }
   
   module_init(amdgpu_init);
diff --combined drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c

index f1f8cdd695d3f6a556c74351c3eb8dc4547e200e,60b9fc9561d7f5467776be71fcde8b60a2d6afbb..31d4deb5d294846aa6fc66c9f7ec8a7963604854
--- 1/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
--- 2/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@@ -179,7 -179,7 +179,7 @@@ static void amdgpu_mn_invalidate_node(s
                 if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
                         continue;
   
- -              r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
+ +              r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
                         true, false, MAX_SCHEDULE_TIMEOUT);
                 if (r <= 0)
                         DRM_ERROR("(%ld) failed to wait for user bo\n", r);
@@@ -195,13 -195,14 +195,14 @@@
    * Block for operations on BOs to finish and mark pages as accessed and
    * potentially dirty.
    */
- static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
-                       const struct hmm_update *update)
+ static int
+ amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
+                             const struct mmu_notifier_range *update)
   {
         struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
         unsigned long start = update->start;
         unsigned long end = update->end;
-       bool blockable = update->blockable;
+       bool blockable = mmu_notifier_range_blockable(update);
         struct interval_tree_node *it;
   
         /* notification is exclusive, but interval is inclusive */
@@@ -243,13 -244,14 +244,14 @@@
    * necessitates evicting all user-mode queues of the process. The BOs
    * are restorted in amdgpu_mn_invalidate_range_end_hsa.
    */
- static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
-                       const struct hmm_update *update)
+ static int
+ amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
+                             const struct mmu_notifier_range *update)
   {
         struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
         unsigned long start = update->start;
         unsigned long end = update->end;
-       bool blockable = update->blockable;
+       bool blockable = mmu_notifier_range_blockable(update);
         struct interval_tree_node *it;
   
         /* notification is exclusive, but interval is inclusive */
@@@ -482,6 -484,5 +484,5 @@@ void amdgpu_hmm_init_range(struct hmm_r
                 range->flags = hmm_range_flags;
                 range->values = hmm_range_values;
                 range->pfn_shift = PAGE_SHIFT;
-               INIT_LIST_HEAD(&range->list);
         }
   }
diff --combined drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

index 13b144c8f67d68c2c261d16efbfea8772ff0a95b,8bf79288c4e29b240c45c23eef024b720be7f49c..dff41d0a85fe969b7bc96323ba66d235d210fed3
--- 1/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
--- 2/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@@ -227,7 -227,7 +227,7 @@@ static int amdgpu_verify_access(struct 
   
         if (amdgpu_ttm_tt_get_usermm(bo->ttm))
                 return -EPERM;
- -      return drm_vma_node_verify_access(&abo->gem_base.vma_node,
+ +      return drm_vma_node_verify_access(&abo->tbo.base.vma_node,
                                           filp->private_data);
   }
   
@@@ -303,7 -303,7 +303,7 @@@ int amdgpu_ttm_copy_mem_to_mem(struct a
                                struct amdgpu_copy_mem *src,
                                struct amdgpu_copy_mem *dst,
                                uint64_t size,
- -                             struct reservation_object *resv,
+ +                             struct dma_resv *resv,
                                struct dma_fence **f)
   {
         struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
@@@ -440,26 -440,10 +440,26 @@@ static int amdgpu_move_blit(struct ttm_
   
         r = amdgpu_ttm_copy_mem_to_mem(adev, &src, &dst,
                                        new_mem->num_pages << PAGE_SHIFT,
- -                                     bo->resv, &fence);
+ +                                     bo->base.resv, &fence);
         if (r)
                 goto error;
   
+ +      /* clear the space being freed */
+ +      if (old_mem->mem_type == TTM_PL_VRAM &&
+ +          (ttm_to_amdgpu_bo(bo)->flags &
+ +           AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
+ +              struct dma_fence *wipe_fence = NULL;
+ +
+ +              r = amdgpu_fill_buffer(ttm_to_amdgpu_bo(bo), AMDGPU_POISON,
+ +                                     NULL, &wipe_fence);
+ +              if (r) {
+ +                      goto error;
+ +              } else if (wipe_fence) {
+ +                      dma_fence_put(fence);
+ +                      fence = wipe_fence;
+ +              }
+ +      }
+ +
         /* Always block for VM page tables before committing the new location */
         if (bo->type == ttm_bo_type_kernel)
                 r = ttm_bo_move_accel_cleanup(bo, fence, true, new_mem);
@@@ -794,7 -778,6 +794,6 @@@ int amdgpu_ttm_tt_get_user_pages(struc
         struct hmm_range *range;
         unsigned long i;
         uint64_t *pfns;
-       int retry = 0;
         int r = 0;
   
         if (!mm) /* Happens during process shutdown */
@@@ -835,10 -818,11 +834,11 @@@
                                 0 : range->flags[HMM_PFN_WRITE];
         range->pfn_flags_mask = 0;
         range->pfns = pfns;
-       hmm_range_register(range, mirror, start,
-                          start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT);
+       range->start = start;
+       range->end = start + ttm->num_pages * PAGE_SIZE;
+ 
+       hmm_range_register(range, mirror);
   
- retry:
         /*
          * Just wait for range to be valid, safe to ignore return value as we
          * will use the return value of hmm_range_fault() below under the
@@@ -847,24 -831,12 +847,12 @@@
         hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
   
         down_read(&mm->mmap_sem);
- 
-       r = hmm_range_fault(range, true);
-       if (unlikely(r < 0)) {
-               if (likely(r == -EAGAIN)) {
-                       /*
-                        * return -EAGAIN, mmap_sem is dropped
-                        */
-                       if (retry++ < MAX_RETRY_HMM_RANGE_FAULT)
-                               goto retry;
-                       else
-                               pr_err("Retry hmm fault too many times\n");
-               }
- 
-               goto out_up_read;
-       }
- 
+       r = hmm_range_fault(range, 0);
         up_read(&mm->mmap_sem);
   
+       if (unlikely(r < 0))
+               goto out_free_pfns;
+ 
         for (i = 0; i < ttm->num_pages; i++) {
                 pages[i] = hmm_device_entry_to_page(range, pfns[i]);
                 if (unlikely(!pages[i])) {
@@@ -880,9 -852,6 +868,6 @@@
   
         return 0;
   
- out_up_read:
-       if (likely(r != -EAGAIN))
-               up_read(&mm->mmap_sem);
   out_free_pfns:
         hmm_range_unregister(range);
         kvfree(pfns);
@@@ -1486,7 -1455,7 +1471,7 @@@ static bool amdgpu_ttm_bo_eviction_valu
   {
         unsigned long num_pages = bo->mem.num_pages;
         struct drm_mm_node *node = bo->mem.mm_node;
- -      struct reservation_object_list *flist;
+ +      struct dma_resv_list *flist;
         struct dma_fence *f;
         int i;
   
@@@ -1494,18 -1463,18 +1479,18 @@@
          * cleanly handle page faults.
          */
         if (bo->type == ttm_bo_type_kernel &&
- -          !reservation_object_test_signaled_rcu(bo->resv, true))
+ +          !dma_resv_test_signaled_rcu(bo->base.resv, true))
                 return false;
   
         /* If bo is a KFD BO, check if the bo belongs to the current process.
          * If true, then return false as any KFD process needs all its BOs to
          * be resident to run successfully
          */
- -      flist = reservation_object_get_list(bo->resv);
+ +      flist = dma_resv_get_list(bo->base.resv);
         if (flist) {
                 for (i = 0; i < flist->shared_count; ++i) {
                         f = rcu_dereference_protected(flist->shared[i],
- -                              reservation_object_held(bo->resv));
+ +                              dma_resv_held(bo->base.resv));
                         if (amdkfd_fence_check_mm(f, current->mm))
                                 return false;
                 }
@@@ -1615,7 -1584,6 +1600,7 @@@ static struct ttm_bo_driver amdgpu_bo_d
         .move = &amdgpu_bo_move,
         .verify_access = &amdgpu_verify_access,
         .move_notify = &amdgpu_bo_move_notify,
+ +      .release_notify = &amdgpu_bo_release_notify,
         .fault_reserve_notify = &amdgpu_bo_fault_reserve_notify,
         .io_mem_reserve = &amdgpu_ttm_io_mem_reserve,
         .io_mem_free = &amdgpu_ttm_io_mem_free,
@@@ -1738,7 -1706,6 +1723,7 @@@ int amdgpu_ttm_init(struct amdgpu_devic
         uint64_t gtt_size;
         int r;
         u64 vis_vram_limit;
+ +      void *stolen_vga_buf;
   
         mutex_init(&adev->mman.gtt_window_lock);
   
@@@ -1746,7 -1713,7 +1731,7 @@@
         r = ttm_bo_device_init(&adev->mman.bdev,
                                &amdgpu_bo_driver,
                                adev->ddev->anon_inode->i_mapping,
- -                             adev->need_dma32);
+ +                             dma_addressing_limited(adev->dev));
         if (r) {
                 DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
                 return r;
@@@ -1793,7 -1760,7 +1778,7 @@@
         r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE,
                                     AMDGPU_GEM_DOMAIN_VRAM,
                                     &adev->stolen_vga_memory,
- -                                  NULL, NULL);
+ +                                  NULL, &stolen_vga_buf);
         if (r)
                 return r;
         DRM_INFO("amdgpu: %uM of VRAM memory ready\n",
@@@ -1857,9 -1824,8 +1842,9 @@@
    */
   void amdgpu_ttm_late_init(struct amdgpu_device *adev)
   {
+ +      void *stolen_vga_buf;
         /* return the VGA stolen memory (if any) back to VRAM */
- -      amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
+ +      amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
   }
   
   /**
@@@ -2011,7 -1977,7 +1996,7 @@@ error_free
   
   int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
                        uint64_t dst_offset, uint32_t byte_count,
- -                     struct reservation_object *resv,
+ +                     struct dma_resv *resv,
                        struct dma_fence **fence, bool direct_submit,
                        bool vm_needs_flush)
   {
@@@ -2085,7 -2051,7 +2070,7 @@@ error_free
   
   int amdgpu_fill_buffer(struct amdgpu_bo *bo,
                        uint32_t src_data,
- -                     struct reservation_object *resv,
+ +                     struct dma_resv *resv,
                        struct dma_fence **fence)
   {
         struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
diff --combined drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 3bb75d11a6627339bde0a9769b78b0bb9cf94b2a,9450e20d17093bd4670422c4356c09fbbe1f1dd6..c89326125d71170509531a8ffc434db3101f1908
--- 1/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
--- 2/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@@ -195,7 -195,6 +195,7 @@@ struct kfd_event_interrupt_class 
   
   struct kfd_device_info {
         enum amd_asic_type asic_family;
+ +      const char *asic_name;
         const struct kfd_event_interrupt_class *event_interrupt_class;
         unsigned int max_pasid_bits;
         unsigned int max_no_of_hqd;
@@@ -687,9 -686,6 +687,6 @@@ struct kfd_process 
         /* We want to receive a notification when the mm_struct is destroyed */
         struct mmu_notifier mmu_notifier;
   
-       /* Use for delayed freeing of kfd_process structure */
-       struct rcu_head rcu;
- 
         unsigned int pasid;
         unsigned int doorbell_index;
   
diff --combined drivers/gpu/drm/amd/amdkfd/kfd_process.c

index 0c6ac043ae3cc54cad8284e3eda06eb027361d22,e5e326f2f2675edc88dce8eede0a93a33d2cc4ae..40e3fc0c6942120b83e724038a537ed11f756e9f
--- 1/drivers/gpu/drm/amd/amdkfd/kfd_process.c
--- 2/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@@ -62,8 -62,8 +62,8 @@@ static struct workqueue_struct *kfd_res
   
   static struct kfd_process *find_process(const struct task_struct *thread);
   static void kfd_process_ref_release(struct kref *ref);
- static struct kfd_process *create_process(const struct task_struct *thread,
-                                       struct file *filep);
+ static struct kfd_process *create_process(const struct task_struct *thread);
+ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
   
   static void evict_process_worker(struct work_struct *work);
   static void restore_process_worker(struct work_struct *work);
@@@ -289,7 -289,15 +289,15 @@@ struct kfd_process *kfd_create_process(
         if (process) {
                 pr_debug("Process already found\n");
         } else {
-               process = create_process(thread, filep);
+               process = create_process(thread);
+               if (IS_ERR(process))
+                       goto out;
+ 
+               ret = kfd_process_init_cwsr_apu(process, filep);
+               if (ret) {
+                       process = ERR_PTR(ret);
+                       goto out;
+               }
   
                 if (!procfs.kobj)
                         goto out;
@@@ -478,11 -486,9 +486,9 @@@ static void kfd_process_ref_release(str
         queue_work(kfd_process_wq, &p->release_work);
   }
   
- static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+ static void kfd_process_free_notifier(struct mmu_notifier *mn)
   {
-       struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
- 
-       kfd_unref_process(p);
+       kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
   }
   
   static void kfd_process_notifier_release(struct mmu_notifier *mn,
@@@ -534,12 -540,12 +540,12 @@@
   
         mutex_unlock(&p->mutex);
   
-       mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
-       mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
+       mmu_notifier_put(&p->mmu_notifier);
   }
   
   static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
         .release = kfd_process_notifier_release,
+       .free_notifier = kfd_process_free_notifier,
   };
   
   static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
@@@ -609,81 -615,69 +615,69 @@@ static int kfd_process_device_init_cwsr
         return 0;
   }
   
- static struct kfd_process *create_process(const struct task_struct *thread,
-                                       struct file *filep)
+ /*
+  * On return the kfd_process is fully operational and will be freed when the
+  * mm is released
+  */
+ static struct kfd_process *create_process(const struct task_struct *thread)
   {
         struct kfd_process *process;
         int err = -ENOMEM;
   
         process = kzalloc(sizeof(*process), GFP_KERNEL);
- 
         if (!process)
                 goto err_alloc_process;
   
-       process->pasid = kfd_pasid_alloc();
-       if (process->pasid == 0)
-               goto err_alloc_pasid;
- 
-       if (kfd_alloc_process_doorbells(process) < 0)
-               goto err_alloc_doorbells;
- 
         kref_init(&process->ref);
- 
         mutex_init(&process->mutex);
- 
         process->mm = thread->mm;
- 
-       /* register notifier */
-       process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
-       err = mmu_notifier_register(&process->mmu_notifier, process->mm);
-       if (err)
-               goto err_mmu_notifier;
- 
-       hash_add_rcu(kfd_processes_table, &process->kfd_processes,
-                       (uintptr_t)process->mm);
- 
         process->lead_thread = thread->group_leader;
-       get_task_struct(process->lead_thread);
- 
         INIT_LIST_HEAD(&process->per_device_data);
- 
+       INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
+       INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
+       process->last_restore_timestamp = get_jiffies_64();
         kfd_event_init_process(process);
+       process->is_32bit_user_mode = in_compat_syscall();
+ 
+       process->pasid = kfd_pasid_alloc();
+       if (process->pasid == 0)
+               goto err_alloc_pasid;
+ 
+       if (kfd_alloc_process_doorbells(process) < 0)
+               goto err_alloc_doorbells;
   
         err = pqm_init(&process->pqm, process);
         if (err != 0)
                 goto err_process_pqm_init;
   
         /* init process apertures*/
-       process->is_32bit_user_mode = in_compat_syscall();
         err = kfd_init_apertures(process);
         if (err != 0)
                 goto err_init_apertures;
   
-       INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
-       INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
-       process->last_restore_timestamp = get_jiffies_64();
- 
-       err = kfd_process_init_cwsr_apu(process, filep);
+       /* Must be last, have to use release destruction after this */
+       process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
+       err = mmu_notifier_register(&process->mmu_notifier, process->mm);
         if (err)
-               goto err_init_cwsr;
+               goto err_register_notifier;
+ 
+       get_task_struct(process->lead_thread);
+       hash_add_rcu(kfd_processes_table, &process->kfd_processes,
+                       (uintptr_t)process->mm);
   
         return process;
   
- err_init_cwsr:
+ err_register_notifier:
         kfd_process_free_outstanding_kfd_bos(process);
         kfd_process_destroy_pdds(process);
   err_init_apertures:
         pqm_uninit(&process->pqm);
   err_process_pqm_init:
-       hash_del_rcu(&process->kfd_processes);
-       synchronize_rcu();
-       mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
- err_mmu_notifier:
-       mutex_destroy(&process->mutex);
         kfd_free_process_doorbells(process);
   err_alloc_doorbells:
         kfd_pasid_free(process->pasid);
   err_alloc_pasid:
+       mutex_destroy(&process->mutex);
         kfree(process);
   err_alloc_process:
         return ERR_PTR(err);
@@@ -801,8 -795,6 +795,8 @@@ int kfd_process_device_init_vm(struct k
                 return ret;
         }
   
+ +      amdgpu_vm_set_task_info(pdd->vm);
+ +
         ret = kfd_process_device_reserve_ib_mem(pdd);
         if (ret)
                 goto err_reserve_ib_mem;
@@@ -1044,6 -1036,7 +1038,6 @@@ static void restore_process_worker(stru
   {
         struct delayed_work *dwork;
         struct kfd_process *p;
- -      struct kfd_process_device *pdd;
         int ret = 0;
   
         dwork = to_delayed_work(work);
@@@ -1052,6 -1045,16 +1046,6 @@@
          * lifetime of this thread, kfd_process p will be valid
          */
         p = container_of(dwork, struct kfd_process, restore_work);
- -
- -      /* Call restore_process_bos on the first KGD device. This function
- -       * takes care of restoring the whole process including other devices.
- -       * Restore can fail if enough memory is not available. If so,
- -       * reschedule again.
- -       */
- -      pdd = list_first_entry(&p->per_device_data,
- -                             struct kfd_process_device,
- -                             per_device_list);
- -
         pr_debug("Started restoring pasid %d\n", p->pasid);
   
         /* Setting last_restore_timestamp before successful restoration.
diff --combined drivers/gpu/drm/nouveau/nouveau_drm.c

index bdc948352467a97fa702b522992f9027b8c8e714,a0e48a482452d70a6592d27b06ffb57667beeb6c..2cd83849600f34b2ff22ca9990af042e4ad8a16b
--- 1/drivers/gpu/drm/nouveau/nouveau_drm.c
--- 2/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@@ -28,10 -28,10 +28,11 @@@
   #include <linux/pci.h>
   #include <linux/pm_runtime.h>
   #include <linux/vga_switcheroo.h>
+ #include <linux/mmu_notifier.h>
   
- -#include <drm/drmP.h>
   #include <drm/drm_crtc_helper.h>
+ +#include <drm/drm_ioctl.h>
+ +#include <drm/drm_vblank.h>
   
   #include <core/gpuobj.h>
   #include <core/option.h>
@@@ -1047,20 -1047,20 +1048,20 @@@ nouveau_drm_postclose(struct drm_devic
   
   static const struct drm_ioctl_desc
   nouveau_ioctls[] = {
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GETPARAM, nouveau_abi16_ioctl_getparam, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_SETPARAM, nouveau_abi16_ioctl_setparam, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_ALLOC, nouveau_abi16_ioctl_channel_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_FREE, nouveau_abi16_ioctl_channel_free, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_AUTH|DRM_RENDER_ALLOW),
- -      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_AUTH|DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GETPARAM, nouveau_abi16_ioctl_getparam, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_SETPARAM, drm_invalid_op, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_ALLOC, nouveau_abi16_ioctl_channel_alloc, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_FREE, nouveau_abi16_ioctl_channel_free, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_RENDER_ALLOW),
+ +      DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_RENDER_ALLOW),
   };
   
   long
@@@ -1106,7 -1106,7 +1107,7 @@@ nouveau_driver_fops = 
   static struct drm_driver
   driver_stub = {
         .driver_features =
- -              DRIVER_GEM | DRIVER_MODESET | DRIVER_PRIME | DRIVER_RENDER
+ +              DRIVER_GEM | DRIVER_MODESET | DRIVER_RENDER
   #if defined(CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT)
                 | DRIVER_KMS_LEGACY_CONTEXT
   #endif
@@@ -1131,7 -1131,10 +1132,7 @@@
   
         .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
         .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
- -      .gem_prime_export = drm_gem_prime_export,
- -      .gem_prime_import = drm_gem_prime_import,
         .gem_prime_pin = nouveau_gem_prime_pin,
- -      .gem_prime_res_obj = nouveau_gem_prime_res_obj,
         .gem_prime_unpin = nouveau_gem_prime_unpin,
         .gem_prime_get_sg_table = nouveau_gem_prime_get_sg_table,
         .gem_prime_import_sg_table = nouveau_gem_prime_import_sg_table,
@@@ -1290,6 -1293,8 +1291,8 @@@ nouveau_drm_exit(void
   #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER
         platform_driver_unregister(&nouveau_platform_driver);
   #endif
+       if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM))
+               mmu_notifier_synchronize();
   }
   
   module_init(nouveau_drm_init);
diff --combined drivers/gpu/drm/radeon/radeon.h

index 05b88491ccb9962ccbfb251ba9b8cdc8ed4faf4f,918164f90b114acbfe36256382ed9d8ae4ee209e..d59b004f6695831b4e6c9e71a180bde7a65988eb
--- 1/drivers/gpu/drm/radeon/radeon.h
--- 2/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@@ -505,6 -505,7 +505,6 @@@ struct radeon_bo 
         struct list_head                va;
         /* Constant after initialization */
         struct radeon_device            *rdev;
- -      struct drm_gem_object           gem_base;
   
         struct ttm_bo_kmap_obj          dma_buf_vmap;
         pid_t                           pid;
@@@ -512,7 -513,7 +512,7 @@@
         struct radeon_mn                *mn;
         struct list_head                mn_list;
   };
- -#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, gem_base)
+ +#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, tbo.base)
   
   int radeon_gem_debugfs_init(struct radeon_device *rdev);
   
@@@ -619,7 -620,7 +619,7 @@@ void radeon_sync_fence(struct radeon_sy
                        struct radeon_fence *fence);
   int radeon_sync_resv(struct radeon_device *rdev,
                      struct radeon_sync *sync,
- -                   struct reservation_object *resv,
+ +                   struct dma_resv *resv,
                      bool shared);
   int radeon_sync_rings(struct radeon_device *rdev,
                       struct radeon_sync *sync,
@@@ -1912,20 -1913,20 +1912,20 @@@ struct radeon_asic 
                                              uint64_t src_offset,
                                              uint64_t dst_offset,
                                              unsigned num_gpu_pages,
- -                                           struct reservation_object *resv);
+ +                                           struct dma_resv *resv);
                 u32 blit_ring_index;
                 struct radeon_fence *(*dma)(struct radeon_device *rdev,
                                             uint64_t src_offset,
                                             uint64_t dst_offset,
                                             unsigned num_gpu_pages,
- -                                          struct reservation_object *resv);
+ +                                          struct dma_resv *resv);
                 u32 dma_ring_index;
                 /* method used for bo copy */
                 struct radeon_fence *(*copy)(struct radeon_device *rdev,
                                              uint64_t src_offset,
                                              uint64_t dst_offset,
                                              unsigned num_gpu_pages,
- -                                           struct reservation_object *resv);
+ +                                           struct dma_resv *resv);
                 /* ring used for bo copies */
                 u32 copy_ring_index;
         } copy;
@@@ -2386,6 -2387,7 +2386,6 @@@ struct radeon_device 
         struct radeon_wb                wb;
         struct radeon_dummy_page        dummy_page;
         bool                            shutdown;
- -      bool                            need_dma32;
         bool                            need_swiotlb;
         bool                            accel_working;
         bool                            fastfb_working; /* IGP feature*/
@@@ -2449,9 -2451,6 +2449,6 @@@
         /* tracking pinned memory */
         u64 vram_pin_size;
         u64 gart_pin_size;
- 
-       struct mutex    mn_lock;
-       DECLARE_HASHTABLE(mn_hash, 7);
   };
   
   bool radeon_is_px(struct drm_device *dev);
diff --combined drivers/gpu/drm/radeon/radeon_device.c

index 88eb7cb522bb7dedbad56a28f2e6176d72e31e93,788b1d8a80e660392f7b55502e21700fad618b53..5d017f0aec665c9996435b2a211e83596e065757
--- 1/drivers/gpu/drm/radeon/radeon_device.c
--- 2/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@@ -1325,8 -1325,6 +1325,6 @@@ int radeon_device_init(struct radeon_de
         init_rwsem(&rdev->pm.mclk_lock);
         init_rwsem(&rdev->exclusive_lock);
         init_waitqueue_head(&rdev->irq.vblank_queue);
-       mutex_init(&rdev->mn_lock);
-       hash_init(rdev->mn_hash);
         r = radeon_gem_init(rdev);
         if (r)
                 return r;
@@@ -1365,27 -1363,34 +1363,27 @@@
         else
                 rdev->mc.mc_mask = 0xffffffffULL; /* 32 bit MC */
   
- -      /* set DMA mask + need_dma32 flags.
+ +      /* set DMA mask.
          * PCIE - can handle 40-bits.
          * IGP - can handle 40-bits
          * AGP - generally dma32 is safest
          * PCI - dma32 for legacy pci gart, 40 bits on newer asics
          */
- -      rdev->need_dma32 = false;
+ +      dma_bits = 40;
         if (rdev->flags & RADEON_IS_AGP)
- -              rdev->need_dma32 = true;
+ +              dma_bits = 32;
         if ((rdev->flags & RADEON_IS_PCI) &&
             (rdev->family <= CHIP_RS740))
- -              rdev->need_dma32 = true;
+ +              dma_bits = 32;
   #ifdef CONFIG_PPC64
         if (rdev->family == CHIP_CEDAR)
- -              rdev->need_dma32 = true;
+ +              dma_bits = 32;
   #endif
   
- -      dma_bits = rdev->need_dma32 ? 32 : 40;
- -      r = pci_set_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
+ +      r = dma_set_mask_and_coherent(&rdev->pdev->dev, DMA_BIT_MASK(dma_bits));
         if (r) {
- -              rdev->need_dma32 = true;
- -              dma_bits = 32;
                 pr_warn("radeon: No suitable DMA available\n");
- -      }
- -      r = pci_set_consistent_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
- -      if (r) {
- -              pci_set_consistent_dma_mask(rdev->pdev, DMA_BIT_MASK(32));
- -              pr_warn("radeon: No coherent DMA available\n");
+ +              return r;
         }
         rdev->need_swiotlb = drm_need_swiotlb(dma_bits);
   
diff --combined drivers/gpu/drm/radeon/radeon_drv.c

index 5838162f687fe33a8fdeb2611398e55800ec8f3d,b6535ac91fdb7458ef1f8cef6a6288a7481dbfa2..431e6b64b77db6914dedbe24f122e72eaddf9de9
--- 1/drivers/gpu/drm/radeon/radeon_drv.c
--- 2/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@@ -35,6 -35,7 +35,7 @@@
   #include <linux/module.h>
   #include <linux/pm_runtime.h>
   #include <linux/vga_switcheroo.h>
+ #include <linux/mmu_notifier.h>
   
   #include <drm/drm_crtc_helper.h>
   #include <drm/drm_drv.h>
@@@ -130,7 -131,8 +131,7 @@@ int radeon_gem_object_open(struct drm_g
                                 struct drm_file *file_priv);
   void radeon_gem_object_close(struct drm_gem_object *obj,
                                 struct drm_file *file_priv);
- -struct dma_buf *radeon_gem_prime_export(struct drm_device *dev,
- -                                      struct drm_gem_object *gobj,
+ +struct dma_buf *radeon_gem_prime_export(struct drm_gem_object *gobj,
                                         int flags);
   extern int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int crtc,
                                       unsigned int flags, int *vpos, int *hpos,
@@@ -152,6 -154,7 +153,6 @@@ struct drm_gem_object *radeon_gem_prime
                                                         struct sg_table *sg);
   int radeon_gem_prime_pin(struct drm_gem_object *obj);
   void radeon_gem_prime_unpin(struct drm_gem_object *obj);
- -struct reservation_object *radeon_gem_prime_res_obj(struct drm_gem_object *);
   void *radeon_gem_prime_vmap(struct drm_gem_object *obj);
   void radeon_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr);
   
@@@ -347,30 -350,24 +348,30 @@@ radeon_pci_remove(struct pci_dev *pdev
   static void
   radeon_pci_shutdown(struct pci_dev *pdev)
   {
+ +      struct drm_device *ddev = pci_get_drvdata(pdev);
+ +
         /* if we are running in a VM, make sure the device
          * torn down properly on reboot/shutdown
          */
         if (radeon_device_is_virtual())
                 radeon_pci_remove(pdev);
+ +
+ +      /* Some adapters need to be suspended before a
+ +      * shutdown occurs in order to prevent an error
+ +      * during kexec.
+ +      */
+ +      radeon_suspend_kms(ddev, true, true, false);
   }
   
   static int radeon_pmops_suspend(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
         return radeon_suspend_kms(drm_dev, true, true, false);
   }
   
   static int radeon_pmops_resume(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
   
         /* GPU comes up enabled by the bios on resume */
         if (radeon_is_px(drm_dev)) {
@@@ -384,13 -381,15 +385,13 @@@
   
   static int radeon_pmops_freeze(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
         return radeon_suspend_kms(drm_dev, false, true, true);
   }
   
   static int radeon_pmops_thaw(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
         return radeon_resume_kms(drm_dev, false, true);
   }
   
@@@ -449,7 -448,8 +450,7 @@@ static int radeon_pmops_runtime_resume(
   
   static int radeon_pmops_runtime_idle(struct device *dev)
   {
- -      struct pci_dev *pdev = to_pci_dev(dev);
- -      struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ +      struct drm_device *drm_dev = dev_get_drvdata(dev);
         struct drm_crtc *crtc;
   
         if (!radeon_is_px(drm_dev)) {
@@@ -540,7 -540,7 +541,7 @@@ radeon_get_crtc_scanout_position(struc
   
   static struct drm_driver kms_driver = {
         .driver_features =
- -          DRIVER_USE_AGP | DRIVER_GEM | DRIVER_PRIME | DRIVER_RENDER,
+ +          DRIVER_USE_AGP | DRIVER_GEM | DRIVER_RENDER,
         .load = radeon_driver_load_kms,
         .open = radeon_driver_open_kms,
         .postclose = radeon_driver_postclose_kms,
@@@ -566,8 -566,10 +567,8 @@@
         .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
         .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
         .gem_prime_export = radeon_gem_prime_export,
- -      .gem_prime_import = drm_gem_prime_import,
         .gem_prime_pin = radeon_gem_prime_pin,
         .gem_prime_unpin = radeon_gem_prime_unpin,
- -      .gem_prime_res_obj = radeon_gem_prime_res_obj,
         .gem_prime_get_sg_table = radeon_gem_prime_get_sg_table,
         .gem_prime_import_sg_table = radeon_gem_prime_import_sg_table,
         .gem_prime_vmap = radeon_gem_prime_vmap,
@@@ -623,6 -625,7 +624,7 @@@ static void __exit radeon_exit(void
   {
         pci_unregister_driver(pdriver);
         radeon_unregister_atpx_handler();
+       mmu_notifier_synchronize();
   }
   
   module_init(radeon_init);
diff --combined drivers/gpu/drm/radeon/radeon_mn.c

index 6902f998ede9068acbe859aa1d5552abac64a294,1ee20d528a7c2465f9a4c3f4aa523ad5433b78d0..dbab9a3a969b9e3a49bffa49b2bd127df7d34b5e
--- 1/drivers/gpu/drm/radeon/radeon_mn.c
--- 2/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@@ -37,17 -37,8 +37,8 @@@
   #include "radeon.h"
   
   struct radeon_mn {
-       /* constant after initialisation */
-       struct radeon_device    *rdev;
-       struct mm_struct        *mm;
         struct mmu_notifier     mn;
   
-       /* only used on destruction */
-       struct work_struct      work;
- 
-       /* protected by rdev->mn_lock */
-       struct hlist_node       node;
- 
         /* objects protected by lock */
         struct mutex            lock;
         struct rb_root_cached   objects;
@@@ -58,55 -49,6 +49,6 @@@ struct radeon_mn_node 
         struct list_head                bos;
   };
   
- /**
-  * radeon_mn_destroy - destroy the rmn
-  *
-  * @work: previously sheduled work item
-  *
-  * Lazy destroys the notifier from a work item
-  */
- static void radeon_mn_destroy(struct work_struct *work)
- {
-       struct radeon_mn *rmn = container_of(work, struct radeon_mn, work);
-       struct radeon_device *rdev = rmn->rdev;
-       struct radeon_mn_node *node, *next_node;
-       struct radeon_bo *bo, *next_bo;
- 
-       mutex_lock(&rdev->mn_lock);
-       mutex_lock(&rmn->lock);
-       hash_del(&rmn->node);
-       rbtree_postorder_for_each_entry_safe(node, next_node,
-                                            &rmn->objects.rb_root, it.rb) {
- 
-               interval_tree_remove(&node->it, &rmn->objects);
-               list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
-                       bo->mn = NULL;
-                       list_del_init(&bo->mn_list);
-               }
-               kfree(node);
-       }
-       mutex_unlock(&rmn->lock);
-       mutex_unlock(&rdev->mn_lock);
-       mmu_notifier_unregister(&rmn->mn, rmn->mm);
-       kfree(rmn);
- }
- 
- /**
-  * radeon_mn_release - callback to notify about mm destruction
-  *
-  * @mn: our notifier
-  * @mn: the mm this callback is about
-  *
-  * Shedule a work item to lazy destroy our notifier.
-  */
- static void radeon_mn_release(struct mmu_notifier *mn,
-                             struct mm_struct *mm)
- {
-       struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
-       INIT_WORK(&rmn->work, radeon_mn_destroy);
-       schedule_work(&rmn->work);
- }
- 
   /**
    * radeon_mn_invalidate_range_start - callback to notify about mm change
    *
@@@ -163,7 -105,7 +105,7 @@@ static int radeon_mn_invalidate_range_s
                                 continue;
                         }
   
- -                      r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
+ +                      r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
                                 true, false, MAX_SCHEDULE_TIMEOUT);
                         if (r <= 0)
                                 DRM_ERROR("(%ld) failed to wait for user bo\n", r);
@@@ -183,65 -125,44 +125,44 @@@ out_unlock
         return ret;
   }
   
- static const struct mmu_notifier_ops radeon_mn_ops = {
-       .release = radeon_mn_release,
-       .invalidate_range_start = radeon_mn_invalidate_range_start,
- };
+ static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+       struct mmu_notifier_range range = {
+               .mm = mm,
+               .start = 0,
+               .end = ULONG_MAX,
+               .flags = 0,
+               .event = MMU_NOTIFY_UNMAP,
+       };
+ 
+       radeon_mn_invalidate_range_start(mn, &range);
+ }
   
- /**
-  * radeon_mn_get - create notifier context
-  *
-  * @rdev: radeon device pointer
-  *
-  * Creates a notifier context for current->mm.
-  */
- static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
+ static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
   {
-       struct mm_struct *mm = current->mm;
         struct radeon_mn *rmn;
-       int r;
- 
-       if (down_write_killable(&mm->mmap_sem))
-               return ERR_PTR(-EINTR);
- 
-       mutex_lock(&rdev->mn_lock);
- 
-       hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm)
-               if (rmn->mm == mm)
-                       goto release_locks;
   
         rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
-       if (!rmn) {
-               rmn = ERR_PTR(-ENOMEM);
-               goto release_locks;
-       }
+       if (!rmn)
+               return ERR_PTR(-ENOMEM);
   
-       rmn->rdev = rdev;
-       rmn->mm = mm;
-       rmn->mn.ops = &radeon_mn_ops;
         mutex_init(&rmn->lock);
         rmn->objects = RB_ROOT_CACHED;
-       
-       r = __mmu_notifier_register(&rmn->mn, mm);
-       if (r)
-               goto free_rmn;
- 
-       hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm);
- 
- release_locks:
-       mutex_unlock(&rdev->mn_lock);
-       up_write(&mm->mmap_sem);
- 
-       return rmn;
- 
- free_rmn:
-       mutex_unlock(&rdev->mn_lock);
-       up_write(&mm->mmap_sem);
-       kfree(rmn);
+       return &rmn->mn;
+ }
   
-       return ERR_PTR(r);
+ static void radeon_mn_free_notifier(struct mmu_notifier *mn)
+ {
+       kfree(container_of(mn, struct radeon_mn, mn));
   }
   
+ static const struct mmu_notifier_ops radeon_mn_ops = {
+       .release = radeon_mn_release,
+       .invalidate_range_start = radeon_mn_invalidate_range_start,
+       .alloc_notifier = radeon_mn_alloc_notifier,
+       .free_notifier = radeon_mn_free_notifier,
+ };
+ 
   /**
    * radeon_mn_register - register a BO for notifier updates
    *
@@@ -254,15 -175,16 +175,16 @@@
   int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
   {
         unsigned long end = addr + radeon_bo_size(bo) - 1;
-       struct radeon_device *rdev = bo->rdev;
+       struct mmu_notifier *mn;
         struct radeon_mn *rmn;
         struct radeon_mn_node *node = NULL;
         struct list_head bos;
         struct interval_tree_node *it;
   
-       rmn = radeon_mn_get(rdev);
-       if (IS_ERR(rmn))
-               return PTR_ERR(rmn);
+       mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
+       if (IS_ERR(mn))
+               return PTR_ERR(mn);
+       rmn = container_of(mn, struct radeon_mn, mn);
   
         INIT_LIST_HEAD(&bos);
   
@@@ -309,22 -231,16 +231,16 @@@
    */
   void radeon_mn_unregister(struct radeon_bo *bo)
   {
-       struct radeon_device *rdev = bo->rdev;
-       struct radeon_mn *rmn;
+       struct radeon_mn *rmn = bo->mn;
         struct list_head *head;
   
-       mutex_lock(&rdev->mn_lock);
-       rmn = bo->mn;
-       if (rmn == NULL) {
-               mutex_unlock(&rdev->mn_lock);
+       if (!rmn)
                 return;
-       }
   
         mutex_lock(&rmn->lock);
         /* save the next list entry for later */
         head = bo->mn_list.next;
   
-       bo->mn = NULL;
         list_del(&bo->mn_list);
   
         if (list_empty(head)) {
@@@ -335,5 -251,7 +251,7 @@@
         }
   
         mutex_unlock(&rmn->lock);
-       mutex_unlock(&rdev->mn_lock);
+ 
+       mmu_notifier_put(&rmn->mn);
+       bo->mn = NULL;
   }
diff --combined drivers/infiniband/hw/mlx5/main.c

index 4e9f1507ffd9409dd42701ec4962cd0cb417ee62,4ba73a95475a98791f97e6c6087d80eba456cd42..bface798ee590705f342e9046d85c2b526c15e52
--- 1/drivers/infiniband/hw/mlx5/main.c
--- 2/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@@ -1023,7 -1023,7 +1023,7 @@@ static int mlx5_ib_query_device(struct 
         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
   
         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- -              if (MLX5_CAP_GEN(mdev, pg))
+ +              if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
                         props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
                 props->odp_caps = dev->odp_caps;
         }
@@@ -1867,10 -1867,6 +1867,6 @@@ static int mlx5_ib_alloc_ucontext(struc
         if (err)
                 goto out_sys_pages;
   
-       if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
-               context->ibucontext.invalidate_range =
-                       &mlx5_ib_invalidate_range;
- 
         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
                 err = mlx5_ib_devx_create(dev, true);
                 if (err < 0)
@@@ -1999,11 -1995,6 +1995,6 @@@ static void mlx5_ib_dealloc_ucontext(st
         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
         struct mlx5_bfreg_info *bfregi;
   
-       /* All umem's must be destroyed before destroying the ucontext. */
-       mutex_lock(&ibcontext->per_mm_list_lock);
-       WARN_ON(!list_empty(&ibcontext->per_mm_list));
-       mutex_unlock(&ibcontext->per_mm_list_lock);
- 
         bfregi = &context->bfregi;
         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
   
@@@ -2280,7 -2271,6 +2271,7 @@@ static inline int check_dm_type_support
                         return -EOPNOTSUPP;
                 break;
         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ +      case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
                 if (!capable(CAP_SYS_RAWIO) ||
                     !capable(CAP_NET_RAW))
                         return -EPERM;
@@@ -2345,20 -2335,20 +2336,20 @@@ static int handle_alloc_dm_sw_icm(struc
                                   struct uverbs_attr_bundle *attrs,
                                   int type)
   {
- -      struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
+ +      struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
         u64 act_size;
         int err;
   
         /* Allocation size must a multiple of the basic block size
          * and a power of 2.
          */
- -      act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev));
+ +      act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
         act_size = roundup_pow_of_two(act_size);
   
         dm->size = act_size;
- -      err = mlx5_cmd_alloc_sw_icm(dm_db, type, act_size,
- -                                  to_mucontext(ctx)->devx_uid, &dm->dev_addr,
- -                                  &dm->icm_dm.obj_id);
+ +      err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
+ +                                 to_mucontext(ctx)->devx_uid, &dm->dev_addr,
+ +                                 &dm->icm_dm.obj_id);
         if (err)
                 return err;
   
@@@ -2366,9 -2356,9 +2357,9 @@@
                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
                              &dm->dev_addr, sizeof(dm->dev_addr));
         if (err)
- -              mlx5_cmd_dealloc_sw_icm(dm_db, type, dm->size,
- -                                      to_mucontext(ctx)->devx_uid,
- -                                      dm->dev_addr, dm->icm_dm.obj_id);
+ +              mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
+ +                                     to_mucontext(ctx)->devx_uid, dm->dev_addr,
+ +                                     dm->icm_dm.obj_id);
   
         return err;
   }
@@@ -2408,14 -2398,8 +2399,14 @@@ struct ib_dm *mlx5_ib_alloc_dm(struct i
                                             attrs);
                 break;
         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ +              err = handle_alloc_dm_sw_icm(context, dm,
+ +                                           attr, attrs,
+ +                                           MLX5_SW_ICM_TYPE_STEERING);
+ +              break;
         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- -              err = handle_alloc_dm_sw_icm(context, dm, attr, attrs, type);
+ +              err = handle_alloc_dm_sw_icm(context, dm,
+ +                                           attr, attrs,
+ +                                           MLX5_SW_ICM_TYPE_HEADER_MODIFY);
                 break;
         default:
                 err = -EOPNOTSUPP;
@@@ -2435,7 -2419,6 +2426,7 @@@ int mlx5_ib_dealloc_dm(struct ib_dm *ib
   {
         struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
                 &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+ +      struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
         struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
         struct mlx5_ib_dm *dm = to_mdm(ibdm);
         u32 page_idx;
@@@ -2447,23 -2430,19 +2438,23 @@@
                 if (ret)
                         return ret;
   
- -              page_idx = (dm->dev_addr -
- -                          pci_resource_start(dm_db->dev->pdev, 0) -
- -                          MLX5_CAP64_DEV_MEM(dm_db->dev,
- -                                             memic_bar_start_addr)) >>
- -                         PAGE_SHIFT;
+ +              page_idx = (dm->dev_addr - pci_resource_start(dev->pdev, 0) -
+ +                          MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr)) >>
+ +                          PAGE_SHIFT;
                 bitmap_clear(ctx->dm_pages, page_idx,
                              DIV_ROUND_UP(dm->size, PAGE_SIZE));
                 break;
         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ +              ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
+ +                                           dm->size, ctx->devx_uid, dm->dev_addr,
+ +                                           dm->icm_dm.obj_id);
+ +              if (ret)
+ +                      return ret;
+ +              break;
         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- -              ret = mlx5_cmd_dealloc_sw_icm(dm_db, dm->type, dm->size,
- -                                            ctx->devx_uid, dm->dev_addr,
- -                                            dm->icm_dm.obj_id);
+ +              ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
+ +                                           dm->size, ctx->devx_uid, dm->dev_addr,
+ +                                           dm->icm_dm.obj_id);
                 if (ret)
                         return ret;
                 break;
@@@ -2658,8 -2637,7 +2649,8 @@@ int parse_flow_flow_action(struct mlx5_
                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
                                 return -EINVAL;
                         action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
- -                      action->modify_id = maction->flow_action_raw.action_id;
+ +                      action->modify_hdr =
+ +                              maction->flow_action_raw.modify_hdr;
                         return 0;
                 }
                 if (maction->flow_action_raw.sub_type ==
@@@ -2676,8 -2654,8 +2667,8 @@@
                                 return -EINVAL;
                         action->action |=
                                 MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
- -                      action->reformat_id =
- -                              maction->flow_action_raw.action_id;
+ +                      action->pkt_reformat =
+ +                              maction->flow_action_raw.pkt_reformat;
                         return 0;
                 }
                 /* fall through */
@@@ -6109,6 -6087,8 +6100,6 @@@ static struct ib_counters *mlx5_ib_crea
   
   static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
   {
- -      struct mlx5_core_dev *mdev = dev->mdev;
- -
         mlx5_ib_cleanup_multiport_master(dev);
         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                 srcu_barrier(&dev->mr_srcu);
@@@ -6116,11 -6096,29 +6107,11 @@@
         }
   
         WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
- -
- -      WARN_ON(dev->dm.steering_sw_icm_alloc_blocks &&
- -              !bitmap_empty(
- -                      dev->dm.steering_sw_icm_alloc_blocks,
- -                      BIT(MLX5_CAP_DEV_MEM(mdev, log_steering_sw_icm_size) -
- -                          MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
- -
- -      kfree(dev->dm.steering_sw_icm_alloc_blocks);
- -
- -      WARN_ON(dev->dm.header_modify_sw_icm_alloc_blocks &&
- -              !bitmap_empty(dev->dm.header_modify_sw_icm_alloc_blocks,
- -                            BIT(MLX5_CAP_DEV_MEM(
- -                                        mdev, log_header_modify_sw_icm_size) -
- -                                MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
- -
- -      kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
   }
   
   static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
   {
         struct mlx5_core_dev *mdev = dev->mdev;
- -      u64 header_modify_icm_blocks = 0;
- -      u64 steering_icm_blocks = 0;
         int err;
         int i;
   
@@@ -6132,8 -6130,6 +6123,8 @@@
                 dev->port[i].roce.last_port_state = IB_PORT_DOWN;
         }
   
+ +      mlx5_ib_internal_fill_odp_caps(dev);
+ +
         err = mlx5_ib_init_multiport_master(dev);
         if (err)
                 return err;
@@@ -6167,17 -6163,51 +6158,17 @@@
         INIT_LIST_HEAD(&dev->qp_list);
         spin_lock_init(&dev->reset_flow_resource_lock);
   
- -      if (MLX5_CAP_GEN_64(mdev, general_obj_types) &
- -          MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) {
- -              if (MLX5_CAP64_DEV_MEM(mdev, steering_sw_icm_start_address)) {
- -                      steering_icm_blocks =
- -                              BIT(MLX5_CAP_DEV_MEM(mdev,
- -                                                   log_steering_sw_icm_size) -
- -                                  MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
- -
- -                      dev->dm.steering_sw_icm_alloc_blocks =
- -                              kcalloc(BITS_TO_LONGS(steering_icm_blocks),
- -                                      sizeof(unsigned long), GFP_KERNEL);
- -                      if (!dev->dm.steering_sw_icm_alloc_blocks)
- -                              goto err_mp;
- -              }
- -
- -              if (MLX5_CAP64_DEV_MEM(mdev,
- -                                     header_modify_sw_icm_start_address)) {
- -                      header_modify_icm_blocks = BIT(
- -                              MLX5_CAP_DEV_MEM(
- -                                      mdev, log_header_modify_sw_icm_size) -
- -                              MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
- -
- -                      dev->dm.header_modify_sw_icm_alloc_blocks =
- -                              kcalloc(BITS_TO_LONGS(header_modify_icm_blocks),
- -                                      sizeof(unsigned long), GFP_KERNEL);
- -                      if (!dev->dm.header_modify_sw_icm_alloc_blocks)
- -                              goto err_dm;
- -              }
- -      }
- -
         spin_lock_init(&dev->dm.lock);
         dev->dm.dev = mdev;
   
         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                 err = init_srcu_struct(&dev->mr_srcu);
                 if (err)
- -                      goto err_dm;
+ +                      goto err_mp;
         }
   
         return 0;
   
- -err_dm:
- -      kfree(dev->dm.steering_sw_icm_alloc_blocks);
- -      kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
- -
   err_mp:
         mlx5_ib_cleanup_multiport_master(dev);
   
@@@ -6524,6 -6554,8 +6515,6 @@@ static void mlx5_ib_stage_dev_res_clean
   
   static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
   {
- -      mlx5_ib_internal_fill_odp_caps(dev);
- -
         return mlx5_ib_odp_init_one(dev);
   }
   
diff --combined drivers/infiniband/hw/mlx5/mr.c

index 3401f5f6792e6bb79e472f6411eade5c2d1e2dab,b7da619614e4511f3d6ec43dd9213d95530178b3..1eff031ef04842f06ab4d088b04a610b9388aa1b
--- 1/drivers/infiniband/hw/mlx5/mr.c
--- 2/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@@ -784,19 -784,37 +784,37 @@@ static int mr_umem_get(struct mlx5_ib_d
                        int *ncont, int *order)
   {
         struct ib_umem *u;
-       int err;
   
         *umem = NULL;
   
-       u = ib_umem_get(udata, start, length, access_flags, 0);
-       err = PTR_ERR_OR_ZERO(u);
-       if (err) {
-               mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
-               return err;
+       if (access_flags & IB_ACCESS_ON_DEMAND) {
+               struct ib_umem_odp *odp;
+ 
+               odp = ib_umem_odp_get(udata, start, length, access_flags);
+               if (IS_ERR(odp)) {
+                       mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
+                                   PTR_ERR(odp));
+                       return PTR_ERR(odp);
+               }
+ 
+               u = &odp->umem;
+ 
+               *page_shift = odp->page_shift;
+               *ncont = ib_umem_odp_num_pages(odp);
+               *npages = *ncont << (*page_shift - PAGE_SHIFT);
+               if (order)
+                       *order = ilog2(roundup_pow_of_two(*ncont));
+       } else {
+               u = ib_umem_get(udata, start, length, access_flags, 0);
+               if (IS_ERR(u)) {
+                       mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
+                       return PTR_ERR(u);
+               }
+ 
+               mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
+                                  page_shift, ncont, order);
         }
   
-       mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
-                          page_shift, ncont, order);
         if (!*npages) {
                 mlx5_ib_warn(dev, "avoid zero region\n");
                 ib_umem_release(u);
@@@ -1293,7 -1311,9 +1311,7 @@@ struct ib_mr *mlx5_ib_reg_user_mr(struc
         if (err < 0)
                 return ERR_PTR(err);
   
- -      use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) &&
- -                (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) ||
- -                 !MLX5_CAP_GEN(dev->mdev, atomic));
+ +      use_umr = mlx5_ib_can_use_umr(dev, true);
   
         if (order <= mr_cache_max_order(dev) && use_umr) {
                 mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
@@@ -1446,8 -1466,7 +1464,8 @@@ int mlx5_ib_rereg_user_mr(struct ib_mr 
                         goto err;
         }
   
- -      if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
+ +      if (!mlx5_ib_can_use_umr(dev, true) ||
+ +          (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
                 /*
                  * UMR can't be used - MKey needs to be replaced.
                  */
@@@ -1599,7 -1618,7 +1617,7 @@@ static void dereg_mr(struct mlx5_ib_de
                 /* Wait for all running page-fault handlers to finish. */
                 synchronize_srcu(&dev->mr_srcu);
                 /* Destroy all page mappings */
-               if (umem_odp->page_list)
+               if (!umem_odp->is_implicit_odp)
                         mlx5_ib_invalidate_range(umem_odp,
                                                  ib_umem_start(umem_odp),
                                                  ib_umem_end(umem_odp));
@@@ -1610,7 -1629,7 +1628,7 @@@
                  * so that there will not be any invalidations in
                  * flight, looking at the *mr struct.
                  */
-               ib_umem_release(umem);
+               ib_umem_odp_release(umem_odp);
                 atomic_sub(npages, &dev->mdev->priv.reg_pages);
   
                 /* Avoid double-freeing the umem. */
diff --combined drivers/infiniband/hw/mlx5/odp.c

index 0a59912a4cef640067472319da83d7123f6eb5d1,762038ab83e290b2860cb45b324bdf0f40e6e0b8..dd26e7acb37e4b331890d7ab929389b0d76e3fe7
--- 1/drivers/infiniband/hw/mlx5/odp.c
--- 2/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@@ -184,7 -184,7 +184,7 @@@ void mlx5_odp_populate_klm(struct mlx5_
         for (i = 0; i < nentries; i++, pklm++) {
                 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
                 va = (offset + i) * MLX5_IMR_MTT_SIZE;
-               if (odp && odp->umem.address == va) {
+               if (odp && ib_umem_start(odp) == va) {
                         struct mlx5_ib_mr *mtt = odp->private;
   
                         pklm->key = cpu_to_be32(mtt->ibmr.lkey);
@@@ -206,7 -206,7 +206,7 @@@ static void mr_leaf_free_action(struct 
         mr->parent = NULL;
         synchronize_srcu(&mr->dev->mr_srcu);
   
-       ib_umem_release(&odp->umem);
+       ib_umem_odp_release(odp);
         if (imr->live)
                 mlx5_ib_update_xlt(imr, idx, 1, 0,
                                    MLX5_IB_UPD_XLT_INDIRECT |
@@@ -301,8 -301,7 +301,8 @@@ void mlx5_ib_internal_fill_odp_caps(str
   
         memset(caps, 0, sizeof(*caps));
   
- -      if (!MLX5_CAP_GEN(dev->mdev, pg))
+ +      if (!MLX5_CAP_GEN(dev->mdev, pg) ||
+ +          !mlx5_ib_can_use_umr(dev, true))
                 return;
   
         caps->general_caps = IB_ODP_SUPPORT;
@@@ -356,8 -355,7 +356,8 @@@
   
         if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
             MLX5_CAP_GEN(dev->mdev, null_mkey) &&
- -          MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+ +          MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+ +          !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
                 caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
   
         return;
@@@ -386,7 -384,7 +386,7 @@@ static void mlx5_ib_page_fault_resume(s
   }
   
   static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
-                                           struct ib_umem *umem,
+                                           struct ib_umem_odp *umem_odp,
                                             bool ksm, int access_flags)
   {
         struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@@ -404,7 -402,7 +404,7 @@@
         mr->dev = dev;
         mr->access_flags = access_flags;
         mr->mmkey.iova = 0;
-       mr->umem = umem;
+       mr->umem = &umem_odp->umem;
   
         if (ksm) {
                 err = mlx5_ib_update_xlt(mr, 0,
@@@ -464,18 -462,17 +464,17 @@@ next_mr
                 if (nentries)
                         nentries++;
         } else {
-               odp = ib_alloc_odp_umem(odp_mr, addr,
-                                       MLX5_IMR_MTT_SIZE);
+               odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
                 if (IS_ERR(odp)) {
                         mutex_unlock(&odp_mr->umem_mutex);
                         return ERR_CAST(odp);
                 }
   
-               mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
+               mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
                                         mr->access_flags);
                 if (IS_ERR(mtt)) {
                         mutex_unlock(&odp_mr->umem_mutex);
-                       ib_umem_release(&odp->umem);
+                       ib_umem_odp_release(odp);
                         return ERR_CAST(mtt);
                 }
   
@@@ -497,7 -494,7 +496,7 @@@
         addr += MLX5_IMR_MTT_SIZE;
         if (unlikely(addr < io_virt + bcnt)) {
                 odp = odp_next(odp);
-               if (odp && odp->umem.address != addr)
+               if (odp && ib_umem_start(odp) != addr)
                         odp = NULL;
                 goto next_mr;
         }
@@@ -521,19 -518,19 +520,19 @@@ struct mlx5_ib_mr *mlx5_ib_alloc_implic
                                              int access_flags)
   {
         struct mlx5_ib_mr *imr;
-       struct ib_umem *umem;
+       struct ib_umem_odp *umem_odp;
   
-       umem = ib_umem_get(udata, 0, 0, access_flags, 0);
-       if (IS_ERR(umem))
-               return ERR_CAST(umem);
+       umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+       if (IS_ERR(umem_odp))
+               return ERR_CAST(umem_odp);
   
-       imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+       imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
         if (IS_ERR(imr)) {
-               ib_umem_release(umem);
+               ib_umem_odp_release(umem_odp);
                 return ERR_CAST(imr);
         }
   
-       imr->umem = umem;
+       imr->umem = &umem_odp->umem;
         init_waitqueue_head(&imr->q_leaf_free);
         atomic_set(&imr->num_leaf_free, 0);
         atomic_set(&imr->num_pending_prefetch, 0);
@@@ -541,34 -538,31 +540,31 @@@
         return imr;
   }
   
- static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
-                       void *cookie)
+ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
   {
-       struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
- 
-       if (mr->parent != imr)
-               return 0;
- 
-       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-                                   ib_umem_end(umem_odp));
+       struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+       struct rb_node *node;
   
-       if (umem_odp->dying)
-               return 0;
+       down_read(&per_mm->umem_rwsem);
+       for (node = rb_first_cached(&per_mm->umem_tree); node;
+            node = rb_next(node)) {
+               struct ib_umem_odp *umem_odp =
+                       rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+               struct mlx5_ib_mr *mr = umem_odp->private;
   
-       WRITE_ONCE(umem_odp->dying, 1);
-       atomic_inc(&imr->num_leaf_free);
-       schedule_work(&umem_odp->work);
+               if (mr->parent != imr)
+                       continue;
   
-       return 0;
- }
+               ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+                                           ib_umem_end(umem_odp));
   
- void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
- {
-       struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+               if (umem_odp->dying)
+                       continue;
   
-       down_read(&per_mm->umem_rwsem);
-       rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
-                                     mr_leaf_free, true, imr);
+               WRITE_ONCE(umem_odp->dying, 1);
+               atomic_inc(&imr->num_leaf_free);
+               schedule_work(&umem_odp->work);
+       }
         up_read(&per_mm->umem_rwsem);
   
         wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
@@@ -589,7 -583,7 +585,7 @@@ static int pagefault_mr(struct mlx5_ib_
         struct ib_umem_odp *odp;
         size_t size;
   
-       if (!odp_mr->page_list) {
+       if (odp_mr->is_implicit_odp) {
                 odp = implicit_mr_get_data(mr, io_virt, bcnt);
   
                 if (IS_ERR(odp))
@@@ -607,7 -601,7 +603,7 @@@ next_mr
         start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
         access_mask = ODP_READ_ALLOWED_BIT;
   
-       if (prefetch && !downgrade && !mr->umem->writable) {
+       if (prefetch && !downgrade && !odp->umem.writable) {
                 /* prefetch with write-access must
                  * be supported by the MR
                  */
@@@ -615,7 -609,7 +611,7 @@@
                 goto out;
         }
   
-       if (mr->umem->writable && !downgrade)
+       if (odp->umem.writable && !downgrade)
                 access_mask |= ODP_WRITE_ALLOWED_BIT;
   
         current_seq = READ_ONCE(odp->notifiers_seq);
@@@ -625,8 -619,8 +621,8 @@@
          */
         smp_rmb();
   
-       ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
-                                       access_mask, current_seq);
+       ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
+                                       current_seq);
   
         if (ret < 0)
                 goto out;
@@@ -634,8 -628,7 +630,7 @@@
         np = ret;
   
         mutex_lock(&odp->umem_mutex);
-       if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
-                                       current_seq)) {
+       if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
                 /*
                  * No need to check whether the MTTs really belong to
                  * this MR, since ib_umem_odp_map_dma_pages already
@@@ -668,7 -661,7 +663,7 @@@
   
                 io_virt += size;
                 next = odp_next(odp);
-               if (unlikely(!next || next->umem.address != io_virt)) {
+               if (unlikely(!next || ib_umem_start(next) != io_virt)) {
                         mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
                                     io_virt, next);
                         return -EAGAIN;
@@@ -1618,16 -1611,15 +1613,17 @@@ void mlx5_odp_init_mr_cache_entry(struc
   
   static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
         .advise_mr = mlx5_ib_advise_mr,
+       .invalidate_range = mlx5_ib_invalidate_range,
   };
   
   int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
   {
         int ret = 0;
   
- -      if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
- -              ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
+ +      if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+ +              return ret;
+ +
+ +      ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
   
         if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
                 ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
@@@ -1637,6 -1629,9 +1633,6 @@@
                 }
         }
   
- -      if (!MLX5_CAP_GEN(dev->mdev, pg))
- -              return ret;
- -
         ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
   
         return ret;
@@@ -1644,7 -1639,7 +1640,7 @@@
   
   void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
   {
- -      if (!MLX5_CAP_GEN(dev->mdev, pg))
+ +      if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
                 return;
   
         mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
diff --combined include/linux/sched.h

index b75b282870053e083a1f79ae9b55cba6a95caa74,c5630f3dca1fafac7cfde1ed7051f0b61314ff7e..70db597d6fd4f2ceb53b1a71c0db243e681d7070
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -25,11 -25,9 +25,11 @@@
   #include <linux/resource.h>
   #include <linux/latencytop.h>
   #include <linux/sched/prio.h>
+ +#include <linux/sched/types.h>
   #include <linux/signal_types.h>
   #include <linux/mm_types_task.h>
   #include <linux/task_io_accounting.h>
+ +#include <linux/posix-timers.h>
   #include <linux/rseq.h>
   
   /* task_struct member predeclarations (sorted alphabetically): */
@@@ -246,6 -244,27 +246,6 @@@ struct prev_cputime 
   #endif
   };
   
- -/**
- - * struct task_cputime - collected CPU time counts
- - * @utime:            time spent in user mode, in nanoseconds
- - * @stime:            time spent in kernel mode, in nanoseconds
- - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
- - *
- - * This structure groups together three kinds of CPU time that are tracked for
- - * threads and thread groups.  Most things considering CPU time want to group
- - * these counts together and treat all three of them in parallel.
- - */
- -struct task_cputime {
- -      u64                             utime;
- -      u64                             stime;
- -      unsigned long long              sum_exec_runtime;
- -};
- -
- -/* Alternate field names when used on cache expirations: */
- -#define virt_exp                      utime
- -#define prof_exp                      stime
- -#define sched_exp                     sum_exec_runtime
- -
   enum vtime_state {
         /* Task is sleeping or running in a CPU with VTIME inactive: */
         VTIME_INACTIVE = 0,
@@@ -276,11 -295,6 +276,11 @@@ enum uclamp_id 
         UCLAMP_CNT
   };
   
+ +#ifdef CONFIG_SMP
+ +extern struct root_domain def_root_domain;
+ +extern struct mutex sched_domains_mutex;
+ +#endif
+ +
   struct sched_info {
   #ifdef CONFIG_SCHED_INFO
         /* Cumulative counters: */
@@@ -862,8 -876,10 +862,8 @@@ struct task_struct 
         unsigned long                   min_flt;
         unsigned long                   maj_flt;
   
- -#ifdef CONFIG_POSIX_TIMERS
- -      struct task_cputime             cputime_expires;
- -      struct list_head                cpu_timers[3];
- -#endif
+ +      /* Empty if CONFIG_POSIX_CPUTIMERS=n */
+ +      struct posix_cputimers          posix_cputimers;
   
         /* Process credentials: */
   
@@@ -958,6 -974,10 +958,10 @@@
         struct mutex_waiter             *blocked_on;
   #endif
   
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+       int                             non_block_count;
+ #endif
+ 
   #ifdef CONFIG_TRACE_IRQFLAGS
         unsigned int                    irq_events;
         unsigned long                   hardirq_enable_ip;
@@@ -1751,7 -1771,7 +1755,7 @@@ static inline int test_tsk_need_resched
    * value indicates whether a reschedule was done in fact.
    * cond_resched_lock() will drop the spinlock before scheduling,
    */
- -#ifndef CONFIG_PREEMPT
+ +#ifndef CONFIG_PREEMPTION
   extern int _cond_resched(void);
   #else
   static inline int _cond_resched(void) { return 0; }
@@@ -1780,12 -1800,12 +1784,12 @@@ static inline void cond_resched_rcu(voi
   
   /*
    * Does a critical section need to be broken due to another
- - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
+ + * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
    * but a general need for low latency)
    */
   static inline int spin_needbreak(spinlock_t *lock)
   {
- -#ifdef CONFIG_PREEMPT
+ +#ifdef CONFIG_PREEMPTION
         return spin_is_contended(lock);
   #else
         return 0;
diff --combined kernel/fork.c

index 53e780748fe3367973182edd594c3e27fc9108fd,92c8559d9745b10d00a790125ca3144198281650..5a0fd518e04e44dd6ec3080f6134fdecc952d653
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -768,7 -768,6 +768,7 @@@ static void set_max_threads(unsigned in
   int arch_task_struct_size __read_mostly;
   #endif
   
+ +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
   static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
   {
         /* Fetch thread_struct whitelist for the architecture. */
@@@ -783,7 -782,6 +783,7 @@@
         else
                 *offset += offsetof(struct task_struct, thread);
   }
+ +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
   
   void __init fork_init(void)
   {
@@@ -1009,7 -1007,6 +1009,6 @@@ static struct mm_struct *mm_init(struc
         mm_init_owner(mm, p);
         RCU_INIT_POINTER(mm->exe_file, NULL);
         mmu_notifier_mm_init(mm);
-       hmm_mm_init(mm);
         init_tlb_flush_pending(mm);
   #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
         mm->pmd_huge_pte = NULL;
@@@ -1519,17 -1516,28 +1518,17 @@@ void __cleanup_sighand(struct sighand_s
         }
   }
   
- -#ifdef CONFIG_POSIX_TIMERS
   /*
    * Initialize POSIX timer handling for a thread group.
    */
   static void posix_cpu_timers_init_group(struct signal_struct *sig)
   {
+ +      struct posix_cputimers *pct = &sig->posix_cputimers;
         unsigned long cpu_limit;
   
         cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
- -      if (cpu_limit != RLIM_INFINITY) {
- -              sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
- -              sig->cputimer.running = true;
- -      }
- -
- -      /* The timer lists. */
- -      INIT_LIST_HEAD(&sig->cpu_timers[0]);
- -      INIT_LIST_HEAD(&sig->cpu_timers[1]);
- -      INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ +      posix_cputimers_group_init(pct, cpu_limit);
   }
- -#else
- -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
- -#endif
   
   static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
   {
@@@ -1631,6 -1639,23 +1630,6 @@@ static void rt_mutex_init_task(struct t
   #endif
   }
   
- -#ifdef CONFIG_POSIX_TIMERS
- -/*
- - * Initialize POSIX timer handling for a single task.
- - */
- -static void posix_cpu_timers_init(struct task_struct *tsk)
- -{
- -      tsk->cputime_expires.prof_exp = 0;
- -      tsk->cputime_expires.virt_exp = 0;
- -      tsk->cputime_expires.sched_exp = 0;
- -      INIT_LIST_HEAD(&tsk->cpu_timers[0]);
- -      INIT_LIST_HEAD(&tsk->cpu_timers[1]);
- -      INIT_LIST_HEAD(&tsk->cpu_timers[2]);
- -}
- -#else
- -static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
- -#endif
- -
   static inline void init_task_pid_links(struct task_struct *task)
   {
         enum pid_type type;
@@@ -1664,14 -1689,6 +1663,14 @@@ static inline void rcu_copy_process(str
   #endif /* #ifdef CONFIG_TASKS_RCU */
   }
   
+ +struct pid *pidfd_pid(const struct file *file)
+ +{
+ +      if (file->f_op == &pidfd_fops)
+ +              return file->private_data;
+ +
+ +      return ERR_PTR(-EBADF);
+ +}
+ +
   static int pidfd_release(struct inode *inode, struct file *file)
   {
         struct pid *pid = file->private_data;
@@@ -1917,7 -1934,7 +1916,7 @@@ static __latent_entropy struct task_str
         task_io_accounting_init(&p->ioac);
         acct_clear_integrals(p);
   
- -      posix_cpu_timers_init(p);
+ +      posix_cputimers_init(&p->posix_cputimers);
   
         p->io_context = NULL;
         audit_set_context(p, NULL);
@@@ -2320,8 -2337,6 +2319,8 @@@ struct mm_struct *copy_init_mm(void
    *
    * It copies the process, and if successful kick-starts
    * it and waits for it to finish using the VM if required.
+ + *
+ + * args->exit_signal is expected to be checked for sanity by the caller.
    */
   long _do_fork(struct kernel_clone_args *args)
   {
@@@ -2546,14 -2561,6 +2545,14 @@@ noinline static int copy_clone_args_fro
         if (copy_from_user(&args, uargs, size))
                 return -EFAULT;
   
+ +      /*
+ +       * Verify that higher 32bits of exit_signal are unset and that
+ +       * it is a valid signal
+ +       */
+ +      if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+ +                   !valid_signal(args.exit_signal)))
+ +              return -EINVAL;
+ +
         *kargs = (struct kernel_clone_args){
                 .flags          = args.flags,
                 .pidfd          = u64_to_user_ptr(args.pidfd),
diff --combined kernel/sched/core.c

index 5e8387bdd09c65c9b804534afba93a654d39d8a3,57245770d6cc23ef2ab76b86e743a3d6dafe4746..f9a1346a5fa9502be6ca45ecb1bd822bf706725e
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -255,7 -255,7 +255,7 @@@ static void __hrtick_restart(struct rq 
   {
         struct hrtimer *timer = &rq->hrtick_timer;
   
- -      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ +      hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
   }
   
   /*
@@@ -314,7 -314,7 +314,7 @@@ void hrtick_start(struct rq *rq, u64 de
          */
         delay = max_t(u64, delay, 10000LL);
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- -                    HRTIMER_MODE_REL_PINNED);
+ +                    HRTIMER_MODE_REL_PINNED_HARD);
   }
   #endif /* CONFIG_SMP */
   
@@@ -328,7 -328,7 +328,7 @@@ static void hrtick_rq_init(struct rq *r
         rq->hrtick_csd.info = rq;
   #endif
   
- -      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ +      hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
         rq->hrtick_timer.function = hrtick;
   }
   #else /* CONFIG_SCHED_HRTICK */
@@@ -773,18 -773,6 +773,18 @@@ static void set_load_weight(struct task
   }
   
   #ifdef CONFIG_UCLAMP_TASK
+ +/*
+ + * Serializes updates of utilization clamp values
+ + *
+ + * The (slow-path) user-space triggers utilization clamp value updates which
+ + * can require updates on (fast-path) scheduler's data structures used to
+ + * support enqueue/dequeue operations.
+ + * While the per-CPU rq lock protects fast-path update operations, user-space
+ + * requests are serialized using a mutex to reduce the risk of conflicting
+ + * updates or API abuses.
+ + */
+ +static DEFINE_MUTEX(uclamp_mutex);
+ +
   /* Max allowed minimum utilization */
   unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
   
@@@ -810,7 -798,7 +810,7 @@@ static inline unsigned int uclamp_bucke
         return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
   }
   
- -static inline unsigned int uclamp_none(int clamp_id)
+ +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
   {
         if (clamp_id == UCLAMP_MIN)
                 return 0;
@@@ -826,7 -814,7 +826,7 @@@ static inline void uclamp_se_set(struc
   }
   
   static inline unsigned int
- -uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+ +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                   unsigned int clamp_value)
   {
         /*
@@@ -842,7 -830,7 +842,7 @@@
         return uclamp_none(UCLAMP_MIN);
   }
   
- -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+ +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                      unsigned int clamp_value)
   {
         /* Reset max-clamp retention only on idle exit */
@@@ -853,8 -841,8 +853,8 @@@
   }
   
   static inline
- -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
- -                               unsigned int clamp_value)
+ +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ +                                 unsigned int clamp_value)
   {
         struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
         int bucket_id = UCLAMP_BUCKETS - 1;
@@@ -873,42 -861,16 +873,42 @@@
         return uclamp_idle_value(rq, clamp_id, clamp_value);
   }
   
+ +static inline struct uclamp_se
+ +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+ +{
+ +      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      struct uclamp_se uc_max;
+ +
+ +      /*
+ +       * Tasks in autogroups or root task group will be
+ +       * restricted by system defaults.
+ +       */
+ +      if (task_group_is_autogroup(task_group(p)))
+ +              return uc_req;
+ +      if (task_group(p) == &root_task_group)
+ +              return uc_req;
+ +
+ +      uc_max = task_group(p)->uclamp[clamp_id];
+ +      if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ +              return uc_max;
+ +#endif
+ +
+ +      return uc_req;
+ +}
+ +
   /*
    * The effective clamp bucket index of a task depends on, by increasing
    * priority:
    * - the task specific clamp value, when explicitly requested from userspace
+ + * - the task group effective clamp value, for tasks not either in the root
+ + *   group or in an autogroup
    * - the system default clamp value, defined by the sysadmin
    */
   static inline struct uclamp_se
- -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+ +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
   {
- -      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ +      struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
         struct uclamp_se uc_max = uclamp_default[clamp_id];
   
         /* System default restrictions always apply */
@@@ -918,7 -880,7 +918,7 @@@
         return uc_req;
   }
   
- -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+ +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
   {
         struct uclamp_se uc_eff;
   
@@@ -942,7 -904,7 +942,7 @@@
    * for each bucket when all its RUNNABLE tasks require the same clamp.
    */
   static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
- -                                  unsigned int clamp_id)
+ +                                  enum uclamp_id clamp_id)
   {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@@ -980,7 -942,7 +980,7 @@@
    * enforce the expected state and warn.
    */
   static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
- -                                  unsigned int clamp_id)
+ +                                  enum uclamp_id clamp_id)
   {
         struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@@ -1019,7 -981,7 +1019,7 @@@
   
   static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@@ -1034,7 -996,7 +1034,7 @@@
   
   static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         if (unlikely(!p->sched_class->uclamp_enabled))
                 return;
@@@ -1043,82 -1005,15 +1043,82 @@@
                 uclamp_rq_dec_id(rq, p, clamp_id);
   }
   
+ +static inline void
+ +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+ +{
+ +      struct rq_flags rf;
+ +      struct rq *rq;
+ +
+ +      /*
+ +       * Lock the task and the rq where the task is (or was) queued.
+ +       *
+ +       * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ +       * price to pay to safely serialize util_{min,max} updates with
+ +       * enqueues, dequeues and migration operations.
+ +       * This is the same locking schema used by __set_cpus_allowed_ptr().
+ +       */
+ +      rq = task_rq_lock(p, &rf);
+ +
+ +      /*
+ +       * Setting the clamp bucket is serialized by task_rq_lock().
+ +       * If the task is not yet RUNNABLE and its task_struct is not
+ +       * affecting a valid clamp bucket, the next time it's enqueued,
+ +       * it will already see the updated clamp bucket value.
+ +       */
+ +      if (p->uclamp[clamp_id].active) {
+ +              uclamp_rq_dec_id(rq, p, clamp_id);
+ +              uclamp_rq_inc_id(rq, p, clamp_id);
+ +      }
+ +
+ +      task_rq_unlock(rq, p, &rf);
+ +}
+ +
+ +static inline void
+ +uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ +                         unsigned int clamps)
+ +{
+ +      enum uclamp_id clamp_id;
+ +      struct css_task_iter it;
+ +      struct task_struct *p;
+ +
+ +      css_task_iter_start(css, 0, &it);
+ +      while ((p = css_task_iter_next(&it))) {
+ +              for_each_clamp_id(clamp_id) {
+ +                      if ((0x1 << clamp_id) & clamps)
+ +                              uclamp_update_active(p, clamp_id);
+ +              }
+ +      }
+ +      css_task_iter_end(&it);
+ +}
+ +
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+ +static void uclamp_update_root_tg(void)
+ +{
+ +      struct task_group *tg = &root_task_group;
+ +
+ +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ +                    sysctl_sched_uclamp_util_min, false);
+ +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ +                    sysctl_sched_uclamp_util_max, false);
+ +
+ +      rcu_read_lock();
+ +      cpu_util_update_eff(&root_task_group.css);
+ +      rcu_read_unlock();
+ +}
+ +#else
+ +static void uclamp_update_root_tg(void) { }
+ +#endif
+ +
   int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos)
   {
+ +      bool update_root_tg = false;
         int old_min, old_max;
- -      static DEFINE_MUTEX(mutex);
         int result;
   
- -      mutex_lock(&mutex);
+ +      mutex_lock(&uclamp_mutex);
         old_min = sysctl_sched_uclamp_util_min;
         old_max = sysctl_sched_uclamp_util_max;
   
@@@ -1137,30 -1032,23 +1137,30 @@@
         if (old_min != sysctl_sched_uclamp_util_min) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                               sysctl_sched_uclamp_util_min, false);
+ +              update_root_tg = true;
         }
         if (old_max != sysctl_sched_uclamp_util_max) {
                 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                               sysctl_sched_uclamp_util_max, false);
+ +              update_root_tg = true;
         }
   
+ +      if (update_root_tg)
+ +              uclamp_update_root_tg();
+ +
         /*
- -       * Updating all the RUNNABLE task is expensive, keep it simple and do
- -       * just a lazy update at each next enqueue time.
+ +       * We update all RUNNABLE tasks only when task groups are in use.
+ +       * Otherwise, keep it simple and do just a lazy update at each next
+ +       * task enqueue time.
          */
+ +
         goto done;
   
   undo:
         sysctl_sched_uclamp_util_min = old_min;
         sysctl_sched_uclamp_util_max = old_max;
   done:
- -      mutex_unlock(&mutex);
+ +      mutex_unlock(&uclamp_mutex);
   
         return result;
   }
@@@ -1187,7 -1075,7 +1187,7 @@@ static int uclamp_validate(struct task_
   static void __setscheduler_uclamp(struct task_struct *p,
                                   const struct sched_attr *attr)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         /*
          * On scheduling class change, reset to default clamps for tasks
@@@ -1224,7 -1112,7 +1224,7 @@@
   
   static void uclamp_fork(struct task_struct *p)
   {
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
   
         for_each_clamp_id(clamp_id)
                 p->uclamp[clamp_id].active = false;
@@@ -1246,11 -1134,9 +1246,11 @@@
   static void __init init_uclamp(void)
   {
         struct uclamp_se uc_max = {};
- -      unsigned int clamp_id;
+ +      enum uclamp_id clamp_id;
         int cpu;
   
+ +      mutex_init(&uclamp_mutex);
+ +
         for_each_possible_cpu(cpu) {
                 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                 cpu_rq(cpu)->uclamp_flags = 0;
@@@ -1263,13 -1149,8 +1263,13 @@@
   
         /* System defaults allow max clamp values for both indexes */
         uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
- -      for_each_clamp_id(clamp_id)
+ +      for_each_clamp_id(clamp_id) {
                 uclamp_default[clamp_id] = uc_max;
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +              root_task_group.uclamp_req[clamp_id] = uc_max;
+ +              root_task_group.uclamp[clamp_id] = uc_max;
+ +#endif
+ +      }
   }
   
   #else /* CONFIG_UCLAMP_TASK */
@@@ -1613,7 -1494,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   }
   
   /*
@@@ -3333,8 -3214,12 +3333,8 @@@ static __always_inline struct rq 
   context_switch(struct rq *rq, struct task_struct *prev,
                struct task_struct *next, struct rq_flags *rf)
   {
- -      struct mm_struct *mm, *oldmm;
- -
         prepare_task_switch(rq, prev, next);
   
- -      mm = next->mm;
- -      oldmm = prev->active_mm;
         /*
          * For paravirt, this is coupled with an exit in switch_to to
          * combine the page table reload and the switch backend into
@@@ -3343,37 -3228,22 +3343,37 @@@
         arch_start_context_switch(prev);
   
         /*
- -       * If mm is non-NULL, we pass through switch_mm(). If mm is
- -       * NULL, we will pass through mmdrop() in finish_task_switch().
- -       * Both of these contain the full memory barrier required by
- -       * membarrier after storing to rq->curr, before returning to
- -       * user-space.
+ +       * kernel -> kernel   lazy + transfer active
+ +       *   user -> kernel   lazy + mmgrab() active
+ +       *
+ +       * kernel ->   user   switch + mmdrop() active
+ +       *   user ->   user   switch
          */
- -      if (!mm) {
- -              next->active_mm = oldmm;
- -              mmgrab(oldmm);
- -              enter_lazy_tlb(oldmm, next);
- -      } else
- -              switch_mm_irqs_off(oldmm, mm, next);
+ +      if (!next->mm) {                                // to kernel
+ +              enter_lazy_tlb(prev->active_mm, next);
+ +
+ +              next->active_mm = prev->active_mm;
+ +              if (prev->mm)                           // from user
+ +                      mmgrab(prev->active_mm);
+ +              else
+ +                      prev->active_mm = NULL;
+ +      } else {                                        // to user
+ +              /*
+ +               * sys_membarrier() requires an smp_mb() between setting
+ +               * rq->curr and returning to userspace.
+ +               *
+ +               * The below provides this either through switch_mm(), or in
+ +               * case 'prev->active_mm == next->mm' through
+ +               * finish_task_switch()'s mmdrop().
+ +               */
   
- -      if (!prev->mm) {
- -              prev->active_mm = NULL;
- -              rq->prev_mm = oldmm;
+ +              switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ +
+ +              if (!prev->mm) {                        // from kernel
+ +                      /* will mmdrop() in finish_task_switch(). */
+ +                      rq->prev_mm = prev->active_mm;
+ +                      prev->active_mm = NULL;
+ +              }
         }
   
         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@@ -3616,36 -3486,8 +3616,36 @@@ void scheduler_tick(void
   
   struct tick_work {
         int                     cpu;
+ +      atomic_t                state;
         struct delayed_work     work;
   };
+ +/* Values for ->state, see diagram below. */
+ +#define TICK_SCHED_REMOTE_OFFLINE     0
+ +#define TICK_SCHED_REMOTE_OFFLINING   1
+ +#define TICK_SCHED_REMOTE_RUNNING     2
+ +
+ +/*
+ + * State diagram for ->state:
+ + *
+ + *
+ + *          TICK_SCHED_REMOTE_OFFLINE
+ + *                    |   ^
+ + *                    |   |
+ + *                    |   | sched_tick_remote()
+ + *                    |   |
+ + *                    |   |
+ + *                    +--TICK_SCHED_REMOTE_OFFLINING
+ + *                    |   ^
+ + *                    |   |
+ + * sched_tick_start() |   | sched_tick_stop()
+ + *                    |   |
+ + *                    V   |
+ + *          TICK_SCHED_REMOTE_RUNNING
+ + *
+ + *
+ + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ + * and sched_tick_start() are happy to leave the state in RUNNING.
+ + */
   
   static struct tick_work __percpu *tick_work_cpu;
   
@@@ -3658,7 -3500,6 +3658,7 @@@ static void sched_tick_remote(struct wo
         struct task_struct *curr;
         struct rq_flags rf;
         u64 delta;
+ +      int os;
   
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@@ -3672,7 -3513,7 +3672,7 @@@
   
         rq_lock_irq(rq, &rf);
         curr = rq->curr;
- -      if (is_idle_task(curr))
+ +      if (is_idle_task(curr) || cpu_is_offline(cpu))
                 goto out_unlock;
   
         update_rq_clock(rq);
@@@ -3692,18 -3533,13 +3692,18 @@@ out_requeue
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
- -       * to keep scheduler internal stats reasonably up to date.
+ +       * to keep scheduler internal stats reasonably up to date.  But
+ +       * first update state to reflect hotplug activity if required.
          */
- -      queue_delayed_work(system_unbound_wq, dwork, HZ);
+ +      os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ +      if (os == TICK_SCHED_REMOTE_RUNNING)
+ +              queue_delayed_work(system_unbound_wq, dwork, HZ);
   }
   
   static void sched_tick_start(int cpu)
   {
+ +      int os;
         struct tick_work *twork;
   
         if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@@ -3712,20 -3548,15 +3712,20 @@@
         WARN_ON_ONCE(!tick_work_cpu);
   
         twork = per_cpu_ptr(tick_work_cpu, cpu);
- -      twork->cpu = cpu;
- -      INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- -      queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ +      if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ +              twork->cpu = cpu;
+ +              INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ +              queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ +      }
   }
   
   #ifdef CONFIG_HOTPLUG_CPU
   static void sched_tick_stop(int cpu)
   {
         struct tick_work *twork;
+ +      int os;
   
         if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                 return;
@@@ -3733,10 -3564,7 +3733,10 @@@
         WARN_ON_ONCE(!tick_work_cpu);
   
         twork = per_cpu_ptr(tick_work_cpu, cpu);
- -      cancel_delayed_work_sync(&twork->work);
+ +      /* There cannot be competing actions, but don't rely on stop-machine. */
+ +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ +      WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ +      /* Don't cancel, as this would mess up the state machine. */
   }
   #endif /* CONFIG_HOTPLUG_CPU */
   
@@@ -3744,6 -3572,7 +3744,6 @@@ int __init sched_tick_offload_init(void
   {
         tick_work_cpu = alloc_percpu(struct tick_work);
         BUG_ON(!tick_work_cpu);
- -
         return 0;
   }
   
@@@ -3752,7 -3581,7 +3752,7 @@@ static inline void sched_tick_start(in
   static inline void sched_tick_stop(int cpu) { }
   #endif
   
- -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
   /*
    * If the value passed in is equal to the current preempt count
@@@ -3871,13 -3700,22 +3871,22 @@@ static noinline void __schedule_bug(str
   /*
    * Various schedule()-time debugging checks and statistics:
    */
- static inline void schedule_debug(struct task_struct *prev)
+ static inline void schedule_debug(struct task_struct *prev, bool preempt)
   {
   #ifdef CONFIG_SCHED_STACK_END_CHECK
         if (task_stack_end_corrupted(prev))
                 panic("corrupted stack end detected inside scheduler\n");
   #endif
   
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+       if (!preempt && prev->state && prev->non_block_count) {
+               printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+                       prev->comm, prev->pid, prev->non_block_count);
+               dump_stack();
+               add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+       }
+ #endif
+ 
         if (unlikely(in_atomic_preempt_off())) {
                 __schedule_bug(prev);
                 preempt_count_set(PREEMPT_DISABLED);
@@@ -3910,7 -3748,7 +3919,7 @@@ pick_next_task(struct rq *rq, struct ta
   
                 p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
- -                      goto again;
+ +                      goto restart;
   
                 /* Assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
@@@ -3919,19 -3757,14 +3928,19 @@@
                 return p;
         }
   
- -again:
+ +restart:
+ +      /*
+ +       * Ensure that we put DL/RT tasks before the pick loop, such that they
+ +       * can PULL higher prio tasks when we lower the RQ 'priority'.
+ +       */
+ +      prev->sched_class->put_prev_task(rq, prev, rf);
+ +      if (!rq->nr_running)
+ +              newidle_balance(rq, rf);
+ +
         for_each_class(class) {
- -              p = class->pick_next_task(rq, prev, rf);
- -              if (p) {
- -                      if (unlikely(p == RETRY_TASK))
- -                              goto again;
+ +              p = class->pick_next_task(rq, NULL, NULL);
+ +              if (p)
                         return p;
- -              }
         }
   
         /* The idle class should always have a runnable task: */
@@@ -3958,7 -3791,7 +3967,7 @@@
    *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
    *      called on the nearest possible occasion:
    *
- - *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ + *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
    *
    *         - in syscall or exception context, at the next outmost
    *           preempt_enable(). (this might be as soon as the wake_up()'s
@@@ -3967,7 -3800,7 +3976,7 @@@
    *         - in IRQ context, return from interrupt-handler to
    *           preemptible context
    *
- - *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ + *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
    *         then at the next:
    *
    *          - cond_resched() call
@@@ -3989,7 -3822,7 +3998,7 @@@ static void __sched notrace __schedule(
         rq = cpu_rq(cpu);
         prev = rq->curr;
   
-       schedule_debug(prev);
+       schedule_debug(prev, preempt);
   
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
@@@ -4080,7 -3913,7 +4089,7 @@@ void __noreturn do_task_dead(void
   
   static inline void sched_submit_work(struct task_struct *tsk)
   {
- -      if (!tsk->state || tsk_is_pi_blocked(tsk))
+ +      if (!tsk->state)
                 return;
   
         /*
@@@ -4096,9 -3929,6 +4105,9 @@@
                 preempt_enable_no_resched();
         }
   
+ +      if (tsk_is_pi_blocked(tsk))
+ +              return;
+ +
         /*
          * If we are going to sleep and we have plugged IO queued,
          * make sure to submit it to avoid deadlocks.
@@@ -4212,7 -4042,7 +4221,7 @@@ static void __sched notrace preempt_sch
         } while (need_resched());
   }
   
- -#ifdef CONFIG_PREEMPT
+ +#ifdef CONFIG_PREEMPTION
   /*
    * this is the entry point to schedule() from in-kernel preemption
    * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4284,7 -4114,7 +4293,7 @@@ asmlinkage __visible void __sched notra
   }
   EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
   
- -#endif /* CONFIG_PREEMPT */
+ +#endif /* CONFIG_PREEMPTION */
   
   /*
    * this is the entry point to schedule() from kernel preemption
@@@ -4452,7 -4282,7 +4461,7 @@@ void rt_mutex_setprio(struct task_struc
         if (queued)
                 enqueue_task(rq, p, queue_flag);
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   
         check_class_changed(rq, p, prev_class, oldprio);
   out_unlock:
@@@ -4519,7 -4349,7 +4528,7 @@@ void set_user_nice(struct task_struct *
                         resched_curr(rq);
         }
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   out_unlock:
         task_rq_unlock(rq, p, &rf);
   }
@@@ -4836,9 -4666,6 +4845,9 @@@ recheck
                         return retval;
         }
   
+ +      if (pi)
+ +              cpuset_read_lock();
+ +
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
          * changing the priority of the task:
@@@ -4853,8 -4680,8 +4862,8 @@@
          * Changing the policy of the stop threads its a very bad idea:
          */
         if (p == rq->stop) {
- -              task_rq_unlock(rq, p, &rf);
- -              return -EINVAL;
+ +              retval = -EINVAL;
+ +              goto unlock;
         }
   
         /*
@@@ -4872,8 -4699,8 +4881,8 @@@
                         goto change;
   
                 p->sched_reset_on_fork = reset_on_fork;
- -              task_rq_unlock(rq, p, &rf);
- -              return 0;
+ +              retval = 0;
+ +              goto unlock;
         }
   change:
   
@@@ -4886,8 -4713,8 +4895,8 @@@
                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                 !task_group_is_autogroup(task_group(p))) {
- -                      task_rq_unlock(rq, p, &rf);
- -                      return -EPERM;
+ +                      retval = -EPERM;
+ +                      goto unlock;
                 }
   #endif
   #ifdef CONFIG_SMP
@@@ -4902,8 -4729,8 +4911,8 @@@
                          */
                         if (!cpumask_subset(span, p->cpus_ptr) ||
                             rq->rd->dl_bw.bw == 0) {
- -                              task_rq_unlock(rq, p, &rf);
- -                              return -EPERM;
+ +                              retval = -EPERM;
+ +                              goto unlock;
                         }
                 }
   #endif
@@@ -4913,8 -4740,6 +4922,8 @@@
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
+ +              if (pi)
+ +                      cpuset_read_unlock();
                 goto recheck;
         }
   
@@@ -4924,8 -4749,8 +4933,8 @@@
          * is available.
          */
         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- -              task_rq_unlock(rq, p, &rf);
- -              return -EBUSY;
+ +              retval = -EBUSY;
+ +              goto unlock;
         }
   
         p->sched_reset_on_fork = reset_on_fork;
@@@ -4967,7 -4792,7 +4976,7 @@@
                 enqueue_task(rq, p, queue_flags);
         }
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
   
         check_class_changed(rq, p, prev_class, oldprio);
   
@@@ -4975,22 -4800,14 +4984,22 @@@
         preempt_disable();
         task_rq_unlock(rq, p, &rf);
   
- -      if (pi)
+ +      if (pi) {
+ +              cpuset_read_unlock();
                 rt_mutex_adjust_pi(p);
+ +      }
   
         /* Run balance callbacks after we've adjusted the PI chain: */
         balance_callback(rq);
         preempt_enable();
   
         return 0;
+ +
+ +unlock:
+ +      task_rq_unlock(rq, p, &rf);
+ +      if (pi)
+ +              cpuset_read_unlock();
+ +      return retval;
   }
   
   static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -5074,15 -4891,10 +5083,15 @@@ do_sched_setscheduler(pid_t pid, int po
         rcu_read_lock();
         retval = -ESRCH;
         p = find_process_by_pid(pid);
- -      if (p != NULL)
- -              retval = sched_setscheduler(p, policy, &lparam);
+ +      if (likely(p))
+ +              get_task_struct(p);
         rcu_read_unlock();
   
+ +      if (likely(p)) {
+ +              retval = sched_setscheduler(p, policy, &lparam);
+ +              put_task_struct(p);
+ +      }
+ +
         return retval;
   }
   
@@@ -5299,40 -5111,37 +5308,40 @@@ out_unlock
         return retval;
   }
   
- -static int sched_read_attr(struct sched_attr __user *uattr,
- -                         struct sched_attr *attr,
- -                         unsigned int usize)
+ +/*
+ + * Copy the kernel size attribute structure (which might be larger
+ + * than what user-space knows about) to user-space.
+ + *
+ + * Note that all cases are valid: user-space buffer can be larger or
+ + * smaller than the kernel-space buffer. The usual case is that both
+ + * have the same size.
+ + */
+ +static int
+ +sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ +                      struct sched_attr *kattr,
+ +                      unsigned int usize)
   {
- -      int ret;
+ +      unsigned int ksize = sizeof(*kattr);
   
         if (!access_ok(uattr, usize))
                 return -EFAULT;
   
         /*
- -       * If we're handed a smaller struct than we know of,
- -       * ensure all the unknown bits are 0 - i.e. old
- -       * user-space does not get uncomplete information.
+ +       * sched_getattr() ABI forwards and backwards compatibility:
+ +       *
+ +       * If usize == ksize then we just copy everything to user-space and all is good.
+ +       *
+ +       * If usize < ksize then we only copy as much as user-space has space for,
+ +       * this keeps ABI compatibility as well. We skip the rest.
+ +       *
+ +       * If usize > ksize then user-space is using a newer version of the ABI,
+ +       * which part the kernel doesn't know about. Just ignore it - tooling can
+ +       * detect the kernel's knowledge of attributes from the attr->size value
+ +       * which is set to ksize in this case.
          */
- -      if (usize < sizeof(*attr)) {
- -              unsigned char *addr;
- -              unsigned char *end;
- -
- -              addr = (void *)attr + usize;
- -              end  = (void *)attr + sizeof(*attr);
- -
- -              for (; addr < end; addr++) {
- -                      if (*addr)
- -                              return -EFBIG;
- -              }
- -
- -              attr->size = usize;
- -      }
+ +      kattr->size = min(usize, ksize);
   
- -      ret = copy_to_user(uattr, attr, attr->size);
- -      if (ret)
+ +      if (copy_to_user(uattr, kattr, kattr->size))
                 return -EFAULT;
   
         return 0;
@@@ -5342,18 -5151,20 +5351,18 @@@
    * sys_sched_getattr - similar to sched_getparam, but with sched_attr
    * @pid: the pid in question.
    * @uattr: structure containing the extended parameters.
- - * @size: sizeof(attr) for fwd/bwd comp.
+ + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
    * @flags: for future extension.
    */
   SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- -              unsigned int, size, unsigned int, flags)
+ +              unsigned int, usize, unsigned int, flags)
   {
- -      struct sched_attr attr = {
- -              .size = sizeof(struct sched_attr),
- -      };
+ +      struct sched_attr kattr = { };
         struct task_struct *p;
         int retval;
   
- -      if (!uattr || pid < 0 || size > PAGE_SIZE ||
- -          size < SCHED_ATTR_SIZE_VER0 || flags)
+ +      if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ +          usize < SCHED_ATTR_SIZE_VER0 || flags)
                 return -EINVAL;
   
         rcu_read_lock();
@@@ -5366,24 -5177,25 +5375,24 @@@
         if (retval)
                 goto out_unlock;
   
- -      attr.sched_policy = p->policy;
+ +      kattr.sched_policy = p->policy;
         if (p->sched_reset_on_fork)
- -              attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ +              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
         if (task_has_dl_policy(p))
- -              __getparam_dl(p, &attr);
+ +              __getparam_dl(p, &kattr);
         else if (task_has_rt_policy(p))
- -              attr.sched_priority = p->rt_priority;
+ +              kattr.sched_priority = p->rt_priority;
         else
- -              attr.sched_nice = task_nice(p);
+ +              kattr.sched_nice = task_nice(p);
   
   #ifdef CONFIG_UCLAMP_TASK
- -      attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- -      attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ +      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ +      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
   #endif
   
         rcu_read_unlock();
   
- -      retval = sched_read_attr(uattr, &attr, size);
- -      return retval;
+ +      return sched_attr_copy_to_user(uattr, &kattr, usize);
   
   out_unlock:
         rcu_read_unlock();
@@@ -5613,7 -5425,7 +5622,7 @@@ SYSCALL_DEFINE0(sched_yield
         return 0;
   }
   
- -#ifndef CONFIG_PREEMPT
+ +#ifndef CONFIG_PREEMPTION
   int __sched _cond_resched(void)
   {
         if (should_resched(0)) {
@@@ -5630,7 -5442,7 +5639,7 @@@ EXPORT_SYMBOL(_cond_resched)
    * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
    * call schedule, and on return reacquire the lock.
    *
- - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
    * operations here to prevent schedule() from being called twice (once via
    * spin_unlock(), once by hand).
    */
@@@ -6169,7 -5981,7 +6178,7 @@@ void sched_setnuma(struct task_struct *
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
         if (running)
- -              set_curr_task(rq, p);
+ +              set_next_task(rq, p);
         task_rq_unlock(rq, p, &rf);
   }
   #endif /* CONFIG_NUMA_BALANCING */
@@@ -6209,22 -6021,21 +6218,22 @@@ static void calc_load_migrate(struct r
                 atomic_long_add(delta, &calc_load_tasks);
   }
   
- -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+ +static struct task_struct *__pick_migrate_task(struct rq *rq)
   {
- -}
+ +      const struct sched_class *class;
+ +      struct task_struct *next;
   
- -static const struct sched_class fake_sched_class = {
- -      .put_prev_task = put_prev_task_fake,
- -};
+ +      for_each_class(class) {
+ +              next = class->pick_next_task(rq, NULL, NULL);
+ +              if (next) {
+ +                      next->sched_class->put_prev_task(rq, next, NULL);
+ +                      return next;
+ +              }
+ +      }
   
- -static struct task_struct fake_task = {
- -      /*
- -       * Avoid pull_{rt,dl}_task()
- -       */
- -      .prio = MAX_PRIO + 1,
- -      .sched_class = &fake_sched_class,
- -};
+ +      /* The idle class should always have a runnable task */
+ +      BUG();
+ +}
   
   /*
    * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6267,7 -6078,12 +6276,7 @@@ static void migrate_tasks(struct rq *de
                 if (rq->nr_running == 1)
                         break;
   
- -              /*
- -               * pick_next_task() assumes pinned rq->lock:
- -               */
- -              next = pick_next_task(rq, &fake_task, rf);
- -              BUG_ON(!next);
- -              put_prev_task(rq, next);
+ +              next = __pick_migrate_task(rq);
   
                 /*
                  * Rules for changing task_struct::cpus_mask are holding
@@@ -6564,19 -6380,19 +6573,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
   
   void __init sched_init(void)
   {
- -      unsigned long alloc_size = 0, ptr;
+ +      unsigned long ptr = 0;
         int i;
   
         wait_bit_init();
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ +      ptr += 2 * nr_cpu_ids * sizeof(void **);
   #endif
   #ifdef CONFIG_RT_GROUP_SCHED
- -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ +      ptr += 2 * nr_cpu_ids * sizeof(void **);
   #endif
- -      if (alloc_size) {
- -              ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ +      if (ptr) {
+ +              ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.se = (struct sched_entity **)ptr;
@@@ -6763,7 -6579,7 +6772,7 @@@ void ___might_sleep(const char *file, i
         rcu_sleep_check();
   
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-            !is_idle_task(current)) ||
+            !is_idle_task(current) && !current->non_block_count) ||
             system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
             oops_in_progress)
                 return;
@@@ -6779,8 -6595,8 +6788,8 @@@
                 "BUG: sleeping function called from invalid context at %s:%d\n",
                         file, line);
         printk(KERN_ERR
-               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-                       in_atomic(), irqs_disabled(),
+               "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+                       in_atomic(), irqs_disabled(), current->non_block_count,
                         current->pid, current->comm);
   
         if (task_stack_end_corrupted(current))
@@@ -6895,7 -6711,7 +6904,7 @@@ struct task_struct *curr_task(int cpu
   
   #ifdef CONFIG_IA64
   /**
- - * set_curr_task - set the current task for a given CPU.
+ + * ia64_set_curr_task - set the current task for a given CPU.
    * @cpu: the processor in question.
    * @p: the task pointer to set.
    *
@@@ -6920,20 -6736,6 +6929,20 @@@ void ia64_set_curr_task(int cpu, struc
   /* task_group_lock serializes the addition/removal of task groups */
   static DEFINE_SPINLOCK(task_group_lock);
   
+ +static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ +                                          struct task_group *parent)
+ +{
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      enum uclamp_id clamp_id;
+ +
+ +      for_each_clamp_id(clamp_id) {
+ +              uclamp_se_set(&tg->uclamp_req[clamp_id],
+ +                            uclamp_none(clamp_id), false);
+ +              tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ +      }
+ +#endif
+ +}
+ +
   static void sched_free_group(struct task_group *tg)
   {
         free_fair_sched_group(tg);
@@@ -6957,8 -6759,6 +6966,8 @@@ struct task_group *sched_create_group(s
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
   
+ +      alloc_uclamp_sched_group(tg, parent);
+ +
         return tg;
   
   err:
@@@ -7062,7 -6862,7 +7071,7 @@@ void sched_move_task(struct task_struc
         if (queued)
                 enqueue_task(rq, tsk, queue_flags);
         if (running)
- -              set_curr_task(rq, tsk);
+ +              set_next_task(rq, tsk);
   
         task_rq_unlock(rq, tsk, &rf);
   }
@@@ -7145,6 -6945,10 +7154,6 @@@ static int cpu_cgroup_can_attach(struc
   #ifdef CONFIG_RT_GROUP_SCHED
                 if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
- -#else
- -              /* We don't support RT-tasks being in separate groups */
- -              if (task->sched_class != &fair_sched_class)
- -                      return -EINVAL;
   #endif
                 /*
                  * Serialize against wake_up_new_task() such that if its
@@@ -7175,178 -6979,6 +7184,178 @@@ static void cpu_cgroup_attach(struct cg
                 sched_move_task(task);
   }
   
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+ +{
+ +      struct cgroup_subsys_state *top_css = css;
+ +      struct uclamp_se *uc_parent = NULL;
+ +      struct uclamp_se *uc_se = NULL;
+ +      unsigned int eff[UCLAMP_CNT];
+ +      enum uclamp_id clamp_id;
+ +      unsigned int clamps;
+ +
+ +      css_for_each_descendant_pre(css, top_css) {
+ +              uc_parent = css_tg(css)->parent
+ +                      ? css_tg(css)->parent->uclamp : NULL;
+ +
+ +              for_each_clamp_id(clamp_id) {
+ +                      /* Assume effective clamps match requested clamps */
+ +                      eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ +                      /* Cap effective clamps with parent's effective clamps */
+ +                      if (uc_parent &&
+ +                          eff[clamp_id] > uc_parent[clamp_id].value) {
+ +                              eff[clamp_id] = uc_parent[clamp_id].value;
+ +                      }
+ +              }
+ +              /* Ensure protection is always capped by limit */
+ +              eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+ +
+ +              /* Propagate most restrictive effective clamps */
+ +              clamps = 0x0;
+ +              uc_se = css_tg(css)->uclamp;
+ +              for_each_clamp_id(clamp_id) {
+ +                      if (eff[clamp_id] == uc_se[clamp_id].value)
+ +                              continue;
+ +                      uc_se[clamp_id].value = eff[clamp_id];
+ +                      uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ +                      clamps |= (0x1 << clamp_id);
+ +              }
+ +              if (!clamps) {
+ +                      css = css_rightmost_descendant(css);
+ +                      continue;
+ +              }
+ +
+ +              /* Immediately update descendants RUNNABLE tasks */
+ +              uclamp_update_active_tasks(css, clamps);
+ +      }
+ +}
+ +
+ +/*
+ + * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ + * C expression. Since there is no way to convert a macro argument (N) into a
+ + * character constant, use two levels of macros.
+ + */
+ +#define _POW10(exp) ((unsigned int)1e##exp)
+ +#define POW10(exp) _POW10(exp)
+ +
+ +struct uclamp_request {
+ +#define UCLAMP_PERCENT_SHIFT  2
+ +#define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ +      s64 percent;
+ +      u64 util;
+ +      int ret;
+ +};
+ +
+ +static inline struct uclamp_request
+ +capacity_from_percent(char *buf)
+ +{
+ +      struct uclamp_request req = {
+ +              .percent = UCLAMP_PERCENT_SCALE,
+ +              .util = SCHED_CAPACITY_SCALE,
+ +              .ret = 0,
+ +      };
+ +
+ +      buf = strim(buf);
+ +      if (strcmp(buf, "max")) {
+ +              req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ +                                           &req.percent);
+ +              if (req.ret)
+ +                      return req;
+ +              if (req.percent > UCLAMP_PERCENT_SCALE) {
+ +                      req.ret = -ERANGE;
+ +                      return req;
+ +              }
+ +
+ +              req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ +              req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ +      }
+ +
+ +      return req;
+ +}
+ +
+ +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ +                              size_t nbytes, loff_t off,
+ +                              enum uclamp_id clamp_id)
+ +{
+ +      struct uclamp_request req;
+ +      struct task_group *tg;
+ +
+ +      req = capacity_from_percent(buf);
+ +      if (req.ret)
+ +              return req.ret;
+ +
+ +      mutex_lock(&uclamp_mutex);
+ +      rcu_read_lock();
+ +
+ +      tg = css_tg(of_css(of));
+ +      if (tg->uclamp_req[clamp_id].value != req.util)
+ +              uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+ +
+ +      /*
+ +       * Because the conversion rounding is not recoverable, we keep track
+ +       * of the exact requested value
+ +       */
+ +      tg->uclamp_pct[clamp_id] = req.percent;
+ +
+ +      /* Update effective clamps to track the most restrictive value */
+ +      cpu_util_update_eff(of_css(of));
+ +
+ +      rcu_read_unlock();
+ +      mutex_unlock(&uclamp_mutex);
+ +
+ +      return nbytes;
+ +}
+ +
+ +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ +                                  char *buf, size_t nbytes,
+ +                                  loff_t off)
+ +{
+ +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+ +}
+ +
+ +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ +                                  char *buf, size_t nbytes,
+ +                                  loff_t off)
+ +{
+ +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+ +}
+ +
+ +static inline void cpu_uclamp_print(struct seq_file *sf,
+ +                                  enum uclamp_id clamp_id)
+ +{
+ +      struct task_group *tg;
+ +      u64 util_clamp;
+ +      u64 percent;
+ +      u32 rem;
+ +
+ +      rcu_read_lock();
+ +      tg = css_tg(seq_css(sf));
+ +      util_clamp = tg->uclamp_req[clamp_id].value;
+ +      rcu_read_unlock();
+ +
+ +      if (util_clamp == SCHED_CAPACITY_SCALE) {
+ +              seq_puts(sf, "max\n");
+ +              return;
+ +      }
+ +
+ +      percent = tg->uclamp_pct[clamp_id];
+ +      percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ +      seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+ +}
+ +
+ +static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+ +{
+ +      cpu_uclamp_print(sf, UCLAMP_MIN);
+ +      return 0;
+ +}
+ +
+ +static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+ +{
+ +      cpu_uclamp_print(sf, UCLAMP_MAX);
+ +      return 0;
+ +}
+ +#endif /* CONFIG_UCLAMP_TASK_GROUP */
+ +
   #ifdef CONFIG_FAIR_GROUP_SCHED
   static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
@@@ -7691,20 -7323,6 +7700,20 @@@ static struct cftype cpu_legacy_files[
                 .read_u64 = cpu_rt_period_read_uint,
                 .write_u64 = cpu_rt_period_write_uint,
         },
+ +#endif
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      {
+ +              .name = "uclamp.min",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_min_show,
+ +              .write = cpu_uclamp_min_write,
+ +      },
+ +      {
+ +              .name = "uclamp.max",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_max_show,
+ +              .write = cpu_uclamp_max_write,
+ +      },
   #endif
         { }     /* Terminate */
   };
@@@ -7872,20 -7490,6 +7881,20 @@@ static struct cftype cpu_files[] = 
                 .seq_show = cpu_max_show,
                 .write = cpu_max_write,
         },
+ +#endif
+ +#ifdef CONFIG_UCLAMP_TASK_GROUP
+ +      {
+ +              .name = "uclamp.min",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_min_show,
+ +              .write = cpu_uclamp_min_write,
+ +      },
+ +      {
+ +              .name = "uclamp.max",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cpu_uclamp_max_show,
+ +              .write = cpu_uclamp_max_write,
+ +      },
   #endif
         { }     /* terminate */
   };
diff --combined mm/madvise.c

index bac973b9f2cc71a11a5d010f38cc672adbdc1772,afe2b015ea58a3a9be6356b97775aa3674bc1234..88babcc384b9d4362b929d1771921dda30a80b67
--- 1/mm/madvise.c
--- 2/mm/madvise.c
+++ b/mm/madvise.c
@@@ -14,13 -14,13 +14,14 @@@
   #include <linux/userfaultfd_k.h>
   #include <linux/hugetlb.h>
   #include <linux/falloc.h>
+ +#include <linux/fadvise.h>
   #include <linux/sched.h>
   #include <linux/ksm.h>
   #include <linux/fs.h>
   #include <linux/file.h>
   #include <linux/blkdev.h>
   #include <linux/backing-dev.h>
+ #include <linux/pagewalk.h>
   #include <linux/swap.h>
   #include <linux/swapops.h>
   #include <linux/shmem_fs.h>
@@@ -226,19 -226,9 +227,9 @@@ static int swapin_walk_pmd_entry(pmd_t 
         return 0;
   }
   
- static void force_swapin_readahead(struct vm_area_struct *vma,
-               unsigned long start, unsigned long end)
- {
-       struct mm_walk walk = {
-               .mm = vma->vm_mm,
-               .pmd_entry = swapin_walk_pmd_entry,
-               .private = vma,
-       };
- 
-       walk_page_range(start, end, &walk);
- 
-       lru_add_drain();        /* Push any new pages onto the LRU now */
- }
+ static const struct mm_walk_ops swapin_walk_ops = {
+       .pmd_entry              = swapin_walk_pmd_entry,
+ };
   
   static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                 unsigned long start, unsigned long end,
@@@ -276,12 -266,12 +267,13 @@@ static long madvise_willneed(struct vm_
                              unsigned long start, unsigned long end)
   {
         struct file *file = vma->vm_file;
+ +      loff_t offset;
   
         *prev = vma;
   #ifdef CONFIG_SWAP
         if (!file) {
-               force_swapin_readahead(vma, start, end);
+               walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+               lru_add_drain(); /* Push any new pages onto the LRU now */
                 return 0;
         }
   
@@@ -300,20 -290,12 +292,20 @@@
                 return 0;
         }
   
- -      start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- -      if (end > vma->vm_end)
- -              end = vma->vm_end;
- -      end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- -
- -      force_page_cache_readahead(file->f_mapping, file, start, end - start);
+ +      /*
+ +       * Filesystem's fadvise may need to take various locks.  We need to
+ +       * explicitly grab a reference because the vma (and hence the
+ +       * vma's reference to the file) can go away as soon as we drop
+ +       * mmap_sem.
+ +       */
+ +      *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
+ +      get_file(file);
+ +      up_read(&current->mm->mmap_sem);
+ +      offset = (loff_t)(start - vma->vm_start)
+ +                      + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ +      vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+ +      fput(file);
+ +      down_read(&current->mm->mmap_sem);
         return 0;
   }
   
@@@ -450,20 -432,9 +442,9 @@@ next
         return 0;
   }
   
- static void madvise_free_page_range(struct mmu_gather *tlb,
-                            struct vm_area_struct *vma,
-                            unsigned long addr, unsigned long end)
- {
-       struct mm_walk free_walk = {
-               .pmd_entry = madvise_free_pte_range,
-               .mm = vma->vm_mm,
-               .private = tlb,
-       };
- 
-       tlb_start_vma(tlb, vma);
-       walk_page_range(addr, end, &free_walk);
-       tlb_end_vma(tlb, vma);
- }
+ static const struct mm_walk_ops madvise_free_walk_ops = {
+       .pmd_entry              = madvise_free_pte_range,
+ };
   
   static int madvise_free_single_vma(struct vm_area_struct *vma,
                         unsigned long start_addr, unsigned long end_addr)
@@@ -490,7 -461,10 +471,10 @@@
         update_hiwater_rss(mm);
   
         mmu_notifier_invalidate_range_start(&range);
-       madvise_free_page_range(&tlb, vma, range.start, range.end);
+       tlb_start_vma(&tlb, vma);
+       walk_page_range(vma->vm_mm, range.start, range.end,
+                       &madvise_free_walk_ops, &tlb);
+       tlb_end_vma(&tlb, vma);
         mmu_notifier_invalidate_range_end(&range);
         tlb_finish_mmu(&tlb, range.start, range.end);
   
diff --combined mm/memcontrol.c

index 597d5810187256a6dd681948266de8a362855d69,9b2516a76be234475a0b6beb4f341060e95ec419..f3c15bb07cce4be6dc9eb6143da2625828c56c4a
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -25,7 -25,7 +25,7 @@@
   #include <linux/page_counter.h>
   #include <linux/memcontrol.h>
   #include <linux/cgroup.h>
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
   #include <linux/sched/mm.h>
   #include <linux/shmem_fs.h>
   #include <linux/hugetlb.h>
@@@ -87,10 -87,6 +87,10 @@@ int do_swap_account __read_mostly
   #define do_swap_account               0
   #endif
   
+ +#ifdef CONFIG_CGROUP_WRITEBACK
+ +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+ +#endif
+ +
   /* Whether legacy memory+swap accounting is active */
   static bool do_memsw_account(void)
   {
@@@ -756,13 -752,15 +756,13 @@@ void __mod_lruvec_state(struct lruvec *
         /* Update memcg */
         __mod_memcg_state(memcg, idx, val);
   
+ +      /* Update lruvec */
+ +      __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+ +
         x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
         if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
                 struct mem_cgroup_per_node *pi;
   
- -              /*
- -               * Batch local counters to keep them in sync with
- -               * the hierarchical ones.
- -               */
- -              __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
                 for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
                         atomic_long_add(x, &pi->lruvec_stat[idx]);
                 x = 0;
@@@ -3262,72 -3260,6 +3262,72 @@@ static u64 mem_cgroup_read_u64(struct c
         }
   }
   
+ +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+ +{
+ +      unsigned long stat[MEMCG_NR_STAT];
+ +      struct mem_cgroup *mi;
+ +      int node, cpu, i;
+ +      int min_idx, max_idx;
+ +
+ +      if (slab_only) {
+ +              min_idx = NR_SLAB_RECLAIMABLE;
+ +              max_idx = NR_SLAB_UNRECLAIMABLE;
+ +      } else {
+ +              min_idx = 0;
+ +              max_idx = MEMCG_NR_STAT;
+ +      }
+ +
+ +      for (i = min_idx; i < max_idx; i++)
+ +              stat[i] = 0;
+ +
+ +      for_each_online_cpu(cpu)
+ +              for (i = min_idx; i < max_idx; i++)
+ +                      stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+ +
+ +      for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ +              for (i = min_idx; i < max_idx; i++)
+ +                      atomic_long_add(stat[i], &mi->vmstats[i]);
+ +
+ +      if (!slab_only)
+ +              max_idx = NR_VM_NODE_STAT_ITEMS;
+ +
+ +      for_each_node(node) {
+ +              struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ +              struct mem_cgroup_per_node *pi;
+ +
+ +              for (i = min_idx; i < max_idx; i++)
+ +                      stat[i] = 0;
+ +
+ +              for_each_online_cpu(cpu)
+ +                      for (i = min_idx; i < max_idx; i++)
+ +                              stat[i] += per_cpu(
+ +                                      pn->lruvec_stat_cpu->count[i], cpu);
+ +
+ +              for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ +                      for (i = min_idx; i < max_idx; i++)
+ +                              atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ +      }
+ +}
+ +
+ +static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+ +{
+ +      unsigned long events[NR_VM_EVENT_ITEMS];
+ +      struct mem_cgroup *mi;
+ +      int cpu, i;
+ +
+ +      for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ +              events[i] = 0;
+ +
+ +      for_each_online_cpu(cpu)
+ +              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ +                      events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ +                                           cpu);
+ +
+ +      for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ +              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ +                      atomic_long_add(events[i], &mi->vmevents[i]);
+ +}
+ +
   #ifdef CONFIG_MEMCG_KMEM
   static int memcg_online_kmem(struct mem_cgroup *memcg)
   {
@@@ -3377,14 -3309,7 +3377,14 @@@ static void memcg_offline_kmem(struct m
         if (!parent)
                 parent = root_mem_cgroup;
   
+ +      /*
+ +       * Deactivate and reparent kmem_caches. Then flush percpu
+ +       * slab statistics to have precise values at the parent and
+ +       * all ancestor levels. It's required to keep slab stats
+ +       * accurate after the reparenting of kmem_caches.
+ +       */
         memcg_deactivate_kmem_caches(memcg, parent);
+ +      memcg_flush_percpu_vmstats(memcg, true);
   
         kmemcg_id = memcg->kmemcg_id;
         BUG_ON(kmemcg_id < 0);
@@@ -4176,8 -4101,6 +4176,8 @@@ static int mem_cgroup_oom_control_write
   
   #ifdef CONFIG_CGROUP_WRITEBACK
   
+ +#include <trace/events/writeback.h>
+ +
   static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
   {
         return wb_domain_init(&memcg->cgwb_domain, gfp);
@@@ -4261,130 -4184,6 +4261,130 @@@ void mem_cgroup_wb_stats(struct bdi_wri
         }
   }
   
+ +/*
+ + * Foreign dirty flushing
+ + *
+ + * There's an inherent mismatch between memcg and writeback.  The former
+ + * tracks ownership per-page while the latter per-inode.  This was a
+ + * deliberate design decision because honoring per-page ownership in the
+ + * writeback path is complicated, may lead to higher CPU and IO overheads
+ + * and deemed unnecessary given that write-sharing an inode across
+ + * different cgroups isn't a common use-case.
+ + *
+ + * Combined with inode majority-writer ownership switching, this works well
+ + * enough in most cases but there are some pathological cases.  For
+ + * example, let's say there are two cgroups A and B which keep writing to
+ + * different but confined parts of the same inode.  B owns the inode and
+ + * A's memory is limited far below B's.  A's dirty ratio can rise enough to
+ + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ + * triggering background writeback.  A will be slowed down without a way to
+ + * make writeback of the dirty pages happen.
+ + *
+ + * Conditions like the above can lead to a cgroup getting repeatedly and
+ + * severely throttled after making some progress after each
+ + * dirty_expire_interval while the underlying IO device is almost
+ + * completely idle.
+ + *
+ + * Solving this problem completely requires matching the ownership tracking
+ + * granularities between memcg and writeback in either direction.  However,
+ + * the more egregious behaviors can be avoided by simply remembering the
+ + * most recent foreign dirtying events and initiating remote flushes on
+ + * them when local writeback isn't enough to keep the memory clean enough.
+ + *
+ + * The following two functions implement such mechanism.  When a foreign
+ + * page - a page whose memcg and writeback ownerships don't match - is
+ + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ + * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
+ + * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ + * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ + * foreign bdi_writebacks which haven't expired.  Both the numbers of
+ + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ + * limited to MEMCG_CGWB_FRN_CNT.
+ + *
+ + * The mechanism only remembers IDs and doesn't hold any object references.
+ + * As being wrong occasionally doesn't matter, updates and accesses to the
+ + * records are lockless and racy.
+ + */
+ +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ +                                           struct bdi_writeback *wb)
+ +{
+ +      struct mem_cgroup *memcg = page->mem_cgroup;
+ +      struct memcg_cgwb_frn *frn;
+ +      u64 now = get_jiffies_64();
+ +      u64 oldest_at = now;
+ +      int oldest = -1;
+ +      int i;
+ +
+ +      trace_track_foreign_dirty(page, wb);
+ +
+ +      /*
+ +       * Pick the slot to use.  If there is already a slot for @wb, keep
+ +       * using it.  If not replace the oldest one which isn't being
+ +       * written out.
+ +       */
+ +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ +              frn = &memcg->cgwb_frn[i];
+ +              if (frn->bdi_id == wb->bdi->id &&
+ +                  frn->memcg_id == wb->memcg_css->id)
+ +                      break;
+ +              if (time_before64(frn->at, oldest_at) &&
+ +                  atomic_read(&frn->done.cnt) == 1) {
+ +                      oldest = i;
+ +                      oldest_at = frn->at;
+ +              }
+ +      }
+ +
+ +      if (i < MEMCG_CGWB_FRN_CNT) {
+ +              /*
+ +               * Re-using an existing one.  Update timestamp lazily to
+ +               * avoid making the cacheline hot.  We want them to be
+ +               * reasonably up-to-date and significantly shorter than
+ +               * dirty_expire_interval as that's what expires the record.
+ +               * Use the shorter of 1s and dirty_expire_interval / 8.
+ +               */
+ +              unsigned long update_intv =
+ +                      min_t(unsigned long, HZ,
+ +                            msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+ +
+ +              if (time_before64(frn->at, now - update_intv))
+ +                      frn->at = now;
+ +      } else if (oldest >= 0) {
+ +              /* replace the oldest free one */
+ +              frn = &memcg->cgwb_frn[oldest];
+ +              frn->bdi_id = wb->bdi->id;
+ +              frn->memcg_id = wb->memcg_css->id;
+ +              frn->at = now;
+ +      }
+ +}
+ +
+ +/* issue foreign writeback flushes for recorded foreign dirtying events */
+ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+ +{
+ +      struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ +      unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ +      u64 now = jiffies_64;
+ +      int i;
+ +
+ +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ +              struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+ +
+ +              /*
+ +               * If the record is older than dirty_expire_interval,
+ +               * writeback on it has already started.  No need to kick it
+ +               * off again.  Also, don't start a new one if there's
+ +               * already one in flight.
+ +               */
+ +              if (time_after64(frn->at, now - intv) &&
+ +                  atomic_read(&frn->done.cnt) == 1) {
+ +                      frn->at = 0;
+ +                      trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ +                      cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ +                                             WB_REASON_FOREIGN_FLUSH,
+ +                                             &frn->done);
+ +              }
+ +      }
+ +}
+ +
   #else /* CONFIG_CGROUP_WRITEBACK */
   
   static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@@ -4883,12 -4682,6 +4883,12 @@@ static void __mem_cgroup_free(struct me
   {
         int node;
   
+ +      /*
+ +       * Flush percpu vmstats and vmevents to guarantee the value correctness
+ +       * on parent's and all ancestor levels.
+ +       */
+ +      memcg_flush_percpu_vmstats(memcg, false);
+ +      memcg_flush_percpu_vmevents(memcg);
         for_each_node(node)
                 free_mem_cgroup_per_node_info(memcg, node);
         free_percpu(memcg->vmstats_percpu);
@@@ -4907,7 -4700,6 +4907,7 @@@ static struct mem_cgroup *mem_cgroup_al
         struct mem_cgroup *memcg;
         unsigned int size;
         int node;
+ +      int __maybe_unused i;
   
         size = sizeof(struct mem_cgroup);
         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@@ -4951,9 -4743,6 +4951,9 @@@
   #endif
   #ifdef CONFIG_CGROUP_WRITEBACK
         INIT_LIST_HEAD(&memcg->cgwb_list);
+ +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ +              memcg->cgwb_frn[i].done =
+ +                      __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
   #endif
         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
         return memcg;
@@@ -5083,12 -4872,7 +5083,12 @@@ static void mem_cgroup_css_released(str
   static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
   {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ +      int __maybe_unused i;
   
+ +#ifdef CONFIG_CGROUP_WRITEBACK
+ +      for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ +              wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+ +#endif
         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                 static_branch_dec(&memcg_sockets_enabled_key);
   
@@@ -5499,17 -5283,16 +5499,16 @@@ static int mem_cgroup_count_precharge_p
         return 0;
   }
   
+ static const struct mm_walk_ops precharge_walk_ops = {
+       .pmd_entry      = mem_cgroup_count_precharge_pte_range,
+ };
+ 
   static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
   {
         unsigned long precharge;
   
-       struct mm_walk mem_cgroup_count_precharge_walk = {
-               .pmd_entry = mem_cgroup_count_precharge_pte_range,
-               .mm = mm,
-       };
         down_read(&mm->mmap_sem);
-       walk_page_range(0, mm->highest_vm_end,
-                       &mem_cgroup_count_precharge_walk);
+       walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
         up_read(&mm->mmap_sem);
   
         precharge = mc.precharge;
@@@ -5778,13 -5561,12 +5777,12 @@@ put:                 /* get_mctgt_type() gets the pag
         return ret;
   }
   
+ static const struct mm_walk_ops charge_walk_ops = {
+       .pmd_entry      = mem_cgroup_move_charge_pte_range,
+ };
+ 
   static void mem_cgroup_move_charge(void)
   {
-       struct mm_walk mem_cgroup_move_charge_walk = {
-               .pmd_entry = mem_cgroup_move_charge_pte_range,
-               .mm = mc.mm,
-       };
- 
         lru_add_drain_all();
         /*
          * Signal lock_page_memcg() to take the memcg's move_lock
@@@ -5810,7 -5592,8 +5808,8 @@@ retry
          * When we have consumed all precharges and failed in doing
          * additional charge, the page walk just aborts.
          */
-       walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+       walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+                       NULL);
   
         up_read(&mc.mm->mmap_sem);
         atomic_dec(&mc.from->moving_account);
diff --combined mm/page_alloc.c

index 6991ccec9c322ffb843110bb69cf2326d64b266c,b39baa2b1fafcaf0674e28c6ae63c2c2a1d1485f..ff5484fdbdf9908a9064129f2990a13f4cfc2247
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -2238,12 -2238,27 +2238,12 @@@ static int move_freepages(struct zone *
         unsigned int order;
         int pages_moved = 0;
   
- -#ifndef CONFIG_HOLES_IN_ZONE
- -      /*
- -       * page_zone is not safe to call in this context when
- -       * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- -       * anyway as we check zone boundaries in move_freepages_block().
- -       * Remove at a later date when no bug reports exist related to
- -       * grouping pages by mobility
- -       */
- -      VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- -                pfn_valid(page_to_pfn(end_page)) &&
- -                page_zone(start_page) != page_zone(end_page));
- -#endif
         for (page = start_page; page <= end_page;) {
                 if (!pfn_valid_within(page_to_pfn(page))) {
                         page++;
                         continue;
                 }
   
- -              /* Make sure we are not inadvertently changing nodes */
- -              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
- -
                 if (!PageBuddy(page)) {
                         /*
                          * We assume that pages that could be isolated for
@@@ -2258,10 -2273,6 +2258,10 @@@
                         continue;
                 }
   
+ +              /* Make sure we are not inadvertently changing nodes */
+ +              VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ +              VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+ +
                 order = page_order(page);
                 move_to_free_area(page, &zone->free_area[order], migratetype);
                 page += 1 << order;
@@@ -3511,7 -3522,7 +3511,7 @@@ bool zone_watermark_ok_safe(struct zon
   static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
   {
         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- -                              RECLAIM_DISTANCE;
+ +                              node_reclaim_distance;
   }
   #else /* CONFIG_NUMA */
   static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@@ -5971,7 -5982,7 +5971,7 @@@ void __ref memmap_init_zone_device(stru
                 }
         }
   
-       pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+       pr_info("%s initialised %lu pages in %ums\n", __func__,
                 size, jiffies_to_msecs(jiffies - start));
   }
author	Linus Torvalds <[email protected]>
	Sat, 21 Sep 2019 17:07:42 +0000 (10:07 -0700)
committer	Linus Torvalds <[email protected]>
	Sat, 21 Sep 2019 17:07:42 +0000 (10:07 -0700)
		1	2
arch/s390/mm/gmap.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/amd/amdkfd/kfd_priv.h	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/amd/amdkfd/kfd_process.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/nouveau/nouveau_drm.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/radeon/radeon.h	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/radeon/radeon_device.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/radeon/radeon_drv.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/radeon/radeon_mn.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/infiniband/hw/mlx5/main.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/infiniband/hw/mlx5/mr.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/infiniband/hw/mlx5/odp.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/madvise.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history