*/
#include <linux/kernel.h>
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
spin_lock_init(&gmap->guest_table_lock);
spin_lock_init(&gmap->shadow_lock);
- atomic_set(&gmap->ref_count, 1);
+ refcount_set(&gmap->ref_count, 1);
page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (!page)
goto out_free;
*/
struct gmap *gmap_get(struct gmap *gmap)
{
- atomic_inc(&gmap->ref_count);
+ refcount_inc(&gmap->ref_count);
return gmap;
}
EXPORT_SYMBOL_GPL(gmap_get);
*/
void gmap_put(struct gmap *gmap)
{
- if (atomic_dec_return(&gmap->ref_count) == 0)
+ if (refcount_dec_and_test(&gmap->ref_count))
gmap_free(gmap);
}
EXPORT_SYMBOL_GPL(gmap_put);
continue;
if (!sg->initialized)
return ERR_PTR(-EAGAIN);
- atomic_inc(&sg->ref_count);
+ refcount_inc(&sg->ref_count);
return sg;
}
return NULL;
}
}
}
- atomic_set(&new->ref_count, 2);
+ refcount_set(&new->ref_count, 2);
list_add(&new->list, &parent->children);
if (asce & _ASCE_REAL_SPACE) {
/* nothing to protect, return right away */
return 0;
}
- static inline void zap_zero_pages(struct mm_struct *mm)
- {
- struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
-
- walk.mm = mm;
- walk_page_range(0, TASK_SIZE, &walk);
- }
+ static const struct mm_walk_ops zap_zero_walk_ops = {
+ .pmd_entry = __zap_zero_pages,
+ };
/*
* switch on pgstes for its userspace process (for kvm)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- zap_zero_pages(mm);
+ walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
up_write(&mm->mmap_sem);
return 0;
}
return 0;
}
+ static const struct mm_walk_ops enable_skey_walk_ops = {
+ .hugetlb_entry = __s390_enable_skey_hugetlb,
+ .pte_entry = __s390_enable_skey_pte,
+ };
+
int s390_enable_skey(void)
{
- struct mm_walk walk = {
- .hugetlb_entry = __s390_enable_skey_hugetlb,
- .pte_entry = __s390_enable_skey_pte,
- };
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int rc = 0;
}
mm->def_flags &= ~VM_MERGEABLE;
- walk.mm = mm;
- walk_page_range(0, TASK_SIZE, &walk);
+ walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
out_up:
up_write(&mm->mmap_sem);
return 0;
}
+ static const struct mm_walk_ops reset_cmma_walk_ops = {
+ .pte_entry = __s390_reset_cmma,
+ };
+
void s390_reset_cmma(struct mm_struct *mm)
{
- struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
-
down_write(&mm->mmap_sem);
- walk.mm = mm;
- walk_page_range(0, TASK_SIZE, &walk);
+ walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
#include <linux/pm_runtime.h>
#include <linux/vga_switcheroo.h>
#include <drm/drm_probe_helper.h>
+ #include <linux/mmu_notifier.h>
#include "amdgpu.h"
#include "amdgpu_irq.h"
* - 3.31.0 - Add support for per-flip tiling attribute changes with DC
* - 3.32.0 - Add syncobj timeline support to AMDGPU_CS.
* - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS.
+ * - 3.34.0 - Non-DC can flip correctly between buffers with different pitches
*/
#define KMS_DRIVER_MAJOR 3
-#define KMS_DRIVER_MINOR 33
+#define KMS_DRIVER_MINOR 34
#define KMS_DRIVER_PATCHLEVEL 0
#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256
int amdgpu_mcbp = 0;
int amdgpu_discovery = -1;
int amdgpu_mes = 0;
-int amdgpu_noretry;
+int amdgpu_noretry = 1;
struct amdgpu_mgpu_info mgpu_info = {
.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
module_param_named(mes, amdgpu_mes, int, 0444);
MODULE_PARM_DESC(noretry,
- "Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");
+ "Disable retry faults (0 = retry enabled, 1 = retry disabled (default))");
module_param_named(noretry, amdgpu_noretry, int, 0644);
#ifdef CONFIG_HSA_AMD
/* Raven */
{0x1002, 0x15dd, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU},
{0x1002, 0x15d8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RAVEN|AMD_IS_APU},
+ /* Arcturus */
+ {0x1002, 0x738C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
+ {0x1002, 0x7388, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
+ {0x1002, 0x738E, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
+ {0x1002, 0x7390, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARCTURUS|AMD_EXP_HW_SUPPORT},
/* Navi10 */
{0x1002, 0x7310, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
{0x1002, 0x7312, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
{0x1002, 0x731A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
{0x1002, 0x731B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
{0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
+ /* Navi14 */
+ {0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14},
+
+ /* Renoir */
+ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT},
{0, 0, 0}
};
* unfortunately we can't detect certain
* hypervisors so just do this all the time.
*/
+ adev->mp1_state = PP_MP1_STATE_UNLOAD;
amdgpu_device_ip_suspend(adev);
+ adev->mp1_state = PP_MP1_STATE_NONE;
}
static int amdgpu_pmops_suspend(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
return amdgpu_device_suspend(drm_dev, true, true);
}
static int amdgpu_pmops_resume(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
/* GPU comes up enabled by the bios on resume */
if (amdgpu_device_is_px(drm_dev)) {
static int amdgpu_pmops_freeze(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
return amdgpu_device_suspend(drm_dev, false, true);
}
static int amdgpu_pmops_thaw(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
return amdgpu_device_resume(drm_dev, false, true);
}
static int amdgpu_pmops_poweroff(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
return amdgpu_device_suspend(drm_dev, true, true);
}
static int amdgpu_pmops_restore(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
return amdgpu_device_resume(drm_dev, false, true);
}
static int amdgpu_pmops_runtime_idle(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
struct drm_crtc *crtc;
if (!amdgpu_device_is_px(drm_dev)) {
.driver_features =
DRIVER_USE_AGP | DRIVER_ATOMIC |
DRIVER_GEM |
- DRIVER_PRIME | DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
+ DRIVER_RENDER | DRIVER_MODESET | DRIVER_SYNCOBJ,
.load = amdgpu_driver_load_kms,
.open = amdgpu_driver_open_kms,
.postclose = amdgpu_driver_postclose_kms,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
.gem_prime_export = amdgpu_gem_prime_export,
.gem_prime_import = amdgpu_gem_prime_import,
- .gem_prime_res_obj = amdgpu_gem_prime_res_obj,
.gem_prime_get_sg_table = amdgpu_gem_prime_get_sg_table,
.gem_prime_import_sg_table = amdgpu_gem_prime_import_sg_table,
.gem_prime_vmap = amdgpu_gem_prime_vmap,
amdgpu_unregister_atpx_handler();
amdgpu_sync_fini();
amdgpu_fence_slab_fini();
+ mmu_notifier_synchronize();
}
module_init(amdgpu_init);
if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
continue;
- r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
+ r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
true, false, MAX_SCHEDULE_TIMEOUT);
if (r <= 0)
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
* Block for operations on BOs to finish and mark pages as accessed and
* potentially dirty.
*/
- static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
- const struct hmm_update *update)
+ static int
+ amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
+ const struct mmu_notifier_range *update)
{
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
unsigned long start = update->start;
unsigned long end = update->end;
- bool blockable = update->blockable;
+ bool blockable = mmu_notifier_range_blockable(update);
struct interval_tree_node *it;
/* notification is exclusive, but interval is inclusive */
* necessitates evicting all user-mode queues of the process. The BOs
* are restorted in amdgpu_mn_invalidate_range_end_hsa.
*/
- static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
- const struct hmm_update *update)
+ static int
+ amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
+ const struct mmu_notifier_range *update)
{
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
unsigned long start = update->start;
unsigned long end = update->end;
- bool blockable = update->blockable;
+ bool blockable = mmu_notifier_range_blockable(update);
struct interval_tree_node *it;
/* notification is exclusive, but interval is inclusive */
range->flags = hmm_range_flags;
range->values = hmm_range_values;
range->pfn_shift = PAGE_SHIFT;
- INIT_LIST_HEAD(&range->list);
}
}
if (amdgpu_ttm_tt_get_usermm(bo->ttm))
return -EPERM;
- return drm_vma_node_verify_access(&abo->gem_base.vma_node,
+ return drm_vma_node_verify_access(&abo->tbo.base.vma_node,
filp->private_data);
}
struct amdgpu_copy_mem *src,
struct amdgpu_copy_mem *dst,
uint64_t size,
- struct reservation_object *resv,
+ struct dma_resv *resv,
struct dma_fence **f)
{
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
r = amdgpu_ttm_copy_mem_to_mem(adev, &src, &dst,
new_mem->num_pages << PAGE_SHIFT,
- bo->resv, &fence);
+ bo->base.resv, &fence);
if (r)
goto error;
+ /* clear the space being freed */
+ if (old_mem->mem_type == TTM_PL_VRAM &&
+ (ttm_to_amdgpu_bo(bo)->flags &
+ AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
+ struct dma_fence *wipe_fence = NULL;
+
+ r = amdgpu_fill_buffer(ttm_to_amdgpu_bo(bo), AMDGPU_POISON,
+ NULL, &wipe_fence);
+ if (r) {
+ goto error;
+ } else if (wipe_fence) {
+ dma_fence_put(fence);
+ fence = wipe_fence;
+ }
+ }
+
/* Always block for VM page tables before committing the new location */
if (bo->type == ttm_bo_type_kernel)
r = ttm_bo_move_accel_cleanup(bo, fence, true, new_mem);
struct hmm_range *range;
unsigned long i;
uint64_t *pfns;
- int retry = 0;
int r = 0;
if (!mm) /* Happens during process shutdown */
0 : range->flags[HMM_PFN_WRITE];
range->pfn_flags_mask = 0;
range->pfns = pfns;
- hmm_range_register(range, mirror, start,
- start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT);
+ range->start = start;
+ range->end = start + ttm->num_pages * PAGE_SIZE;
+
+ hmm_range_register(range, mirror);
- retry:
/*
* Just wait for range to be valid, safe to ignore return value as we
* will use the return value of hmm_range_fault() below under the
hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
down_read(&mm->mmap_sem);
-
- r = hmm_range_fault(range, true);
- if (unlikely(r < 0)) {
- if (likely(r == -EAGAIN)) {
- /*
- * return -EAGAIN, mmap_sem is dropped
- */
- if (retry++ < MAX_RETRY_HMM_RANGE_FAULT)
- goto retry;
- else
- pr_err("Retry hmm fault too many times\n");
- }
-
- goto out_up_read;
- }
-
+ r = hmm_range_fault(range, 0);
up_read(&mm->mmap_sem);
+ if (unlikely(r < 0))
+ goto out_free_pfns;
+
for (i = 0; i < ttm->num_pages; i++) {
pages[i] = hmm_device_entry_to_page(range, pfns[i]);
if (unlikely(!pages[i])) {
return 0;
- out_up_read:
- if (likely(r != -EAGAIN))
- up_read(&mm->mmap_sem);
out_free_pfns:
hmm_range_unregister(range);
kvfree(pfns);
{
unsigned long num_pages = bo->mem.num_pages;
struct drm_mm_node *node = bo->mem.mm_node;
- struct reservation_object_list *flist;
+ struct dma_resv_list *flist;
struct dma_fence *f;
int i;
* cleanly handle page faults.
*/
if (bo->type == ttm_bo_type_kernel &&
- !reservation_object_test_signaled_rcu(bo->resv, true))
+ !dma_resv_test_signaled_rcu(bo->base.resv, true))
return false;
/* If bo is a KFD BO, check if the bo belongs to the current process.
* If true, then return false as any KFD process needs all its BOs to
* be resident to run successfully
*/
- flist = reservation_object_get_list(bo->resv);
+ flist = dma_resv_get_list(bo->base.resv);
if (flist) {
for (i = 0; i < flist->shared_count; ++i) {
f = rcu_dereference_protected(flist->shared[i],
- reservation_object_held(bo->resv));
+ dma_resv_held(bo->base.resv));
if (amdkfd_fence_check_mm(f, current->mm))
return false;
}
.move = &amdgpu_bo_move,
.verify_access = &amdgpu_verify_access,
.move_notify = &amdgpu_bo_move_notify,
+ .release_notify = &amdgpu_bo_release_notify,
.fault_reserve_notify = &amdgpu_bo_fault_reserve_notify,
.io_mem_reserve = &amdgpu_ttm_io_mem_reserve,
.io_mem_free = &amdgpu_ttm_io_mem_free,
uint64_t gtt_size;
int r;
u64 vis_vram_limit;
+ void *stolen_vga_buf;
mutex_init(&adev->mman.gtt_window_lock);
r = ttm_bo_device_init(&adev->mman.bdev,
&amdgpu_bo_driver,
adev->ddev->anon_inode->i_mapping,
- adev->need_dma32);
+ dma_addressing_limited(adev->dev));
if (r) {
DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
return r;
r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_VRAM,
&adev->stolen_vga_memory,
- NULL, NULL);
+ NULL, &stolen_vga_buf);
if (r)
return r;
DRM_INFO("amdgpu: %uM of VRAM memory ready\n",
*/
void amdgpu_ttm_late_init(struct amdgpu_device *adev)
{
+ void *stolen_vga_buf;
/* return the VGA stolen memory (if any) back to VRAM */
- amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
+ amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
}
/**
int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
uint64_t dst_offset, uint32_t byte_count,
- struct reservation_object *resv,
+ struct dma_resv *resv,
struct dma_fence **fence, bool direct_submit,
bool vm_needs_flush)
{
int amdgpu_fill_buffer(struct amdgpu_bo *bo,
uint32_t src_data,
- struct reservation_object *resv,
+ struct dma_resv *resv,
struct dma_fence **fence)
{
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct kfd_device_info {
enum amd_asic_type asic_family;
+ const char *asic_name;
const struct kfd_event_interrupt_class *event_interrupt_class;
unsigned int max_pasid_bits;
unsigned int max_no_of_hqd;
/* We want to receive a notification when the mm_struct is destroyed */
struct mmu_notifier mmu_notifier;
- /* Use for delayed freeing of kfd_process structure */
- struct rcu_head rcu;
-
unsigned int pasid;
unsigned int doorbell_index;
static struct kfd_process *find_process(const struct task_struct *thread);
static void kfd_process_ref_release(struct kref *ref);
- static struct kfd_process *create_process(const struct task_struct *thread,
- struct file *filep);
+ static struct kfd_process *create_process(const struct task_struct *thread);
+ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);
if (process) {
pr_debug("Process already found\n");
} else {
- process = create_process(thread, filep);
+ process = create_process(thread);
+ if (IS_ERR(process))
+ goto out;
+
+ ret = kfd_process_init_cwsr_apu(process, filep);
+ if (ret) {
+ process = ERR_PTR(ret);
+ goto out;
+ }
if (!procfs.kobj)
goto out;
queue_work(kfd_process_wq, &p->release_work);
}
- static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+ static void kfd_process_free_notifier(struct mmu_notifier *mn)
{
- struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
-
- kfd_unref_process(p);
+ kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
}
static void kfd_process_notifier_release(struct mmu_notifier *mn,
mutex_unlock(&p->mutex);
- mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
- mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
+ mmu_notifier_put(&p->mmu_notifier);
}
static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
.release = kfd_process_notifier_release,
+ .free_notifier = kfd_process_free_notifier,
};
static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
return 0;
}
- static struct kfd_process *create_process(const struct task_struct *thread,
- struct file *filep)
+ /*
+ * On return the kfd_process is fully operational and will be freed when the
+ * mm is released
+ */
+ static struct kfd_process *create_process(const struct task_struct *thread)
{
struct kfd_process *process;
int err = -ENOMEM;
process = kzalloc(sizeof(*process), GFP_KERNEL);
-
if (!process)
goto err_alloc_process;
- process->pasid = kfd_pasid_alloc();
- if (process->pasid == 0)
- goto err_alloc_pasid;
-
- if (kfd_alloc_process_doorbells(process) < 0)
- goto err_alloc_doorbells;
-
kref_init(&process->ref);
-
mutex_init(&process->mutex);
-
process->mm = thread->mm;
-
- /* register notifier */
- process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
- err = mmu_notifier_register(&process->mmu_notifier, process->mm);
- if (err)
- goto err_mmu_notifier;
-
- hash_add_rcu(kfd_processes_table, &process->kfd_processes,
- (uintptr_t)process->mm);
-
process->lead_thread = thread->group_leader;
- get_task_struct(process->lead_thread);
-
INIT_LIST_HEAD(&process->per_device_data);
-
+ INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
+ INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
+ process->last_restore_timestamp = get_jiffies_64();
kfd_event_init_process(process);
+ process->is_32bit_user_mode = in_compat_syscall();
+
+ process->pasid = kfd_pasid_alloc();
+ if (process->pasid == 0)
+ goto err_alloc_pasid;
+
+ if (kfd_alloc_process_doorbells(process) < 0)
+ goto err_alloc_doorbells;
err = pqm_init(&process->pqm, process);
if (err != 0)
goto err_process_pqm_init;
/* init process apertures*/
- process->is_32bit_user_mode = in_compat_syscall();
err = kfd_init_apertures(process);
if (err != 0)
goto err_init_apertures;
- INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
- INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
- process->last_restore_timestamp = get_jiffies_64();
-
- err = kfd_process_init_cwsr_apu(process, filep);
+ /* Must be last, have to use release destruction after this */
+ process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
+ err = mmu_notifier_register(&process->mmu_notifier, process->mm);
if (err)
- goto err_init_cwsr;
+ goto err_register_notifier;
+
+ get_task_struct(process->lead_thread);
+ hash_add_rcu(kfd_processes_table, &process->kfd_processes,
+ (uintptr_t)process->mm);
return process;
- err_init_cwsr:
+ err_register_notifier:
kfd_process_free_outstanding_kfd_bos(process);
kfd_process_destroy_pdds(process);
err_init_apertures:
pqm_uninit(&process->pqm);
err_process_pqm_init:
- hash_del_rcu(&process->kfd_processes);
- synchronize_rcu();
- mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
- err_mmu_notifier:
- mutex_destroy(&process->mutex);
kfd_free_process_doorbells(process);
err_alloc_doorbells:
kfd_pasid_free(process->pasid);
err_alloc_pasid:
+ mutex_destroy(&process->mutex);
kfree(process);
err_alloc_process:
return ERR_PTR(err);
return ret;
}
+ amdgpu_vm_set_task_info(pdd->vm);
+
ret = kfd_process_device_reserve_ib_mem(pdd);
if (ret)
goto err_reserve_ib_mem;
{
struct delayed_work *dwork;
struct kfd_process *p;
- struct kfd_process_device *pdd;
int ret = 0;
dwork = to_delayed_work(work);
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, restore_work);
-
- /* Call restore_process_bos on the first KGD device. This function
- * takes care of restoring the whole process including other devices.
- * Restore can fail if enough memory is not available. If so,
- * reschedule again.
- */
- pdd = list_first_entry(&p->per_device_data,
- struct kfd_process_device,
- per_device_list);
-
pr_debug("Started restoring pasid %d\n", p->pasid);
/* Setting last_restore_timestamp before successful restoration.
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/vga_switcheroo.h>
+ #include <linux/mmu_notifier.h>
-#include <drm/drmP.h>
#include <drm/drm_crtc_helper.h>
+#include <drm/drm_ioctl.h>
+#include <drm/drm_vblank.h>
#include <core/gpuobj.h>
#include <core/option.h>
static const struct drm_ioctl_desc
nouveau_ioctls[] = {
- DRM_IOCTL_DEF_DRV(NOUVEAU_GETPARAM, nouveau_abi16_ioctl_getparam, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_SETPARAM, nouveau_abi16_ioctl_setparam, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_ALLOC, nouveau_abi16_ioctl_channel_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_FREE, nouveau_abi16_ioctl_channel_free, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_AUTH|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_AUTH|DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GETPARAM, nouveau_abi16_ioctl_getparam, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_SETPARAM, drm_invalid_op, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_ALLOC, nouveau_abi16_ioctl_channel_alloc, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_CHANNEL_FREE, nouveau_abi16_ioctl_channel_free, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GROBJ_ALLOC, nouveau_abi16_ioctl_grobj_alloc, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_NOTIFIEROBJ_ALLOC, nouveau_abi16_ioctl_notifierobj_alloc, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GPUOBJ_FREE, nouveau_abi16_ioctl_gpuobj_free, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_INIT, nouveau_svmm_init, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_SVM_BIND, nouveau_svmm_bind, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_NEW, nouveau_gem_ioctl_new, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_PUSHBUF, nouveau_gem_ioctl_pushbuf, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_RENDER_ALLOW),
};
long
static struct drm_driver
driver_stub = {
.driver_features =
- DRIVER_GEM | DRIVER_MODESET | DRIVER_PRIME | DRIVER_RENDER
+ DRIVER_GEM | DRIVER_MODESET | DRIVER_RENDER
#if defined(CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT)
| DRIVER_KMS_LEGACY_CONTEXT
#endif
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
- .gem_prime_export = drm_gem_prime_export,
- .gem_prime_import = drm_gem_prime_import,
.gem_prime_pin = nouveau_gem_prime_pin,
- .gem_prime_res_obj = nouveau_gem_prime_res_obj,
.gem_prime_unpin = nouveau_gem_prime_unpin,
.gem_prime_get_sg_table = nouveau_gem_prime_get_sg_table,
.gem_prime_import_sg_table = nouveau_gem_prime_import_sg_table,
#ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER
platform_driver_unregister(&nouveau_platform_driver);
#endif
+ if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM))
+ mmu_notifier_synchronize();
}
module_init(nouveau_drm_init);
struct list_head va;
/* Constant after initialization */
struct radeon_device *rdev;
- struct drm_gem_object gem_base;
struct ttm_bo_kmap_obj dma_buf_vmap;
pid_t pid;
struct radeon_mn *mn;
struct list_head mn_list;
};
-#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, gem_base)
+#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, tbo.base)
int radeon_gem_debugfs_init(struct radeon_device *rdev);
struct radeon_fence *fence);
int radeon_sync_resv(struct radeon_device *rdev,
struct radeon_sync *sync,
- struct reservation_object *resv,
+ struct dma_resv *resv,
bool shared);
int radeon_sync_rings(struct radeon_device *rdev,
struct radeon_sync *sync,
uint64_t src_offset,
uint64_t dst_offset,
unsigned num_gpu_pages,
- struct reservation_object *resv);
+ struct dma_resv *resv);
u32 blit_ring_index;
struct radeon_fence *(*dma)(struct radeon_device *rdev,
uint64_t src_offset,
uint64_t dst_offset,
unsigned num_gpu_pages,
- struct reservation_object *resv);
+ struct dma_resv *resv);
u32 dma_ring_index;
/* method used for bo copy */
struct radeon_fence *(*copy)(struct radeon_device *rdev,
uint64_t src_offset,
uint64_t dst_offset,
unsigned num_gpu_pages,
- struct reservation_object *resv);
+ struct dma_resv *resv);
/* ring used for bo copies */
u32 copy_ring_index;
} copy;
struct radeon_wb wb;
struct radeon_dummy_page dummy_page;
bool shutdown;
- bool need_dma32;
bool need_swiotlb;
bool accel_working;
bool fastfb_working; /* IGP feature*/
/* tracking pinned memory */
u64 vram_pin_size;
u64 gart_pin_size;
-
- struct mutex mn_lock;
- DECLARE_HASHTABLE(mn_hash, 7);
};
bool radeon_is_px(struct drm_device *dev);
init_rwsem(&rdev->pm.mclk_lock);
init_rwsem(&rdev->exclusive_lock);
init_waitqueue_head(&rdev->irq.vblank_queue);
- mutex_init(&rdev->mn_lock);
- hash_init(rdev->mn_hash);
r = radeon_gem_init(rdev);
if (r)
return r;
else
rdev->mc.mc_mask = 0xffffffffULL; /* 32 bit MC */
- /* set DMA mask + need_dma32 flags.
+ /* set DMA mask.
* PCIE - can handle 40-bits.
* IGP - can handle 40-bits
* AGP - generally dma32 is safest
* PCI - dma32 for legacy pci gart, 40 bits on newer asics
*/
- rdev->need_dma32 = false;
+ dma_bits = 40;
if (rdev->flags & RADEON_IS_AGP)
- rdev->need_dma32 = true;
+ dma_bits = 32;
if ((rdev->flags & RADEON_IS_PCI) &&
(rdev->family <= CHIP_RS740))
- rdev->need_dma32 = true;
+ dma_bits = 32;
#ifdef CONFIG_PPC64
if (rdev->family == CHIP_CEDAR)
- rdev->need_dma32 = true;
+ dma_bits = 32;
#endif
- dma_bits = rdev->need_dma32 ? 32 : 40;
- r = pci_set_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
+ r = dma_set_mask_and_coherent(&rdev->pdev->dev, DMA_BIT_MASK(dma_bits));
if (r) {
- rdev->need_dma32 = true;
- dma_bits = 32;
pr_warn("radeon: No suitable DMA available\n");
- }
- r = pci_set_consistent_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits));
- if (r) {
- pci_set_consistent_dma_mask(rdev->pdev, DMA_BIT_MASK(32));
- pr_warn("radeon: No coherent DMA available\n");
+ return r;
}
rdev->need_swiotlb = drm_need_swiotlb(dma_bits);
#include <linux/module.h>
#include <linux/pm_runtime.h>
#include <linux/vga_switcheroo.h>
+ #include <linux/mmu_notifier.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_drv.h>
struct drm_file *file_priv);
void radeon_gem_object_close(struct drm_gem_object *obj,
struct drm_file *file_priv);
-struct dma_buf *radeon_gem_prime_export(struct drm_device *dev,
- struct drm_gem_object *gobj,
+struct dma_buf *radeon_gem_prime_export(struct drm_gem_object *gobj,
int flags);
extern int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int crtc,
unsigned int flags, int *vpos, int *hpos,
struct sg_table *sg);
int radeon_gem_prime_pin(struct drm_gem_object *obj);
void radeon_gem_prime_unpin(struct drm_gem_object *obj);
-struct reservation_object *radeon_gem_prime_res_obj(struct drm_gem_object *);
void *radeon_gem_prime_vmap(struct drm_gem_object *obj);
void radeon_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr);
static void
radeon_pci_shutdown(struct pci_dev *pdev)
{
+ struct drm_device *ddev = pci_get_drvdata(pdev);
+
/* if we are running in a VM, make sure the device
* torn down properly on reboot/shutdown
*/
if (radeon_device_is_virtual())
radeon_pci_remove(pdev);
+
+ /* Some adapters need to be suspended before a
+ * shutdown occurs in order to prevent an error
+ * during kexec.
+ */
+ radeon_suspend_kms(ddev, true, true, false);
}
static int radeon_pmops_suspend(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
return radeon_suspend_kms(drm_dev, true, true, false);
}
static int radeon_pmops_resume(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
/* GPU comes up enabled by the bios on resume */
if (radeon_is_px(drm_dev)) {
static int radeon_pmops_freeze(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
return radeon_suspend_kms(drm_dev, false, true, true);
}
static int radeon_pmops_thaw(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
return radeon_resume_kms(drm_dev, false, true);
}
static int radeon_pmops_runtime_idle(struct device *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct drm_device *drm_dev = pci_get_drvdata(pdev);
+ struct drm_device *drm_dev = dev_get_drvdata(dev);
struct drm_crtc *crtc;
if (!radeon_is_px(drm_dev)) {
static struct drm_driver kms_driver = {
.driver_features =
- DRIVER_USE_AGP | DRIVER_GEM | DRIVER_PRIME | DRIVER_RENDER,
+ DRIVER_USE_AGP | DRIVER_GEM | DRIVER_RENDER,
.load = radeon_driver_load_kms,
.open = radeon_driver_open_kms,
.postclose = radeon_driver_postclose_kms,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
.gem_prime_export = radeon_gem_prime_export,
- .gem_prime_import = drm_gem_prime_import,
.gem_prime_pin = radeon_gem_prime_pin,
.gem_prime_unpin = radeon_gem_prime_unpin,
- .gem_prime_res_obj = radeon_gem_prime_res_obj,
.gem_prime_get_sg_table = radeon_gem_prime_get_sg_table,
.gem_prime_import_sg_table = radeon_gem_prime_import_sg_table,
.gem_prime_vmap = radeon_gem_prime_vmap,
{
pci_unregister_driver(pdriver);
radeon_unregister_atpx_handler();
+ mmu_notifier_synchronize();
}
module_init(radeon_init);
#include "radeon.h"
struct radeon_mn {
- /* constant after initialisation */
- struct radeon_device *rdev;
- struct mm_struct *mm;
struct mmu_notifier mn;
- /* only used on destruction */
- struct work_struct work;
-
- /* protected by rdev->mn_lock */
- struct hlist_node node;
-
/* objects protected by lock */
struct mutex lock;
struct rb_root_cached objects;
struct list_head bos;
};
- /**
- * radeon_mn_destroy - destroy the rmn
- *
- * @work: previously sheduled work item
- *
- * Lazy destroys the notifier from a work item
- */
- static void radeon_mn_destroy(struct work_struct *work)
- {
- struct radeon_mn *rmn = container_of(work, struct radeon_mn, work);
- struct radeon_device *rdev = rmn->rdev;
- struct radeon_mn_node *node, *next_node;
- struct radeon_bo *bo, *next_bo;
-
- mutex_lock(&rdev->mn_lock);
- mutex_lock(&rmn->lock);
- hash_del(&rmn->node);
- rbtree_postorder_for_each_entry_safe(node, next_node,
- &rmn->objects.rb_root, it.rb) {
-
- interval_tree_remove(&node->it, &rmn->objects);
- list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
- bo->mn = NULL;
- list_del_init(&bo->mn_list);
- }
- kfree(node);
- }
- mutex_unlock(&rmn->lock);
- mutex_unlock(&rdev->mn_lock);
- mmu_notifier_unregister(&rmn->mn, rmn->mm);
- kfree(rmn);
- }
-
- /**
- * radeon_mn_release - callback to notify about mm destruction
- *
- * @mn: our notifier
- * @mn: the mm this callback is about
- *
- * Shedule a work item to lazy destroy our notifier.
- */
- static void radeon_mn_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
- {
- struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
- INIT_WORK(&rmn->work, radeon_mn_destroy);
- schedule_work(&rmn->work);
- }
-
/**
* radeon_mn_invalidate_range_start - callback to notify about mm change
*
continue;
}
- r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
+ r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
true, false, MAX_SCHEDULE_TIMEOUT);
if (r <= 0)
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
return ret;
}
- static const struct mmu_notifier_ops radeon_mn_ops = {
- .release = radeon_mn_release,
- .invalidate_range_start = radeon_mn_invalidate_range_start,
- };
+ static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+ {
+ struct mmu_notifier_range range = {
+ .mm = mm,
+ .start = 0,
+ .end = ULONG_MAX,
+ .flags = 0,
+ .event = MMU_NOTIFY_UNMAP,
+ };
+
+ radeon_mn_invalidate_range_start(mn, &range);
+ }
- /**
- * radeon_mn_get - create notifier context
- *
- * @rdev: radeon device pointer
- *
- * Creates a notifier context for current->mm.
- */
- static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
+ static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
struct radeon_mn *rmn;
- int r;
-
- if (down_write_killable(&mm->mmap_sem))
- return ERR_PTR(-EINTR);
-
- mutex_lock(&rdev->mn_lock);
-
- hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm)
- if (rmn->mm == mm)
- goto release_locks;
rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
- if (!rmn) {
- rmn = ERR_PTR(-ENOMEM);
- goto release_locks;
- }
+ if (!rmn)
+ return ERR_PTR(-ENOMEM);
- rmn->rdev = rdev;
- rmn->mm = mm;
- rmn->mn.ops = &radeon_mn_ops;
mutex_init(&rmn->lock);
rmn->objects = RB_ROOT_CACHED;
-
- r = __mmu_notifier_register(&rmn->mn, mm);
- if (r)
- goto free_rmn;
-
- hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm);
-
- release_locks:
- mutex_unlock(&rdev->mn_lock);
- up_write(&mm->mmap_sem);
-
- return rmn;
-
- free_rmn:
- mutex_unlock(&rdev->mn_lock);
- up_write(&mm->mmap_sem);
- kfree(rmn);
+ return &rmn->mn;
+ }
- return ERR_PTR(r);
+ static void radeon_mn_free_notifier(struct mmu_notifier *mn)
+ {
+ kfree(container_of(mn, struct radeon_mn, mn));
}
+ static const struct mmu_notifier_ops radeon_mn_ops = {
+ .release = radeon_mn_release,
+ .invalidate_range_start = radeon_mn_invalidate_range_start,
+ .alloc_notifier = radeon_mn_alloc_notifier,
+ .free_notifier = radeon_mn_free_notifier,
+ };
+
/**
* radeon_mn_register - register a BO for notifier updates
*
int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
{
unsigned long end = addr + radeon_bo_size(bo) - 1;
- struct radeon_device *rdev = bo->rdev;
+ struct mmu_notifier *mn;
struct radeon_mn *rmn;
struct radeon_mn_node *node = NULL;
struct list_head bos;
struct interval_tree_node *it;
- rmn = radeon_mn_get(rdev);
- if (IS_ERR(rmn))
- return PTR_ERR(rmn);
+ mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
+ if (IS_ERR(mn))
+ return PTR_ERR(mn);
+ rmn = container_of(mn, struct radeon_mn, mn);
INIT_LIST_HEAD(&bos);
*/
void radeon_mn_unregister(struct radeon_bo *bo)
{
- struct radeon_device *rdev = bo->rdev;
- struct radeon_mn *rmn;
+ struct radeon_mn *rmn = bo->mn;
struct list_head *head;
- mutex_lock(&rdev->mn_lock);
- rmn = bo->mn;
- if (rmn == NULL) {
- mutex_unlock(&rdev->mn_lock);
+ if (!rmn)
return;
- }
mutex_lock(&rmn->lock);
/* save the next list entry for later */
head = bo->mn_list.next;
- bo->mn = NULL;
list_del(&bo->mn_list);
if (list_empty(head)) {
}
mutex_unlock(&rmn->lock);
- mutex_unlock(&rdev->mn_lock);
+
+ mmu_notifier_put(&rmn->mn);
+ bo->mn = NULL;
}
props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- if (MLX5_CAP_GEN(mdev, pg))
+ if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps;
}
if (err)
goto out_sys_pages;
- if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
- context->ibucontext.invalidate_range =
- &mlx5_ib_invalidate_range;
-
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
err = mlx5_ib_devx_create(dev, true);
if (err < 0)
struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
struct mlx5_bfreg_info *bfregi;
- /* All umem's must be destroyed before destroying the ucontext. */
- mutex_lock(&ibcontext->per_mm_list_lock);
- WARN_ON(!list_empty(&ibcontext->per_mm_list));
- mutex_unlock(&ibcontext->per_mm_list_lock);
-
bfregi = &context->bfregi;
mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
return -EOPNOTSUPP;
break;
case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
if (!capable(CAP_SYS_RAWIO) ||
!capable(CAP_NET_RAW))
return -EPERM;
struct uverbs_attr_bundle *attrs,
int type)
{
- struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
+ struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
u64 act_size;
int err;
/* Allocation size must a multiple of the basic block size
* and a power of 2.
*/
- act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev));
+ act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
act_size = roundup_pow_of_two(act_size);
dm->size = act_size;
- err = mlx5_cmd_alloc_sw_icm(dm_db, type, act_size,
- to_mucontext(ctx)->devx_uid, &dm->dev_addr,
- &dm->icm_dm.obj_id);
+ err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
+ to_mucontext(ctx)->devx_uid, &dm->dev_addr,
+ &dm->icm_dm.obj_id);
if (err)
return err;
MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
&dm->dev_addr, sizeof(dm->dev_addr));
if (err)
- mlx5_cmd_dealloc_sw_icm(dm_db, type, dm->size,
- to_mucontext(ctx)->devx_uid,
- dm->dev_addr, dm->icm_dm.obj_id);
+ mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
+ to_mucontext(ctx)->devx_uid, dm->dev_addr,
+ dm->icm_dm.obj_id);
return err;
}
attrs);
break;
case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ err = handle_alloc_dm_sw_icm(context, dm,
+ attr, attrs,
+ MLX5_SW_ICM_TYPE_STEERING);
+ break;
case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- err = handle_alloc_dm_sw_icm(context, dm, attr, attrs, type);
+ err = handle_alloc_dm_sw_icm(context, dm,
+ attr, attrs,
+ MLX5_SW_ICM_TYPE_HEADER_MODIFY);
break;
default:
err = -EOPNOTSUPP;
{
struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+ struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
struct mlx5_ib_dm *dm = to_mdm(ibdm);
u32 page_idx;
if (ret)
return ret;
- page_idx = (dm->dev_addr -
- pci_resource_start(dm_db->dev->pdev, 0) -
- MLX5_CAP64_DEV_MEM(dm_db->dev,
- memic_bar_start_addr)) >>
- PAGE_SHIFT;
+ page_idx = (dm->dev_addr - pci_resource_start(dev->pdev, 0) -
+ MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr)) >>
+ PAGE_SHIFT;
bitmap_clear(ctx->dm_pages, page_idx,
DIV_ROUND_UP(dm->size, PAGE_SIZE));
break;
case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
+ dm->size, ctx->devx_uid, dm->dev_addr,
+ dm->icm_dm.obj_id);
+ if (ret)
+ return ret;
+ break;
case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- ret = mlx5_cmd_dealloc_sw_icm(dm_db, dm->type, dm->size,
- ctx->devx_uid, dm->dev_addr,
- dm->icm_dm.obj_id);
+ ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
+ dm->size, ctx->devx_uid, dm->dev_addr,
+ dm->icm_dm.obj_id);
if (ret)
return ret;
break;
if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
return -EINVAL;
action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
- action->modify_id = maction->flow_action_raw.action_id;
+ action->modify_hdr =
+ maction->flow_action_raw.modify_hdr;
return 0;
}
if (maction->flow_action_raw.sub_type ==
return -EINVAL;
action->action |=
MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
- action->reformat_id =
- maction->flow_action_raw.action_id;
+ action->pkt_reformat =
+ maction->flow_action_raw.pkt_reformat;
return 0;
}
/* fall through */
static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
{
- struct mlx5_core_dev *mdev = dev->mdev;
-
mlx5_ib_cleanup_multiport_master(dev);
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
srcu_barrier(&dev->mr_srcu);
}
WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
-
- WARN_ON(dev->dm.steering_sw_icm_alloc_blocks &&
- !bitmap_empty(
- dev->dm.steering_sw_icm_alloc_blocks,
- BIT(MLX5_CAP_DEV_MEM(mdev, log_steering_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
-
- kfree(dev->dm.steering_sw_icm_alloc_blocks);
-
- WARN_ON(dev->dm.header_modify_sw_icm_alloc_blocks &&
- !bitmap_empty(dev->dm.header_modify_sw_icm_alloc_blocks,
- BIT(MLX5_CAP_DEV_MEM(
- mdev, log_header_modify_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
-
- kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
}
static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
- u64 header_modify_icm_blocks = 0;
- u64 steering_icm_blocks = 0;
int err;
int i;
dev->port[i].roce.last_port_state = IB_PORT_DOWN;
}
+ mlx5_ib_internal_fill_odp_caps(dev);
+
err = mlx5_ib_init_multiport_master(dev);
if (err)
return err;
INIT_LIST_HEAD(&dev->qp_list);
spin_lock_init(&dev->reset_flow_resource_lock);
- if (MLX5_CAP_GEN_64(mdev, general_obj_types) &
- MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) {
- if (MLX5_CAP64_DEV_MEM(mdev, steering_sw_icm_start_address)) {
- steering_icm_blocks =
- BIT(MLX5_CAP_DEV_MEM(mdev,
- log_steering_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
-
- dev->dm.steering_sw_icm_alloc_blocks =
- kcalloc(BITS_TO_LONGS(steering_icm_blocks),
- sizeof(unsigned long), GFP_KERNEL);
- if (!dev->dm.steering_sw_icm_alloc_blocks)
- goto err_mp;
- }
-
- if (MLX5_CAP64_DEV_MEM(mdev,
- header_modify_sw_icm_start_address)) {
- header_modify_icm_blocks = BIT(
- MLX5_CAP_DEV_MEM(
- mdev, log_header_modify_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
-
- dev->dm.header_modify_sw_icm_alloc_blocks =
- kcalloc(BITS_TO_LONGS(header_modify_icm_blocks),
- sizeof(unsigned long), GFP_KERNEL);
- if (!dev->dm.header_modify_sw_icm_alloc_blocks)
- goto err_dm;
- }
- }
-
spin_lock_init(&dev->dm.lock);
dev->dm.dev = mdev;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
err = init_srcu_struct(&dev->mr_srcu);
if (err)
- goto err_dm;
+ goto err_mp;
}
return 0;
-err_dm:
- kfree(dev->dm.steering_sw_icm_alloc_blocks);
- kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
-
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
{
- mlx5_ib_internal_fill_odp_caps(dev);
-
return mlx5_ib_odp_init_one(dev);
}
int *ncont, int *order)
{
struct ib_umem *u;
- int err;
*umem = NULL;
- u = ib_umem_get(udata, start, length, access_flags, 0);
- err = PTR_ERR_OR_ZERO(u);
- if (err) {
- mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
- return err;
+ if (access_flags & IB_ACCESS_ON_DEMAND) {
+ struct ib_umem_odp *odp;
+
+ odp = ib_umem_odp_get(udata, start, length, access_flags);
+ if (IS_ERR(odp)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
+ PTR_ERR(odp));
+ return PTR_ERR(odp);
+ }
+
+ u = &odp->umem;
+
+ *page_shift = odp->page_shift;
+ *ncont = ib_umem_odp_num_pages(odp);
+ *npages = *ncont << (*page_shift - PAGE_SHIFT);
+ if (order)
+ *order = ilog2(roundup_pow_of_two(*ncont));
+ } else {
+ u = ib_umem_get(udata, start, length, access_flags, 0);
+ if (IS_ERR(u)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
+ return PTR_ERR(u);
+ }
+
+ mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
+ page_shift, ncont, order);
}
- mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
- page_shift, ncont, order);
if (!*npages) {
mlx5_ib_warn(dev, "avoid zero region\n");
ib_umem_release(u);
if (err < 0)
return ERR_PTR(err);
- use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) &&
- (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) ||
- !MLX5_CAP_GEN(dev->mdev, atomic));
+ use_umr = mlx5_ib_can_use_umr(dev, true);
if (order <= mr_cache_max_order(dev) && use_umr) {
mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
goto err;
}
- if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
+ if (!mlx5_ib_can_use_umr(dev, true) ||
+ (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
/*
* UMR can't be used - MKey needs to be replaced.
*/
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
- if (umem_odp->page_list)
+ if (!umem_odp->is_implicit_odp)
mlx5_ib_invalidate_range(umem_odp,
ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
* so that there will not be any invalidations in
* flight, looking at the *mr struct.
*/
- ib_umem_release(umem);
+ ib_umem_odp_release(umem_odp);
atomic_sub(npages, &dev->mdev->priv.reg_pages);
/* Avoid double-freeing the umem. */
for (i = 0; i < nentries; i++, pklm++) {
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
va = (offset + i) * MLX5_IMR_MTT_SIZE;
- if (odp && odp->umem.address == va) {
+ if (odp && ib_umem_start(odp) == va) {
struct mlx5_ib_mr *mtt = odp->private;
pklm->key = cpu_to_be32(mtt->ibmr.lkey);
mr->parent = NULL;
synchronize_srcu(&mr->dev->mr_srcu);
- ib_umem_release(&odp->umem);
+ ib_umem_odp_release(odp);
if (imr->live)
mlx5_ib_update_xlt(imr, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
memset(caps, 0, sizeof(*caps));
- if (!MLX5_CAP_GEN(dev->mdev, pg))
+ if (!MLX5_CAP_GEN(dev->mdev, pg) ||
+ !mlx5_ib_can_use_umr(dev, true))
return;
caps->general_caps = IB_ODP_SUPPORT;
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
- MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+ MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+ !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
return;
}
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
- struct ib_umem *umem,
+ struct ib_umem_odp *umem_odp,
bool ksm, int access_flags)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
mr->dev = dev;
mr->access_flags = access_flags;
mr->mmkey.iova = 0;
- mr->umem = umem;
+ mr->umem = &umem_odp->umem;
if (ksm) {
err = mlx5_ib_update_xlt(mr, 0,
if (nentries)
nentries++;
} else {
- odp = ib_alloc_odp_umem(odp_mr, addr,
- MLX5_IMR_MTT_SIZE);
+ odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
if (IS_ERR(odp)) {
mutex_unlock(&odp_mr->umem_mutex);
return ERR_CAST(odp);
}
- mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
+ mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
mr->access_flags);
if (IS_ERR(mtt)) {
mutex_unlock(&odp_mr->umem_mutex);
- ib_umem_release(&odp->umem);
+ ib_umem_odp_release(odp);
return ERR_CAST(mtt);
}
addr += MLX5_IMR_MTT_SIZE;
if (unlikely(addr < io_virt + bcnt)) {
odp = odp_next(odp);
- if (odp && odp->umem.address != addr)
+ if (odp && ib_umem_start(odp) != addr)
odp = NULL;
goto next_mr;
}
int access_flags)
{
struct mlx5_ib_mr *imr;
- struct ib_umem *umem;
+ struct ib_umem_odp *umem_odp;
- umem = ib_umem_get(udata, 0, 0, access_flags, 0);
- if (IS_ERR(umem))
- return ERR_CAST(umem);
+ umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+ if (IS_ERR(umem_odp))
+ return ERR_CAST(umem_odp);
- imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+ imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
if (IS_ERR(imr)) {
- ib_umem_release(umem);
+ ib_umem_odp_release(umem_odp);
return ERR_CAST(imr);
}
- imr->umem = umem;
+ imr->umem = &umem_odp->umem;
init_waitqueue_head(&imr->q_leaf_free);
atomic_set(&imr->num_leaf_free, 0);
atomic_set(&imr->num_pending_prefetch, 0);
return imr;
}
- static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
- void *cookie)
+ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
- struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
-
- if (mr->parent != imr)
- return 0;
-
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
+ struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+ struct rb_node *node;
- if (umem_odp->dying)
- return 0;
+ down_read(&per_mm->umem_rwsem);
+ for (node = rb_first_cached(&per_mm->umem_tree); node;
+ node = rb_next(node)) {
+ struct ib_umem_odp *umem_odp =
+ rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+ struct mlx5_ib_mr *mr = umem_odp->private;
- WRITE_ONCE(umem_odp->dying, 1);
- atomic_inc(&imr->num_leaf_free);
- schedule_work(&umem_odp->work);
+ if (mr->parent != imr)
+ continue;
- return 0;
- }
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
- void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
- {
- struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+ if (umem_odp->dying)
+ continue;
- down_read(&per_mm->umem_rwsem);
- rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
- mr_leaf_free, true, imr);
+ WRITE_ONCE(umem_odp->dying, 1);
+ atomic_inc(&imr->num_leaf_free);
+ schedule_work(&umem_odp->work);
+ }
up_read(&per_mm->umem_rwsem);
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
struct ib_umem_odp *odp;
size_t size;
- if (!odp_mr->page_list) {
+ if (odp_mr->is_implicit_odp) {
odp = implicit_mr_get_data(mr, io_virt, bcnt);
if (IS_ERR(odp))
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
- if (prefetch && !downgrade && !mr->umem->writable) {
+ if (prefetch && !downgrade && !odp->umem.writable) {
/* prefetch with write-access must
* be supported by the MR
*/
goto out;
}
- if (mr->umem->writable && !downgrade)
+ if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
current_seq = READ_ONCE(odp->notifiers_seq);
*/
smp_rmb();
- ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
- access_mask, current_seq);
+ ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
+ current_seq);
if (ret < 0)
goto out;
np = ret;
mutex_lock(&odp->umem_mutex);
- if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
- current_seq)) {
+ if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
io_virt += size;
next = odp_next(odp);
- if (unlikely(!next || next->umem.address != io_virt)) {
+ if (unlikely(!next || ib_umem_start(next) != io_virt)) {
mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
io_virt, next);
return -EAGAIN;
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
.advise_mr = mlx5_ib_advise_mr,
+ .invalidate_range = mlx5_ib_invalidate_range,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
int ret = 0;
- if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
- ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+ return ret;
+
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
}
}
- if (!MLX5_CAP_GEN(dev->mdev, pg))
- return ret;
-
ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
return ret;
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
- if (!MLX5_CAP_GEN(dev->mdev, pg))
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
return;
mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
+#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
+#include <linux/posix-timers.h>
#include <linux/rseq.h>
/* task_struct member predeclarations (sorted alphabetically): */
#endif
};
-/**
- * struct task_cputime - collected CPU time counts
- * @utime: time spent in user mode, in nanoseconds
- * @stime: time spent in kernel mode, in nanoseconds
- * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
- *
- * This structure groups together three kinds of CPU time that are tracked for
- * threads and thread groups. Most things considering CPU time want to group
- * these counts together and treat all three of them in parallel.
- */
-struct task_cputime {
- u64 utime;
- u64 stime;
- unsigned long long sum_exec_runtime;
-};
-
-/* Alternate field names when used on cache expirations: */
-#define virt_exp utime
-#define prof_exp stime
-#define sched_exp sum_exec_runtime
-
enum vtime_state {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
UCLAMP_CNT
};
+#ifdef CONFIG_SMP
+extern struct root_domain def_root_domain;
+extern struct mutex sched_domains_mutex;
+#endif
+
struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
unsigned long min_flt;
unsigned long maj_flt;
-#ifdef CONFIG_POSIX_TIMERS
- struct task_cputime cputime_expires;
- struct list_head cpu_timers[3];
-#endif
+ /* Empty if CONFIG_POSIX_CPUTIMERS=n */
+ struct posix_cputimers posix_cputimers;
/* Process credentials: */
struct mutex_waiter *blocked_on;
#endif
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ int non_block_count;
+ #endif
+
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
* value indicates whether a reschedule was done in fact.
* cond_resched_lock() will drop the spinlock before scheduling,
*/
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
/*
* Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPT,
+ * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
* but a general need for low latency)
*/
static inline int spin_needbreak(spinlock_t *lock)
{
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
return spin_is_contended(lock);
#else
return 0;
int arch_task_struct_size __read_mostly;
#endif
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
else
*offset += offsetof(struct task_struct, thread);
}
+#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
}
}
-#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a thread group.
*/
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
+ struct posix_cputimers *pct = &sig->posix_cputimers;
unsigned long cpu_limit;
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
- if (cpu_limit != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
- sig->cputimer.running = true;
- }
-
- /* The timer lists. */
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ posix_cputimers_group_init(pct, cpu_limit);
}
-#else
-static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
-#endif
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
#endif
}
-#ifdef CONFIG_POSIX_TIMERS
-/*
- * Initialize POSIX timer handling for a single task.
- */
-static void posix_cpu_timers_init(struct task_struct *tsk)
-{
- tsk->cputime_expires.prof_exp = 0;
- tsk->cputime_expires.virt_exp = 0;
- tsk->cputime_expires.sched_exp = 0;
- INIT_LIST_HEAD(&tsk->cpu_timers[0]);
- INIT_LIST_HEAD(&tsk->cpu_timers[1]);
- INIT_LIST_HEAD(&tsk->cpu_timers[2]);
-}
-#else
-static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
-#endif
-
static inline void init_task_pid_links(struct task_struct *task)
{
enum pid_type type;
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op == &pidfd_fops)
+ return file->private_data;
+
+ return ERR_PTR(-EBADF);
+}
+
static int pidfd_release(struct inode *inode, struct file *file)
{
struct pid *pid = file->private_data;
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
- posix_cpu_timers_init(p);
+ posix_cputimers_init(&p->posix_cputimers);
p->io_context = NULL;
audit_set_context(p, NULL);
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
*/
long _do_fork(struct kernel_clone_args *args)
{
if (copy_from_user(&args, uargs, size))
return -EFAULT;
+ /*
+ * Verify that higher 32bits of exit_signal are unset and that
+ * it is a valid signal
+ */
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+ !valid_signal(args.exit_signal)))
+ return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
{
struct hrtimer *timer = &rq->hrtick_timer;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
*/
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
rq->hrtick_csd.info = rq;
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
}
#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
}
static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/*
return uclamp_none(UCLAMP_MIN);
}
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/* Reset max-clamp retention only on idle exit */
}
static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
- unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ struct uclamp_se uc_max;
+
+ /*
+ * Tasks in autogroups or root task group will be
+ * restricted by system defaults.
+ */
+ if (task_group_is_autogroup(task_group(p)))
+ return uc_req;
+ if (task_group(p) == &root_task_group)
+ return uc_req;
+
+ uc_max = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ return uc_max;
+#endif
+
+ return uc_req;
+}
+
/*
* The effective clamp bucket index of a task depends on, by increasing
* priority:
* - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ * group or in an autogroup
* - the system default clamp value, defined by the sysadmin
*/
static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
- struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
* for each bucket when all its RUNNABLE tasks require the same clamp.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /*
+ * Lock the task and the rq where the task is (or was) queued.
+ *
+ * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ * price to pay to safely serialize util_{min,max} updates with
+ * enqueues, dequeues and migration operations.
+ * This is the same locking schema used by __set_cpus_allowed_ptr().
+ */
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Setting the clamp bucket is serialized by task_rq_lock().
+ * If the task is not yet RUNNABLE and its task_struct is not
+ * affecting a valid clamp bucket, the next time it's enqueued,
+ * it will already see the updated clamp bucket value.
+ */
+ if (!p->uclamp[clamp_id].active) {
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ unsigned int clamps)
+{
+ enum uclamp_id clamp_id;
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it))) {
+ for_each_clamp_id(clamp_id) {
+ if ((0x1 << clamp_id) & clamps)
+ uclamp_update_active(p, clamp_id);
+ }
+ }
+ css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+ struct task_group *tg = &root_task_group;
+
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+
+ rcu_read_lock();
+ cpu_util_update_eff(&root_task_group.css);
+ rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ bool update_root_tg = false;
int old_min, old_max;
- static DEFINE_MUTEX(mutex);
int result;
- mutex_lock(&mutex);
+ mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
+ update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
+ update_root_tg = true;
}
+ if (update_root_tg)
+ uclamp_update_root_tg();
+
/*
- * Updating all the RUNNABLE task is expensive, keep it simple and do
- * just a lazy update at each next enqueue time.
+ * We update all RUNNABLE tasks only when task groups are in use.
+ * Otherwise, keep it simple and do just a lazy update at each next
+ * task enqueue time.
*/
+
goto done;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
done:
- mutex_unlock(&mutex);
+ mutex_unlock(&uclamp_mutex);
return result;
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
/*
* On scheduling class change, reset to default clamps for tasks
static void uclamp_fork(struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
int cpu;
+ mutex_init(&uclamp_mutex);
+
for_each_possible_cpu(cpu) {
memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
cpu_rq(cpu)->uclamp_flags = 0;
/* System defaults allow max clamp values for both indexes */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
- for_each_clamp_id(clamp_id)
+ for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ root_task_group.uclamp_req[clamp_id] = uc_max;
+ root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+ }
}
#else /* CONFIG_UCLAMP_TASK */
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
}
/*
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);
- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
arch_start_context_switch(prev);
/*
- * If mm is non-NULL, we pass through switch_mm(). If mm is
- * NULL, we will pass through mmdrop() in finish_task_switch().
- * Both of these contain the full memory barrier required by
- * membarrier after storing to rq->curr, before returning to
- * user-space.
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab() active
+ *
+ * kernel -> user switch + mmdrop() active
+ * user -> user switch
*/
- if (!mm) {
- next->active_mm = oldmm;
- mmgrab(oldmm);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm_irqs_off(oldmm, mm, next);
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
+
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
struct tick_work {
int cpu;
+ atomic_t state;
struct delayed_work work;
};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
static struct tick_work __percpu *tick_work_cpu;
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
+ int os;
/*
* Handle the tick only if it appears the remote CPU is running in full
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr))
+ if (is_idle_task(curr) || cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
- * to keep scheduler internal stats reasonably up to date.
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
*/
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
+ int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- twork->cpu = cpu;
- INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
+ int os;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- cancel_delayed_work_sync(&twork->work);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
}
#endif /* CONFIG_HOTPLUG_CPU */
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
-
return 0;
}
static inline void sched_tick_stop(int cpu) { }
#endif
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
* If the value passed in is equal to the current preempt count
/*
* Various schedule()-time debugging checks and statistics:
*/
- static inline void schedule_debug(struct task_struct *prev)
+ static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
#endif
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && prev->state && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+ #endif
+
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
return p;
}
-again:
+restart:
+ /*
+ * Ensure that we put DL/RT tasks before the pick loop, such that they
+ * can PULL higher prio tasks when we lower the RQ 'priority'.
+ */
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
for_each_class(class) {
- p = class->pick_next_task(rq, prev, rf);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
+ p = class->pick_next_task(rq, NULL, NULL);
+ if (p)
return p;
- }
}
/* The idle class should always have a runnable task: */
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK))
hrtick_clear(rq);
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
return;
/*
preempt_enable_no_resched();
}
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
} while (need_resched());
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* this is the entry point to schedule() from kernel preemption
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
resched_curr(rq);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
return retval;
}
+ if (pi)
+ cpuset_read_lock();
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
* Changing the policy of the stop threads its a very bad idea:
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &rf);
- return -EINVAL;
+ retval = -EINVAL;
+ goto unlock;
}
/*
goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &rf);
- return 0;
+ retval = 0;
+ goto unlock;
}
change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
#endif
#ifdef CONFIG_SMP
*/
if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
}
#endif
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
goto recheck;
}
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &rf);
- return -EBUSY;
+ retval = -EBUSY;
+ goto unlock;
}
p->sched_reset_on_fork = reset_on_fork;
enqueue_task(rq, p, queue_flags);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
preempt_disable();
task_rq_unlock(rq, p, &rf);
- if (pi)
+ if (pi) {
+ cpuset_read_unlock();
rt_mutex_adjust_pi(p);
+ }
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ return retval;
}
static int _sched_setscheduler(struct task_struct *p, int policy,
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
+ }
+
return retval;
}
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
-
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
+ kattr->size = min(usize, ksize);
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
return 0;
}
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
int __sched _cond_resched(void)
{
if (should_resched(0)) {
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
atomic_long_add(delta, &calc_load_tasks);
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
{
-}
+ const struct sched_class *class;
+ struct task_struct *next;
-static const struct sched_class fake_sched_class = {
- .put_prev_task = put_prev_task_fake,
-};
+ for_each_class(class) {
+ next = class->pick_next_task(rq, NULL, NULL);
+ if (next) {
+ next->sched_class->put_prev_task(rq, next, NULL);
+ return next;
+ }
+ }
-static struct task_struct fake_task = {
- /*
- * Avoid pull_{rt,dl}_task()
- */
- .prio = MAX_PRIO + 1,
- .sched_class = &fake_sched_class,
-};
+ /* The idle class should always have a runnable task */
+ BUG();
+}
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
if (rq->nr_running == 1)
break;
- /*
- * pick_next_task() assumes pinned rq->lock:
- */
- next = pick_next_task(rq, &fake_task, rf);
- BUG_ON(!next);
- put_prev_task(rq, next);
+ next = __pick_migrate_task(rq);
/*
* Rules for changing task_struct::cpus_mask are holding
void __init sched_init(void)
{
- unsigned long alloc_size = 0, ptr;
+ unsigned long ptr = 0;
int i;
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
- if (alloc_size) {
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ if (ptr) {
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
rcu_sleep_check();
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
+ !is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
return;
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
current->pid, current->comm);
if (task_stack_end_corrupted(current))
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&tg->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ }
+#endif
+}
+
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ alloc_uclamp_sched_group(tg, parent);
+
return tg;
err:
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
- set_curr_task(rq, tsk);
+ set_next_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
#endif
/*
* Serialize against wake_up_new_task() such that if its
sched_move_task(task);
}
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *top_css = css;
+ struct uclamp_se *uc_parent = NULL;
+ struct uclamp_se *uc_se = NULL;
+ unsigned int eff[UCLAMP_CNT];
+ enum uclamp_id clamp_id;
+ unsigned int clamps;
+
+ css_for_each_descendant_pre(css, top_css) {
+ uc_parent = css_tg(css)->parent
+ ? css_tg(css)->parent->uclamp : NULL;
+
+ for_each_clamp_id(clamp_id) {
+ /* Assume effective clamps matches requested clamps */
+ eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ /* Cap effective clamps with parent's effective clamps */
+ if (uc_parent &&
+ eff[clamp_id] > uc_parent[clamp_id].value) {
+ eff[clamp_id] = uc_parent[clamp_id].value;
+ }
+ }
+ /* Ensure protection is always capped by limit */
+ eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+ /* Propagate most restrictive effective clamps */
+ clamps = 0x0;
+ uc_se = css_tg(css)->uclamp;
+ for_each_clamp_id(clamp_id) {
+ if (eff[clamp_id] == uc_se[clamp_id].value)
+ continue;
+ uc_se[clamp_id].value = eff[clamp_id];
+ uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ clamps |= (0x1 << clamp_id);
+ }
+ if (!clamps) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
+
+ /* Immediately update descendants RUNNABLE tasks */
+ uclamp_update_active_tasks(css, clamps);
+ }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ s64 percent;
+ u64 util;
+ int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+ struct uclamp_request req = {
+ .percent = UCLAMP_PERCENT_SCALE,
+ .util = SCHED_CAPACITY_SCALE,
+ .ret = 0,
+ };
+
+ buf = strim(buf);
+ if (strcmp(buf, "max")) {
+ req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ &req.percent);
+ if (req.ret)
+ return req;
+ if (req.percent > UCLAMP_PERCENT_SCALE) {
+ req.ret = -ERANGE;
+ return req;
+ }
+
+ req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ }
+
+ return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_request req;
+ struct task_group *tg;
+
+ req = capacity_from_percent(buf);
+ if (req.ret)
+ return req.ret;
+
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
+
+ tg = css_tg(of_css(of));
+ if (tg->uclamp_req[clamp_id].value != req.util)
+ uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+ /*
+ * Because of not recoverable conversion rounding we keep track of the
+ * exact requested value
+ */
+ tg->uclamp_pct[clamp_id] = req.percent;
+
+ /* Update effective clamps to track the most restrictive value */
+ cpu_util_update_eff(of_css(of));
+
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
+
+ return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+ enum uclamp_id clamp_id)
+{
+ struct task_group *tg;
+ u64 util_clamp;
+ u64 percent;
+ u32 rem;
+
+ rcu_read_lock();
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ rcu_read_unlock();
+
+ if (util_clamp == SCHED_CAPACITY_SCALE) {
+ seq_puts(sf, "max\n");
+ return;
+ }
+
+ percent = tg->uclamp_pct[clamp_id];
+ percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MIN);
+ return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MAX);
+ return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
.read_u64 = cpu_rt_period_read_uint,
.write_u64 = cpu_rt_period_write_uint,
},
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
#endif
{ } /* Terminate */
};
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
#endif
{ } /* terminate */
};
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
+#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+ #include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
return 0;
}
- static void force_swapin_readahead(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
- {
- struct mm_walk walk = {
- .mm = vma->vm_mm,
- .pmd_entry = swapin_walk_pmd_entry,
- .private = vma,
- };
-
- walk_page_range(start, end, &walk);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
- }
+ static const struct mm_walk_ops swapin_walk_ops = {
+ .pmd_entry = swapin_walk_pmd_entry,
+ };
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
+ loff_t offset;
*prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- force_swapin_readahead(vma, start, end);
+ walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
return 0;
}
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
- force_page_cache_readahead(file->f_mapping, file, start, end - start);
+ /*
+ * Filesystem's fadvise may need to take various locks. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem.
+ */
+ *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ get_file(file);
+ up_read(¤t->mm->mmap_sem);
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+ fput(file);
+ down_read(¤t->mm->mmap_sem);
return 0;
}
return 0;
}
- static void madvise_free_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
- {
- struct mm_walk free_walk = {
- .pmd_entry = madvise_free_pte_range,
- .mm = vma->vm_mm,
- .private = tlb,
- };
-
- tlb_start_vma(tlb, vma);
- walk_page_range(addr, end, &free_walk);
- tlb_end_vma(tlb, vma);
- }
+ static const struct mm_walk_ops madvise_free_walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+ };
static int madvise_free_single_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- madvise_free_page_range(&tlb, vma, range.start, range.end);
+ tlb_start_vma(&tlb, vma);
+ walk_page_range(vma->vm_mm, range.start, range.end,
+ &madvise_free_walk_ops, &tlb);
+ tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, range.start, range.end);
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#define do_swap_account 0
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+ int min_idx, max_idx;
+
+ if (slab_only) {
+ min_idx = NR_SLAB_RECLAIMABLE;
+ max_idx = NR_SLAB_UNRECLAIMABLE;
+ } else {
+ min_idx = 0;
+ max_idx = MEMCG_NR_STAT;
+ }
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ if (!slab_only)
+ max_idx = NR_VM_NODE_STAT_ITEMS;
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
if (!parent)
parent = root_mem_cgroup;
+ /*
+ * Deactivate and reparent kmem_caches. Then flush percpu
+ * slab statistics to have precise values at the parent and
+ * all ancestor levels. It's required to keep slab stats
+ * accurate after the reparenting of kmem_caches.
+ */
memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_flush_percpu_vmstats(memcg, true);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
#ifdef CONFIG_CGROUP_WRITEBACK
+#include <trace/events/writeback.h>
+
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
}
}
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
int node;
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
struct mem_cgroup *memcg;
unsigned int size;
int node;
+ int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
return 0;
}
+ static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+ };
+
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- };
down_read(&mm->mmap_sem);
- walk_page_range(0, mm->highest_vm_end,
- &mem_cgroup_count_precharge_walk);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
return ret;
}
+ static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ };
+
static void mem_cgroup_move_charge(void)
{
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mc.mm,
- };
-
lru_add_drain_all();
/*
* Signal lock_page_memcg() to take the memcg's move_lock
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);
up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
unsigned int order;
int pages_moved = 0;
-#ifndef CONFIG_HOLES_IN_ZONE
- /*
- * page_zone is not safe to call in this context when
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- * anyway as we check zone boundaries in move_freepages_block().
- * Remove at a later date when no bug reports exist related to
- * grouping pages by mobility
- */
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- pfn_valid(page_to_pfn(end_page)) &&
- page_zone(start_page) != page_zone(end_page));
-#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
order = page_order(page);
move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- RECLAIM_DISTANCE;
+ node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
}
}
- pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
size, jiffies_to_msecs(jiffies - start));
}