arch/riscv/mm/fault.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
   4  *  Lennox Wu <lennox.wu@sunplusct.com>
   5  *  Chen Liqin <liqin.chen@sunplusct.com>
   6  * Copyright (C) 2012 Regents of the University of California
   7  */
   8
   9
  10 #include <linux/mm.h>
  11 #include <linux/kernel.h>
  12 #include <linux/interrupt.h>
  13 #include <linux/perf_event.h>
  14 #include <linux/signal.h>
  15 #include <linux/uaccess.h>
  16 #include <linux/kprobes.h>
  17 #include <linux/kfence.h>
  18 #include <linux/entry-common.h>
  19
  20 #include <asm/ptrace.h>
  21 #include <asm/tlbflush.h>
  22
  23 #include "../kernel/head.h"
  24
  25 static void show_pte(unsigned long addr)
  26 {
  27         pgd_t *pgdp, pgd;
  28         p4d_t *p4dp, p4d;
  29         pud_t *pudp, pud;
  30         pmd_t *pmdp, pmd;
  31         pte_t *ptep, pte;
  32         struct mm_struct *mm = current->mm;
  33
  34         if (!mm)
  35                 mm = &init_mm;
  36
  37         pr_alert("Current %s pgtable: %luK pagesize, %d-bit VAs, pgdp=0x%016llx\n",
  38                  current->comm, PAGE_SIZE / SZ_1K, VA_BITS,
  39                  mm == &init_mm ? (u64)__pa_symbol(mm->pgd) : virt_to_phys(mm->pgd));
  40
  41         pgdp = pgd_offset(mm, addr);
  42         pgd = pgdp_get(pgdp);
  43         pr_alert("[%016lx] pgd=%016lx", addr, pgd_val(pgd));
  44         if (pgd_none(pgd) || pgd_bad(pgd) || pgd_leaf(pgd))
  45                 goto out;
  46
  47         p4dp = p4d_offset(pgdp, addr);
  48         p4d = p4dp_get(p4dp);
  49         pr_cont(", p4d=%016lx", p4d_val(p4d));
  50         if (p4d_none(p4d) || p4d_bad(p4d) || p4d_leaf(p4d))
  51                 goto out;
  52
  53         pudp = pud_offset(p4dp, addr);
  54         pud = pudp_get(pudp);
  55         pr_cont(", pud=%016lx", pud_val(pud));
  56         if (pud_none(pud) || pud_bad(pud) || pud_leaf(pud))
  57                 goto out;
  58
  59         pmdp = pmd_offset(pudp, addr);
  60         pmd = pmdp_get(pmdp);
  61         pr_cont(", pmd=%016lx", pmd_val(pmd));
  62         if (pmd_none(pmd) || pmd_bad(pmd) || pmd_leaf(pmd))
  63                 goto out;
  64
  65         ptep = pte_offset_map(pmdp, addr);
  66         if (!ptep)
  67                 goto out;
  68
  69         pte = ptep_get(ptep);
  70         pr_cont(", pte=%016lx", pte_val(pte));
  71         pte_unmap(ptep);
  72 out:
  73         pr_cont("\n");
  74 }
  75
  76 static void die_kernel_fault(const char *msg, unsigned long addr,
  77                 struct pt_regs *regs)
  78 {
  79         bust_spinlocks(1);
  80
  81         pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", msg,
  82                 addr);
  83
  84         bust_spinlocks(0);
  85         show_pte(addr);
  86         die(regs, "Oops");
  87         make_task_dead(SIGKILL);
  88 }
  89
  90 static inline void no_context(struct pt_regs *regs, unsigned long addr)
  91 {
  92         const char *msg;
  93
  94         /* Are we prepared to handle this kernel fault? */
  95         if (fixup_exception(regs))
  96                 return;
  97
  98         /*
  99          * Oops. The kernel tried to access some bad page. We'll have to
 100          * terminate things with extreme prejudice.
 101          */
 102         if (addr < PAGE_SIZE)
 103                 msg = "NULL pointer dereference";
 104         else {
 105                 if (kfence_handle_page_fault(addr, regs->cause == EXC_STORE_PAGE_FAULT, regs))
 106                         return;
 107
 108                 msg = "paging request";
 109         }
 110
 111         die_kernel_fault(msg, addr, regs);
 112 }
 113
 114 static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault)
 115 {
 116         if (!user_mode(regs)) {
 117                 no_context(regs, addr);
 118                 return;
 119         }
 120
 121         if (fault & VM_FAULT_OOM) {
 122                 /*
 123                  * We ran out of memory, call the OOM killer, and return the userspace
 124                  * (which will retry the fault, or kill us if we got oom-killed).
 125                  */
 126                 pagefault_out_of_memory();
 127                 return;
 128         } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
 129                 /* Kernel mode? Handle exceptions or die */
 130                 do_trap(regs, SIGBUS, BUS_ADRERR, addr);
 131                 return;
 132         } else if (fault & VM_FAULT_SIGSEGV) {
 133                 do_trap(regs, SIGSEGV, SEGV_MAPERR, addr);
 134                 return;
 135         }
 136
 137         BUG();
 138 }
 139
 140 static inline void
 141 bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
 142 {
 143         /*
 144          * Something tried to access memory that isn't in our memory map.
 145          * Fix it, but check if it's kernel or user first.
 146          */
 147         /* User mode accesses just cause a SIGSEGV */
 148         if (user_mode(regs)) {
 149                 do_trap(regs, SIGSEGV, code, addr);
 150                 return;
 151         }
 152
 153         no_context(regs, addr);
 154 }
 155
 156 static inline void
 157 bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
 158          unsigned long addr)
 159 {
 160         mmap_read_unlock(mm);
 161
 162         bad_area_nosemaphore(regs, code, addr);
 163 }
 164
 165 static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
 166 {
 167         pgd_t *pgd, *pgd_k;
 168         pud_t *pud_k;
 169         p4d_t *p4d_k;
 170         pmd_t *pmd_k;
 171         pte_t *pte_k;
 172         int index;
 173         unsigned long pfn;
 174
 175         /* User mode accesses just cause a SIGSEGV */
 176         if (user_mode(regs))
 177                 return do_trap(regs, SIGSEGV, code, addr);
 178
 179         /*
 180          * Synchronize this task's top level page-table
 181          * with the 'reference' page table.
 182          *
 183          * Do _not_ use "tsk->active_mm->pgd" here.
 184          * We might be inside an interrupt in the middle
 185          * of a task switch.
 186          */
 187         index = pgd_index(addr);
 188         pfn = csr_read(CSR_SATP) & SATP_PPN;
 189         pgd = (pgd_t *)pfn_to_virt(pfn) + index;
 190         pgd_k = init_mm.pgd + index;
 191
 192         if (!pgd_present(pgdp_get(pgd_k))) {
 193                 no_context(regs, addr);
 194                 return;
 195         }
 196         set_pgd(pgd, pgdp_get(pgd_k));
 197
 198         p4d_k = p4d_offset(pgd_k, addr);
 199         if (!p4d_present(p4dp_get(p4d_k))) {
 200                 no_context(regs, addr);
 201                 return;
 202         }
 203
 204         pud_k = pud_offset(p4d_k, addr);
 205         if (!pud_present(pudp_get(pud_k))) {
 206                 no_context(regs, addr);
 207                 return;
 208         }
 209         if (pud_leaf(pudp_get(pud_k)))
 210                 goto flush_tlb;
 211
 212         /*
 213          * Since the vmalloc area is global, it is unnecessary
 214          * to copy individual PTEs
 215          */
 216         pmd_k = pmd_offset(pud_k, addr);
 217         if (!pmd_present(pmdp_get(pmd_k))) {
 218                 no_context(regs, addr);
 219                 return;
 220         }
 221         if (pmd_leaf(pmdp_get(pmd_k)))
 222                 goto flush_tlb;
 223
 224         /*
 225          * Make sure the actual PTE exists as well to
 226          * catch kernel vmalloc-area accesses to non-mapped
 227          * addresses. If we don't do this, this will just
 228          * silently loop forever.
 229          */
 230         pte_k = pte_offset_kernel(pmd_k, addr);
 231         if (!pte_present(ptep_get(pte_k))) {
 232                 no_context(regs, addr);
 233                 return;
 234         }
 235
 236         /*
 237          * The kernel assumes that TLBs don't cache invalid
 238          * entries, but in RISC-V, SFENCE.VMA specifies an
 239          * ordering constraint, not a cache flush; it is
 240          * necessary even after writing invalid entries.
 241          */
 242 flush_tlb:
 243         local_flush_tlb_page(addr);
 244 }
 245
 246 static inline bool access_error(unsigned long cause, struct vm_area_struct *vma)
 247 {
 248         switch (cause) {
 249         case EXC_INST_PAGE_FAULT:
 250                 if (!(vma->vm_flags & VM_EXEC)) {
 251                         return true;
 252                 }
 253                 break;
 254         case EXC_LOAD_PAGE_FAULT:
 255                 /* Write implies read */
 256                 if (!(vma->vm_flags & (VM_READ | VM_WRITE))) {
 257                         return true;
 258                 }
 259                 break;
 260         case EXC_STORE_PAGE_FAULT:
 261                 if (!(vma->vm_flags & VM_WRITE)) {
 262                         return true;
 263                 }
 264                 break;
 265         default:
 266                 panic("%s: unhandled cause %lu", __func__, cause);
 267         }
 268         return false;
 269 }
 270
 271 /*
 272  * This routine handles page faults.  It determines the address and the
 273  * problem, and then passes it off to one of the appropriate routines.
 274  */
 275 void handle_page_fault(struct pt_regs *regs)
 276 {
 277         struct task_struct *tsk;
 278         struct vm_area_struct *vma;
 279         struct mm_struct *mm;
 280         unsigned long addr, cause;
 281         unsigned int flags = FAULT_FLAG_DEFAULT;
 282         int code = SEGV_MAPERR;
 283         vm_fault_t fault;
 284
 285         cause = regs->cause;
 286         addr = regs->badaddr;
 287
 288         tsk = current;
 289         mm = tsk->mm;
 290
 291         if (kprobe_page_fault(regs, cause))
 292                 return;
 293
 294         /*
 295          * Fault-in kernel-space virtual memory on-demand.
 296          * The 'reference' page table is init_mm.pgd.
 297          *
 298          * NOTE! We MUST NOT take any locks for this case. We may
 299          * be in an interrupt or a critical region, and should
 300          * only copy the information from the master page table,
 301          * nothing more.
 302          */
 303         if ((!IS_ENABLED(CONFIG_MMU) || !IS_ENABLED(CONFIG_64BIT)) &&
 304             unlikely(addr >= VMALLOC_START && addr < VMALLOC_END)) {
 305                 vmalloc_fault(regs, code, addr);
 306                 return;
 307         }
 308
 309         /* Enable interrupts if they were enabled in the parent context. */
 310         if (!regs_irqs_disabled(regs))
 311                 local_irq_enable();
 312
 313         /*
 314          * If we're in an interrupt, have no user context, or are running
 315          * in an atomic region, then we must not take the fault.
 316          */
 317         if (unlikely(faulthandler_disabled() || !mm)) {
 318                 tsk->thread.bad_cause = cause;
 319                 no_context(regs, addr);
 320                 return;
 321         }
 322
 323         if (user_mode(regs))
 324                 flags |= FAULT_FLAG_USER;
 325
 326         if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) {
 327                 if (fixup_exception(regs))
 328                         return;
 329
 330                 die_kernel_fault("access to user memory without uaccess routines", addr, regs);
 331         }
 332
 333         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 334
 335         if (cause == EXC_STORE_PAGE_FAULT)
 336                 flags |= FAULT_FLAG_WRITE;
 337         else if (cause == EXC_INST_PAGE_FAULT)
 338                 flags |= FAULT_FLAG_INSTRUCTION;
 339         if (!(flags & FAULT_FLAG_USER))
 340                 goto lock_mmap;
 341
 342         vma = lock_vma_under_rcu(mm, addr);
 343         if (!vma)
 344                 goto lock_mmap;
 345
 346         if (unlikely(access_error(cause, vma))) {
 347                 vma_end_read(vma);
 348                 count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
 349                 tsk->thread.bad_cause = cause;
 350                 bad_area_nosemaphore(regs, SEGV_ACCERR, addr);
 351                 return;
 352         }
 353
 354         fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
 355         if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 356                 vma_end_read(vma);
 357
 358         if (!(fault & VM_FAULT_RETRY)) {
 359                 count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
 360                 goto done;
 361         }
 362         count_vm_vma_lock_event(VMA_LOCK_RETRY);
 363         if (fault & VM_FAULT_MAJOR)
 364                 flags |= FAULT_FLAG_TRIED;
 365
 366         if (fault_signal_pending(fault, regs)) {
 367                 if (!user_mode(regs))
 368                         no_context(regs, addr);
 369                 return;
 370         }
 371 lock_mmap:
 372
 373 retry:
 374         vma = lock_mm_and_find_vma(mm, addr, regs);
 375         if (unlikely(!vma)) {
 376                 tsk->thread.bad_cause = cause;
 377                 bad_area_nosemaphore(regs, code, addr);
 378                 return;
 379         }
 380
 381         /*
 382          * Ok, we have a good vm_area for this memory access, so
 383          * we can handle it.
 384          */
 385         code = SEGV_ACCERR;
 386
 387         if (unlikely(access_error(cause, vma))) {
 388                 tsk->thread.bad_cause = cause;
 389                 bad_area(regs, mm, code, addr);
 390                 return;
 391         }
 392
 393         /*
 394          * If for any reason at all we could not handle the fault,
 395          * make sure we exit gracefully rather than endlessly redo
 396          * the fault.
 397          */
 398         fault = handle_mm_fault(vma, addr, flags, regs);
 399
 400         /*
 401          * If we need to retry but a fatal signal is pending, handle the
 402          * signal first. We do not need to release the mmap_lock because it
 403          * would already be released in __lock_page_or_retry in mm/filemap.c.
 404          */
 405         if (fault_signal_pending(fault, regs)) {
 406                 if (!user_mode(regs))
 407                         no_context(regs, addr);
 408                 return;
 409         }
 410
 411         /* The fault is fully completed (including releasing mmap lock) */
 412         if (fault & VM_FAULT_COMPLETED)
 413                 return;
 414
 415         if (unlikely(fault & VM_FAULT_RETRY)) {
 416                 flags |= FAULT_FLAG_TRIED;
 417
 418                 /*
 419                  * No need to mmap_read_unlock(mm) as we would
 420                  * have already released it in __lock_page_or_retry
 421                  * in mm/filemap.c.
 422                  */
 423                 goto retry;
 424         }
 425
 426         mmap_read_unlock(mm);
 427
 428 done:
 429         if (unlikely(fault & VM_FAULT_ERROR)) {
 430                 tsk->thread.bad_cause = cause;
 431                 mm_fault_error(regs, addr, fault);
 432                 return;
 433         }
 434         return;
 435 }