arch/x86/kernel/cpu/sgx/main.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*  Copyright(c) 2016-20 Intel Corporation. */
   3
   4 #include <linux/file.h>
   5 #include <linux/freezer.h>
   6 #include <linux/highmem.h>
   7 #include <linux/kthread.h>
   8 #include <linux/miscdevice.h>
   9 #include <linux/node.h>
  10 #include <linux/pagemap.h>
  11 #include <linux/ratelimit.h>
  12 #include <linux/sched/mm.h>
  13 #include <linux/sched/signal.h>
  14 #include <linux/slab.h>
  15 #include <linux/sysfs.h>
  16 #include <asm/sgx.h>
  17 #include "driver.h"
  18 #include "encl.h"
  19 #include "encls.h"
  20
  21 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
  22 static int sgx_nr_epc_sections;
  23 static struct task_struct *ksgxd_tsk;
  24 static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
  25 static DEFINE_XARRAY(sgx_epc_address_space);
  26
  27 /*
  28  * These variables are part of the state of the reclaimer, and must be accessed
  29  * with sgx_reclaimer_lock acquired.
  30  */
  31 static LIST_HEAD(sgx_active_page_list);
  32 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
  33
  34 static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
  35
  36 /* Nodes with one or more EPC sections. */
  37 static nodemask_t sgx_numa_mask;
  38
  39 /*
  40  * Array with one list_head for each possible NUMA node.  Each
  41  * list contains all the sgx_epc_section's which are on that
  42  * node.
  43  */
  44 static struct sgx_numa_node *sgx_numa_nodes;
  45
  46 static LIST_HEAD(sgx_dirty_page_list);
  47
  48 /*
  49  * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
  50  * from the input list, and made available for the page allocator. SECS pages
  51  * prepending their children in the input list are left intact.
  52  *
  53  * Return 0 when sanitization was successful or kthread was stopped, and the
  54  * number of unsanitized pages otherwise.
  55  */
  56 static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
  57 {
  58         unsigned long left_dirty = 0;
  59         struct sgx_epc_page *page;
  60         LIST_HEAD(dirty);
  61         int ret;
  62
  63         /* dirty_page_list is thread-local, no need for a lock: */
  64         while (!list_empty(dirty_page_list)) {
  65                 if (kthread_should_stop())
  66                         return 0;
  67
  68                 page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
  69
  70                 /*
  71                  * Checking page->poison without holding the node->lock
  72                  * is racy, but losing the race (i.e. poison is set just
  73                  * after the check) just means __eremove() will be uselessly
  74                  * called for a page that sgx_free_epc_page() will put onto
  75                  * the node->sgx_poison_page_list later.
  76                  */
  77                 if (page->poison) {
  78                         struct sgx_epc_section *section = &sgx_epc_sections[page->section];
  79                         struct sgx_numa_node *node = section->node;
  80
  81                         spin_lock(&node->lock);
  82                         list_move(&page->list, &node->sgx_poison_page_list);
  83                         spin_unlock(&node->lock);
  84
  85                         continue;
  86                 }
  87
  88                 ret = __eremove(sgx_get_epc_virt_addr(page));
  89                 if (!ret) {
  90                         /*
  91                          * page is now sanitized.  Make it available via the SGX
  92                          * page allocator:
  93                          */
  94                         list_del(&page->list);
  95                         sgx_free_epc_page(page);
  96                 } else {
  97                         /* The page is not yet clean - move to the dirty list. */
  98                         list_move_tail(&page->list, &dirty);
  99                         left_dirty++;
 100                 }
 101
 102                 cond_resched();
 103         }
 104
 105         list_splice(&dirty, dirty_page_list);
 106         return left_dirty;
 107 }
 108
 109 static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
 110 {
 111         struct sgx_encl_page *page = epc_page->owner;
 112         struct sgx_encl *encl = page->encl;
 113         struct sgx_encl_mm *encl_mm;
 114         bool ret = true;
 115         int idx;
 116
 117         idx = srcu_read_lock(&encl->srcu);
 118
 119         list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
 120                 if (!mmget_not_zero(encl_mm->mm))
 121                         continue;
 122
 123                 mmap_read_lock(encl_mm->mm);
 124                 ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
 125                 mmap_read_unlock(encl_mm->mm);
 126
 127                 mmput_async(encl_mm->mm);
 128
 129                 if (!ret)
 130                         break;
 131         }
 132
 133         srcu_read_unlock(&encl->srcu, idx);
 134
 135         if (!ret)
 136                 return false;
 137
 138         return true;
 139 }
 140
 141 static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
 142 {
 143         struct sgx_encl_page *page = epc_page->owner;
 144         unsigned long addr = page->desc & PAGE_MASK;
 145         struct sgx_encl *encl = page->encl;
 146         int ret;
 147
 148         sgx_zap_enclave_ptes(encl, addr);
 149
 150         mutex_lock(&encl->lock);
 151
 152         ret = __eblock(sgx_get_epc_virt_addr(epc_page));
 153         if (encls_failed(ret))
 154                 ENCLS_WARN(ret, "EBLOCK");
 155
 156         mutex_unlock(&encl->lock);
 157 }
 158
 159 static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
 160                           struct sgx_backing *backing)
 161 {
 162         struct sgx_pageinfo pginfo;
 163         int ret;
 164
 165         pginfo.addr = 0;
 166         pginfo.secs = 0;
 167
 168         pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
 169         pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
 170                           backing->pcmd_offset;
 171
 172         ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
 173         set_page_dirty(backing->pcmd);
 174         set_page_dirty(backing->contents);
 175
 176         kunmap_local((void *)(unsigned long)(pginfo.metadata -
 177                                               backing->pcmd_offset));
 178         kunmap_local((void *)(unsigned long)pginfo.contents);
 179
 180         return ret;
 181 }
 182
 183 void sgx_ipi_cb(void *info)
 184 {
 185 }
 186
 187 /*
 188  * Swap page to the regular memory transformed to the blocked state by using
 189  * EBLOCK, which means that it can no longer be referenced (no new TLB entries).
 190  *
 191  * The first trial just tries to write the page assuming that some other thread
 192  * has reset the count for threads inside the enclave by using ETRACK, and
 193  * previous thread count has been zeroed out. The second trial calls ETRACK
 194  * before EWB. If that fails we kick all the HW threads out, and then do EWB,
 195  * which should be guaranteed the succeed.
 196  */
 197 static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
 198                          struct sgx_backing *backing)
 199 {
 200         struct sgx_encl_page *encl_page = epc_page->owner;
 201         struct sgx_encl *encl = encl_page->encl;
 202         struct sgx_va_page *va_page;
 203         unsigned int va_offset;
 204         void *va_slot;
 205         int ret;
 206
 207         encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
 208
 209         va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
 210                                    list);
 211         va_offset = sgx_alloc_va_slot(va_page);
 212         va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
 213         if (sgx_va_page_full(va_page))
 214                 list_move_tail(&va_page->list, &encl->va_pages);
 215
 216         ret = __sgx_encl_ewb(epc_page, va_slot, backing);
 217         if (ret == SGX_NOT_TRACKED) {
 218                 ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
 219                 if (ret) {
 220                         if (encls_failed(ret))
 221                                 ENCLS_WARN(ret, "ETRACK");
 222                 }
 223
 224                 ret = __sgx_encl_ewb(epc_page, va_slot, backing);
 225                 if (ret == SGX_NOT_TRACKED) {
 226                         /*
 227                          * Slow path, send IPIs to kick cpus out of the
 228                          * enclave.  Note, it's imperative that the cpu
 229                          * mask is generated *after* ETRACK, else we'll
 230                          * miss cpus that entered the enclave between
 231                          * generating the mask and incrementing epoch.
 232                          */
 233                         on_each_cpu_mask(sgx_encl_cpumask(encl),
 234                                          sgx_ipi_cb, NULL, 1);
 235                         ret = __sgx_encl_ewb(epc_page, va_slot, backing);
 236                 }
 237         }
 238
 239         if (ret) {
 240                 if (encls_failed(ret))
 241                         ENCLS_WARN(ret, "EWB");
 242
 243                 sgx_free_va_slot(va_page, va_offset);
 244         } else {
 245                 encl_page->desc |= va_offset;
 246                 encl_page->va_page = va_page;
 247         }
 248 }
 249
 250 static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 251                                 struct sgx_backing *backing)
 252 {
 253         struct sgx_encl_page *encl_page = epc_page->owner;
 254         struct sgx_encl *encl = encl_page->encl;
 255         struct sgx_backing secs_backing;
 256         int ret;
 257
 258         mutex_lock(&encl->lock);
 259
 260         sgx_encl_ewb(epc_page, backing);
 261         encl_page->epc_page = NULL;
 262         encl->secs_child_cnt--;
 263         sgx_encl_put_backing(backing);
 264
 265         if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
 266                 ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
 267                                            &secs_backing);
 268                 if (ret)
 269                         goto out;
 270
 271                 sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
 272
 273                 sgx_encl_free_epc_page(encl->secs.epc_page);
 274                 encl->secs.epc_page = NULL;
 275
 276                 sgx_encl_put_backing(&secs_backing);
 277         }
 278
 279 out:
 280         mutex_unlock(&encl->lock);
 281 }
 282
 283 /*
 284  * Take a fixed number of pages from the head of the active page pool and
 285  * reclaim them to the enclave's private shmem files. Skip the pages, which have
 286  * been accessed since the last scan. Move those pages to the tail of active
 287  * page pool so that the pages get scanned in LRU like fashion.
 288  *
 289  * Batch process a chunk of pages (at the moment 16) in order to degrade amount
 290  * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit
 291  * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI
 292  * + EWB) but not sufficiently. Reclaiming one page at a time would also be
 293  * problematic as it would increase the lock contention too much, which would
 294  * halt forward progress.
 295  */
 296 static void sgx_reclaim_pages(void)
 297 {
 298         struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
 299         struct sgx_backing backing[SGX_NR_TO_SCAN];
 300         struct sgx_encl_page *encl_page;
 301         struct sgx_epc_page *epc_page;
 302         pgoff_t page_index;
 303         int cnt = 0;
 304         int ret;
 305         int i;
 306
 307         spin_lock(&sgx_reclaimer_lock);
 308         for (i = 0; i < SGX_NR_TO_SCAN; i++) {
 309                 if (list_empty(&sgx_active_page_list))
 310                         break;
 311
 312                 epc_page = list_first_entry(&sgx_active_page_list,
 313                                             struct sgx_epc_page, list);
 314                 list_del_init(&epc_page->list);
 315                 encl_page = epc_page->owner;
 316
 317                 if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
 318                         chunk[cnt++] = epc_page;
 319                 else
 320                         /* The owner is freeing the page. No need to add the
 321                          * page back to the list of reclaimable pages.
 322                          */
 323                         epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
 324         }
 325         spin_unlock(&sgx_reclaimer_lock);
 326
 327         for (i = 0; i < cnt; i++) {
 328                 epc_page = chunk[i];
 329                 encl_page = epc_page->owner;
 330
 331                 if (!sgx_reclaimer_age(epc_page))
 332                         goto skip;
 333
 334                 page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
 335
 336                 mutex_lock(&encl_page->encl->lock);
 337                 ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
 338                 if (ret) {
 339                         mutex_unlock(&encl_page->encl->lock);
 340                         goto skip;
 341                 }
 342
 343                 encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
 344                 mutex_unlock(&encl_page->encl->lock);
 345                 continue;
 346
 347 skip:
 348                 spin_lock(&sgx_reclaimer_lock);
 349                 list_add_tail(&epc_page->list, &sgx_active_page_list);
 350                 spin_unlock(&sgx_reclaimer_lock);
 351
 352                 kref_put(&encl_page->encl->refcount, sgx_encl_release);
 353
 354                 chunk[i] = NULL;
 355         }
 356
 357         for (i = 0; i < cnt; i++) {
 358                 epc_page = chunk[i];
 359                 if (epc_page)
 360                         sgx_reclaimer_block(epc_page);
 361         }
 362
 363         for (i = 0; i < cnt; i++) {
 364                 epc_page = chunk[i];
 365                 if (!epc_page)
 366                         continue;
 367
 368                 encl_page = epc_page->owner;
 369                 sgx_reclaimer_write(epc_page, &backing[i]);
 370
 371                 kref_put(&encl_page->encl->refcount, sgx_encl_release);
 372                 epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
 373
 374                 sgx_free_epc_page(epc_page);
 375         }
 376 }
 377
 378 static bool sgx_should_reclaim(unsigned long watermark)
 379 {
 380         return atomic_long_read(&sgx_nr_free_pages) < watermark &&
 381                !list_empty(&sgx_active_page_list);
 382 }
 383
 384 /*
 385  * sgx_reclaim_direct() should be called (without enclave's mutex held)
 386  * in locations where SGX memory resources might be low and might be
 387  * needed in order to make forward progress.
 388  */
 389 void sgx_reclaim_direct(void)
 390 {
 391         if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
 392                 sgx_reclaim_pages();
 393 }
 394
 395 static int ksgxd(void *p)
 396 {
 397         set_freezable();
 398
 399         /*
 400          * Sanitize pages in order to recover from kexec(). The 2nd pass is
 401          * required for SECS pages, whose child pages blocked EREMOVE.
 402          */
 403         __sgx_sanitize_pages(&sgx_dirty_page_list);
 404         WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
 405
 406         while (!kthread_should_stop()) {
 407                 if (try_to_freeze())
 408                         continue;
 409
 410                 wait_event_freezable(ksgxd_waitq,
 411                                      kthread_should_stop() ||
 412                                      sgx_should_reclaim(SGX_NR_HIGH_PAGES));
 413
 414                 if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
 415                         sgx_reclaim_pages();
 416
 417                 cond_resched();
 418         }
 419
 420         return 0;
 421 }
 422
 423 static bool __init sgx_page_reclaimer_init(void)
 424 {
 425         struct task_struct *tsk;
 426
 427         tsk = kthread_run(ksgxd, NULL, "ksgxd");
 428         if (IS_ERR(tsk))
 429                 return false;
 430
 431         ksgxd_tsk = tsk;
 432
 433         return true;
 434 }
 435
 436 bool current_is_ksgxd(void)
 437 {
 438         return current == ksgxd_tsk;
 439 }
 440
 441 static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 442 {
 443         struct sgx_numa_node *node = &sgx_numa_nodes[nid];
 444         struct sgx_epc_page *page = NULL;
 445
 446         spin_lock(&node->lock);
 447
 448         if (list_empty(&node->free_page_list)) {
 449                 spin_unlock(&node->lock);
 450                 return NULL;
 451         }
 452
 453         page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
 454         list_del_init(&page->list);
 455         page->flags = 0;
 456
 457         spin_unlock(&node->lock);
 458         atomic_long_dec(&sgx_nr_free_pages);
 459
 460         return page;
 461 }
 462
 463 /**
 464  * __sgx_alloc_epc_page() - Allocate an EPC page
 465  *
 466  * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
 467  * from the NUMA node, where the caller is executing.
 468  *
 469  * Return:
 470  * - an EPC page:       A borrowed EPC pages were available.
 471  * - NULL:              Out of EPC pages.
 472  */
 473 struct sgx_epc_page *__sgx_alloc_epc_page(void)
 474 {
 475         struct sgx_epc_page *page;
 476         int nid_of_current = numa_node_id();
 477         int nid = nid_of_current;
 478
 479         if (node_isset(nid_of_current, sgx_numa_mask)) {
 480                 page = __sgx_alloc_epc_page_from_node(nid_of_current);
 481                 if (page)
 482                         return page;
 483         }
 484
 485         /* Fall back to the non-local NUMA nodes: */
 486         while (true) {
 487                 nid = next_node_in(nid, sgx_numa_mask);
 488                 if (nid == nid_of_current)
 489                         break;
 490
 491                 page = __sgx_alloc_epc_page_from_node(nid);
 492                 if (page)
 493                         return page;
 494         }
 495
 496         return ERR_PTR(-ENOMEM);
 497 }
 498
 499 /**
 500  * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 501  * @page:       EPC page
 502  *
 503  * Mark a page as reclaimable and add it to the active page list. Pages
 504  * are automatically removed from the active list when freed.
 505  */
 506 void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
 507 {
 508         spin_lock(&sgx_reclaimer_lock);
 509         page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
 510         list_add_tail(&page->list, &sgx_active_page_list);
 511         spin_unlock(&sgx_reclaimer_lock);
 512 }
 513
 514 /**
 515  * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 516  * @page:       EPC page
 517  *
 518  * Clear the reclaimable flag and remove the page from the active page list.
 519  *
 520  * Return:
 521  *   0 on success,
 522  *   -EBUSY if the page is in the process of being reclaimed
 523  */
 524 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
 525 {
 526         spin_lock(&sgx_reclaimer_lock);
 527         if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
 528                 /* The page is being reclaimed. */
 529                 if (list_empty(&page->list)) {
 530                         spin_unlock(&sgx_reclaimer_lock);
 531                         return -EBUSY;
 532                 }
 533
 534                 list_del(&page->list);
 535                 page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
 536         }
 537         spin_unlock(&sgx_reclaimer_lock);
 538
 539         return 0;
 540 }
 541
 542 /**
 543  * sgx_alloc_epc_page() - Allocate an EPC page
 544  * @owner:      the owner of the EPC page
 545  * @reclaim:    reclaim pages if necessary
 546  *
 547  * Iterate through EPC sections and borrow a free EPC page to the caller. When a
 548  * page is no longer needed it must be released with sgx_free_epc_page(). If
 549  * @reclaim is set to true, directly reclaim pages when we are out of pages. No
 550  * mm's can be locked when @reclaim is set to true.
 551  *
 552  * Finally, wake up ksgxd when the number of pages goes below the watermark
 553  * before returning back to the caller.
 554  *
 555  * Return:
 556  *   an EPC page,
 557  *   -errno on error
 558  */
 559 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
 560 {
 561         struct sgx_epc_page *page;
 562
 563         for ( ; ; ) {
 564                 page = __sgx_alloc_epc_page();
 565                 if (!IS_ERR(page)) {
 566                         page->owner = owner;
 567                         break;
 568                 }
 569
 570                 if (list_empty(&sgx_active_page_list))
 571                         return ERR_PTR(-ENOMEM);
 572
 573                 if (!reclaim) {
 574                         page = ERR_PTR(-EBUSY);
 575                         break;
 576                 }
 577
 578                 if (signal_pending(current)) {
 579                         page = ERR_PTR(-ERESTARTSYS);
 580                         break;
 581                 }
 582
 583                 sgx_reclaim_pages();
 584                 cond_resched();
 585         }
 586
 587         if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
 588                 wake_up(&ksgxd_waitq);
 589
 590         return page;
 591 }
 592
 593 /**
 594  * sgx_free_epc_page() - Free an EPC page
 595  * @page:       an EPC page
 596  *
 597  * Put the EPC page back to the list of free pages. It's the caller's
 598  * responsibility to make sure that the page is in uninitialized state. In other
 599  * words, do EREMOVE, EWB or whatever operation is necessary before calling
 600  * this function.
 601  */
 602 void sgx_free_epc_page(struct sgx_epc_page *page)
 603 {
 604         struct sgx_epc_section *section = &sgx_epc_sections[page->section];
 605         struct sgx_numa_node *node = section->node;
 606
 607         spin_lock(&node->lock);
 608
 609         page->owner = NULL;
 610         if (page->poison)
 611                 list_add(&page->list, &node->sgx_poison_page_list);
 612         else
 613                 list_add_tail(&page->list, &node->free_page_list);
 614         page->flags = SGX_EPC_PAGE_IS_FREE;
 615
 616         spin_unlock(&node->lock);
 617         atomic_long_inc(&sgx_nr_free_pages);
 618 }
 619
 620 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 621                                          unsigned long index,
 622                                          struct sgx_epc_section *section)
 623 {
 624         unsigned long nr_pages = size >> PAGE_SHIFT;
 625         unsigned long i;
 626
 627         section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
 628         if (!section->virt_addr)
 629                 return false;
 630
 631         section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
 632         if (!section->pages) {
 633                 memunmap(section->virt_addr);
 634                 return false;
 635         }
 636
 637         section->phys_addr = phys_addr;
 638         xa_store_range(&sgx_epc_address_space, section->phys_addr,
 639                        phys_addr + size - 1, section, GFP_KERNEL);
 640
 641         for (i = 0; i < nr_pages; i++) {
 642                 section->pages[i].section = index;
 643                 section->pages[i].flags = 0;
 644                 section->pages[i].owner = NULL;
 645                 section->pages[i].poison = 0;
 646                 list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
 647         }
 648
 649         return true;
 650 }
 651
 652 bool arch_is_platform_page(u64 paddr)
 653 {
 654         return !!xa_load(&sgx_epc_address_space, paddr);
 655 }
 656 EXPORT_SYMBOL_GPL(arch_is_platform_page);
 657
 658 static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
 659 {
 660         struct sgx_epc_section *section;
 661
 662         section = xa_load(&sgx_epc_address_space, paddr);
 663         if (!section)
 664                 return NULL;
 665
 666         return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
 667 }
 668
 669 /*
 670  * Called in process context to handle a hardware reported
 671  * error in an SGX EPC page.
 672  * If the MF_ACTION_REQUIRED bit is set in flags, then the
 673  * context is the task that consumed the poison data. Otherwise
 674  * this is called from a kernel thread unrelated to the page.
 675  */
 676 int arch_memory_failure(unsigned long pfn, int flags)
 677 {
 678         struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
 679         struct sgx_epc_section *section;
 680         struct sgx_numa_node *node;
 681
 682         /*
 683          * mm/memory-failure.c calls this routine for all errors
 684          * where there isn't a "struct page" for the address. But that
 685          * includes other address ranges besides SGX.
 686          */
 687         if (!page)
 688                 return -ENXIO;
 689
 690         /*
 691          * If poison was consumed synchronously. Send a SIGBUS to
 692          * the task. Hardware has already exited the SGX enclave and
 693          * will not allow re-entry to an enclave that has a memory
 694          * error. The signal may help the task understand why the
 695          * enclave is broken.
 696          */
 697         if (flags & MF_ACTION_REQUIRED)
 698                 force_sig(SIGBUS);
 699
 700         section = &sgx_epc_sections[page->section];
 701         node = section->node;
 702
 703         spin_lock(&node->lock);
 704
 705         /* Already poisoned? Nothing more to do */
 706         if (page->poison)
 707                 goto out;
 708
 709         page->poison = 1;
 710
 711         /*
 712          * If the page is on a free list, move it to the per-node
 713          * poison page list.
 714          */
 715         if (page->flags & SGX_EPC_PAGE_IS_FREE) {
 716                 list_move(&page->list, &node->sgx_poison_page_list);
 717                 goto out;
 718         }
 719
 720         /*
 721          * TBD: Add additional plumbing to enable pre-emptive
 722          * action for asynchronous poison notification. Until
 723          * then just hope that the poison:
 724          * a) is not accessed - sgx_free_epc_page() will deal with it
 725          *    when the user gives it back
 726          * b) results in a recoverable machine check rather than
 727          *    a fatal one
 728          */
 729 out:
 730         spin_unlock(&node->lock);
 731         return 0;
 732 }
 733
 734 /**
 735  * A section metric is concatenated in a way that @low bits 12-31 define the
 736  * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 737  * metric.
 738  */
 739 static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
 740 {
 741         return (low & GENMASK_ULL(31, 12)) +
 742                ((high & GENMASK_ULL(19, 0)) << 32);
 743 }
 744
 745 #ifdef CONFIG_NUMA
 746 static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
 747 {
 748         return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
 749 }
 750 static DEVICE_ATTR_RO(sgx_total_bytes);
 751
 752 static umode_t arch_node_attr_is_visible(struct kobject *kobj,
 753                 struct attribute *attr, int idx)
 754 {
 755         /* Make all x86/ attributes invisible when SGX is not initialized: */
 756         if (nodes_empty(sgx_numa_mask))
 757                 return 0;
 758
 759         return attr->mode;
 760 }
 761
 762 static struct attribute *arch_node_dev_attrs[] = {
 763         &dev_attr_sgx_total_bytes.attr,
 764         NULL,
 765 };
 766
 767 const struct attribute_group arch_node_dev_group = {
 768         .name = "x86",
 769         .attrs = arch_node_dev_attrs,
 770         .is_visible = arch_node_attr_is_visible,
 771 };
 772
 773 static void __init arch_update_sysfs_visibility(int nid)
 774 {
 775         struct node *node = node_devices[nid];
 776         int ret;
 777
 778         ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
 779
 780         if (ret)
 781                 pr_err("sysfs update failed (%d), files may be invisible", ret);
 782 }
 783 #else /* !CONFIG_NUMA */
 784 static void __init arch_update_sysfs_visibility(int nid) {}
 785 #endif
 786
 787 static bool __init sgx_page_cache_init(void)
 788 {
 789         u32 eax, ebx, ecx, edx, type;
 790         u64 pa, size;
 791         int nid;
 792         int i;
 793
 794         sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
 795         if (!sgx_numa_nodes)
 796                 return false;
 797
 798         for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
 799                 cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
 800
 801                 type = eax & SGX_CPUID_EPC_MASK;
 802                 if (type == SGX_CPUID_EPC_INVALID)
 803                         break;
 804
 805                 if (type != SGX_CPUID_EPC_SECTION) {
 806                         pr_err_once("Unknown EPC section type: %u\n", type);
 807                         break;
 808                 }
 809
 810                 pa   = sgx_calc_section_metric(eax, ebx);
 811                 size = sgx_calc_section_metric(ecx, edx);
 812
 813                 pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
 814
 815                 if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
 816                         pr_err("No free memory for an EPC section\n");
 817                         break;
 818                 }
 819
 820                 nid = numa_map_to_online_node(phys_to_target_node(pa));
 821                 if (nid == NUMA_NO_NODE) {
 822                         /* The physical address is already printed above. */
 823                         pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
 824                         nid = 0;
 825                 }
 826
 827                 if (!node_isset(nid, sgx_numa_mask)) {
 828                         spin_lock_init(&sgx_numa_nodes[nid].lock);
 829                         INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
 830                         INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
 831                         node_set(nid, sgx_numa_mask);
 832                         sgx_numa_nodes[nid].size = 0;
 833
 834                         /* Make SGX-specific node sysfs files visible: */
 835                         arch_update_sysfs_visibility(nid);
 836                 }
 837
 838                 sgx_epc_sections[i].node =  &sgx_numa_nodes[nid];
 839                 sgx_numa_nodes[nid].size += size;
 840
 841                 sgx_nr_epc_sections++;
 842         }
 843
 844         if (!sgx_nr_epc_sections) {
 845                 pr_err("There are zero EPC sections.\n");
 846                 return false;
 847         }
 848
 849         return true;
 850 }
 851
 852 /*
 853  * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
 854  * Bare-metal driver requires to update them to hash of enclave's signer
 855  * before EINIT. KVM needs to update them to guest's virtual MSR values
 856  * before doing EINIT from guest.
 857  */
 858 void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
 859 {
 860         int i;
 861
 862         WARN_ON_ONCE(preemptible());
 863
 864         for (i = 0; i < 4; i++)
 865                 wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
 866 }
 867
 868 const struct file_operations sgx_provision_fops = {
 869         .owner                  = THIS_MODULE,
 870 };
 871
 872 static struct miscdevice sgx_dev_provision = {
 873         .minor = MISC_DYNAMIC_MINOR,
 874         .name = "sgx_provision",
 875         .nodename = "sgx_provision",
 876         .fops = &sgx_provision_fops,
 877 };
 878
 879 /**
 880  * sgx_set_attribute() - Update allowed attributes given file descriptor
 881  * @allowed_attributes:         Pointer to allowed enclave attributes
 882  * @attribute_fd:               File descriptor for specific attribute
 883  *
 884  * Append enclave attribute indicated by file descriptor to allowed
 885  * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
 886  * /dev/sgx_provision is supported.
 887  *
 888  * Return:
 889  * -0:          SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 890  * -EINVAL:     Invalid, or not supported file descriptor
 891  */
 892 int sgx_set_attribute(unsigned long *allowed_attributes,
 893                       unsigned int attribute_fd)
 894 {
 895         struct fd f = fdget(attribute_fd);
 896
 897         if (!f.file)
 898                 return -EINVAL;
 899
 900         if (f.file->f_op != &sgx_provision_fops) {
 901                 fdput(f);
 902                 return -EINVAL;
 903         }
 904
 905         *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
 906
 907         fdput(f);
 908         return 0;
 909 }
 910 EXPORT_SYMBOL_GPL(sgx_set_attribute);
 911
 912 static int __init sgx_init(void)
 913 {
 914         int ret;
 915         int i;
 916
 917         if (!cpu_feature_enabled(X86_FEATURE_SGX))
 918                 return -ENODEV;
 919
 920         if (!sgx_page_cache_init())
 921                 return -ENOMEM;
 922
 923         if (!sgx_page_reclaimer_init()) {
 924                 ret = -ENOMEM;
 925                 goto err_page_cache;
 926         }
 927
 928         ret = misc_register(&sgx_dev_provision);
 929         if (ret)
 930                 goto err_kthread;
 931
 932         /*
 933          * Always try to initialize the native *and* KVM drivers.
 934          * The KVM driver is less picky than the native one and
 935          * can function if the native one is not supported on the
 936          * current system or fails to initialize.
 937          *
 938          * Error out only if both fail to initialize.
 939          */
 940         ret = sgx_drv_init();
 941
 942         if (sgx_vepc_init() && ret)
 943                 goto err_provision;
 944
 945         return 0;
 946
 947 err_provision:
 948         misc_deregister(&sgx_dev_provision);
 949
 950 err_kthread:
 951         kthread_stop(ksgxd_tsk);
 952
 953 err_page_cache:
 954         for (i = 0; i < sgx_nr_epc_sections; i++) {
 955                 vfree(sgx_epc_sections[i].pages);
 956                 memunmap(sgx_epc_sections[i].virt_addr);
 957         }
 958
 959         return ret;
 960 }
 961
 962 device_initcall(sgx_init);