arch/x86/kernel/cpu/sgx/virt.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Device driver to expose SGX enclave memory to KVM guests.
   4  *
   5  * Copyright(c) 2021 Intel Corporation.
   6  */
   7
   8 #include <linux/miscdevice.h>
   9 #include <linux/mm.h>
  10 #include <linux/mman.h>
  11 #include <linux/sched/mm.h>
  12 #include <linux/sched/signal.h>
  13 #include <linux/slab.h>
  14 #include <linux/xarray.h>
  15 #include <asm/sgx.h>
  16 #include <uapi/asm/sgx.h>
  17
  18 #include "encls.h"
  19 #include "sgx.h"
  20
  21 struct sgx_vepc {
  22         struct xarray page_array;
  23         struct mutex lock;
  24 };
  25
  26 /*
  27  * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
  28  * virtual EPC instances, and the lock to protect it.
  29  */
  30 static struct mutex zombie_secs_pages_lock;
  31 static struct list_head zombie_secs_pages;
  32
  33 static int __sgx_vepc_fault(struct sgx_vepc *vepc,
  34                             struct vm_area_struct *vma, unsigned long addr)
  35 {
  36         struct sgx_epc_page *epc_page;
  37         unsigned long index, pfn;
  38         int ret;
  39
  40         WARN_ON(!mutex_is_locked(&vepc->lock));
  41
  42         /* Calculate index of EPC page in virtual EPC's page_array */
  43         index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
  44
  45         epc_page = xa_load(&vepc->page_array, index);
  46         if (epc_page)
  47                 return 0;
  48
  49         epc_page = sgx_alloc_epc_page(vepc, false);
  50         if (IS_ERR(epc_page))
  51                 return PTR_ERR(epc_page);
  52
  53         ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
  54         if (ret)
  55                 goto err_free;
  56
  57         pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
  58
  59         ret = vmf_insert_pfn(vma, addr, pfn);
  60         if (ret != VM_FAULT_NOPAGE) {
  61                 ret = -EFAULT;
  62                 goto err_delete;
  63         }
  64
  65         return 0;
  66
  67 err_delete:
  68         xa_erase(&vepc->page_array, index);
  69 err_free:
  70         sgx_free_epc_page(epc_page);
  71         return ret;
  72 }
  73
  74 static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
  75 {
  76         struct vm_area_struct *vma = vmf->vma;
  77         struct sgx_vepc *vepc = vma->vm_private_data;
  78         int ret;
  79
  80         mutex_lock(&vepc->lock);
  81         ret = __sgx_vepc_fault(vepc, vma, vmf->address);
  82         mutex_unlock(&vepc->lock);
  83
  84         if (!ret)
  85                 return VM_FAULT_NOPAGE;
  86
  87         if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
  88                 mmap_read_unlock(vma->vm_mm);
  89                 return VM_FAULT_RETRY;
  90         }
  91
  92         return VM_FAULT_SIGBUS;
  93 }
  94
  95 static const struct vm_operations_struct sgx_vepc_vm_ops = {
  96         .fault = sgx_vepc_fault,
  97 };
  98
  99 static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
 100 {
 101         struct sgx_vepc *vepc = file->private_data;
 102
 103         if (!(vma->vm_flags & VM_SHARED))
 104                 return -EINVAL;
 105
 106         vma->vm_ops = &sgx_vepc_vm_ops;
 107         /* Don't copy VMA in fork() */
 108         vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
 109         vma->vm_private_data = vepc;
 110
 111         return 0;
 112 }
 113
 /*
  * Run EREMOVE on a previously guest-owned EPC page so that it can be
  * returned to the general EPC page pool.
  *
  * Guests cannot be trusted to have left the page in a good state, so
  * EREMOVE is executed unconditionally.  If the guest already EREMOVE'd the
  * page properly, the extra EREMOVE is harmless.
  *
  * Return: the result of __eremove().
  */
 static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
 {
         void *addr = sgx_get_epc_virt_addr(epc_page);

         return __eremove(addr);
 }
 127
 128 static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 129 {
 130         int ret = sgx_vepc_remove_page(epc_page);
 131         if (ret) {
 132                 /*
 133                  * Only SGX_CHILD_PRESENT is expected, which is because of
 134                  * EREMOVE'ing an SECS still with child, in which case it can
 135                  * be handled by EREMOVE'ing the SECS again after all pages in
 136                  * virtual EPC have been EREMOVE'd. See comments in below in
 137                  * sgx_vepc_release().
 138                  *
 139                  * The user of virtual EPC (KVM) needs to guarantee there's no
 140                  * logical processor is still running in the enclave in guest,
 141                  * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
 142                  * handled here.
 143                  */
 144                 WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
 145                           ret, ret);
 146                 return ret;
 147         }
 148
 149         sgx_free_epc_page(epc_page);
 150         return 0;
 151 }
 152
 153 static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
 154 {
 155         struct sgx_epc_page *entry;
 156         unsigned long index;
 157         long failures = 0;
 158
 159         xa_for_each(&vepc->page_array, index, entry) {
 160                 int ret = sgx_vepc_remove_page(entry);
 161                 if (ret) {
 162                         if (ret == SGX_CHILD_PRESENT) {
 163                                 /* The page is a SECS, userspace will retry.  */
 164                                 failures++;
 165                         } else {
 166                                 /*
 167                                  * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
 168                                  * WARN, as userspace can induce said failures by
 169                                  * calling the ioctl concurrently on multiple vEPCs or
 170                                  * while one or more CPUs is running the enclave.  Only
 171                                  * a #PF on EREMOVE indicates a kernel/hardware issue.
 172                                  */
 173                                 WARN_ON_ONCE(encls_faulted(ret) &&
 174                                              ENCLS_TRAPNR(ret) != X86_TRAP_GP);
 175                                 return -EBUSY;
 176                         }
 177                 }
 178                 cond_resched();
 179         }
 180
 181         /*
 182          * Return the number of SECS pages that failed to be removed, so
 183          * userspace knows that it has to retry.
 184          */
 185         return failures;
 186 }
 187
 188 static int sgx_vepc_release(struct inode *inode, struct file *file)
 189 {
 190         struct sgx_vepc *vepc = file->private_data;
 191         struct sgx_epc_page *epc_page, *tmp, *entry;
 192         unsigned long index;
 193
 194         LIST_HEAD(secs_pages);
 195
 196         xa_for_each(&vepc->page_array, index, entry) {
 197                 /*
 198                  * Remove all normal, child pages.  sgx_vepc_free_page()
 199                  * will fail if EREMOVE fails, but this is OK and expected on
 200                  * SECS pages.  Those can only be EREMOVE'd *after* all their
 201                  * child pages. Retries below will clean them up.
 202                  */
 203                 if (sgx_vepc_free_page(entry))
 204                         continue;
 205
 206                 xa_erase(&vepc->page_array, index);
 207                 cond_resched();
 208         }
 209
 210         /*
 211          * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
 212          * only had children in this 'epc' area.
 213          */
 214         xa_for_each(&vepc->page_array, index, entry) {
 215                 epc_page = entry;
 216                 /*
 217                  * An EREMOVE failure here means that the SECS page still
 218                  * has children.  But, since all children in this 'sgx_vepc'
 219                  * have been removed, the SECS page must have a child on
 220                  * another instance.
 221                  */
 222                 if (sgx_vepc_free_page(epc_page))
 223                         list_add_tail(&epc_page->list, &secs_pages);
 224
 225                 xa_erase(&vepc->page_array, index);
 226                 cond_resched();
 227         }
 228
 229         /*
 230          * SECS pages are "pinned" by child pages, and "unpinned" once all
 231          * children have been EREMOVE'd.  A child page in this instance
 232          * may have pinned an SECS page encountered in an earlier release(),
 233          * creating a zombie.  Since some children were EREMOVE'd above,
 234          * try to EREMOVE all zombies in the hopes that one was unpinned.
 235          */
 236         mutex_lock(&zombie_secs_pages_lock);
 237         list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
 238                 /*
 239                  * Speculatively remove the page from the list of zombies,
 240                  * if the page is successfully EREMOVE'd it will be added to
 241                  * the list of free pages.  If EREMOVE fails, throw the page
 242                  * on the local list, which will be spliced on at the end.
 243                  */
 244                 list_del(&epc_page->list);
 245
 246                 if (sgx_vepc_free_page(epc_page))
 247                         list_add_tail(&epc_page->list, &secs_pages);
 248                 cond_resched();
 249         }
 250
 251         if (!list_empty(&secs_pages))
 252                 list_splice_tail(&secs_pages, &zombie_secs_pages);
 253         mutex_unlock(&zombie_secs_pages_lock);
 254
 255         xa_destroy(&vepc->page_array);
 256         kfree(vepc);
 257
 258         return 0;
 259 }
 260
 261 static int sgx_vepc_open(struct inode *inode, struct file *file)
 262 {
 263         struct sgx_vepc *vepc;
 264
 265         vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
 266         if (!vepc)
 267                 return -ENOMEM;
 268         mutex_init(&vepc->lock);
 269         xa_init(&vepc->page_array);
 270
 271         file->private_data = vepc;
 272
 273         return 0;
 274 }
 275
 276 static long sgx_vepc_ioctl(struct file *file,
 277                            unsigned int cmd, unsigned long arg)
 278 {
 279         struct sgx_vepc *vepc = file->private_data;
 280
 281         switch (cmd) {
 282         case SGX_IOC_VEPC_REMOVE_ALL:
 283                 if (arg)
 284                         return -EINVAL;
 285                 return sgx_vepc_remove_all(vepc);
 286
 287         default:
 288                 return -ENOTTY;
 289         }
 290 }
 291
 292 static const struct file_operations sgx_vepc_fops = {
 293         .owner          = THIS_MODULE,
 294         .open           = sgx_vepc_open,
 295         .unlocked_ioctl = sgx_vepc_ioctl,
 296         .compat_ioctl   = sgx_vepc_ioctl,
 297         .release        = sgx_vepc_release,
 298         .mmap           = sgx_vepc_mmap,
 299 };
 300
 301 static struct miscdevice sgx_vepc_dev = {
 302         .minor          = MISC_DYNAMIC_MINOR,
 303         .name           = "sgx_vepc",
 304         .nodename       = "sgx_vepc",
 305         .fops           = &sgx_vepc_fops,
 306 };
 307
 308 int __init sgx_vepc_init(void)
 309 {
 310         /* SGX virtualization requires KVM to work */
 311         if (!cpu_feature_enabled(X86_FEATURE_VMX))
 312                 return -ENODEV;
 313
 314         INIT_LIST_HEAD(&zombie_secs_pages);
 315         mutex_init(&zombie_secs_pages_lock);
 316
 317         return misc_register(&sgx_vepc_dev);
 318 }
 319
 320 /**
 321  * sgx_virt_ecreate() - Run ECREATE on behalf of guest
 322  * @pageinfo:   Pointer to PAGEINFO structure
 323  * @secs:       Userspace pointer to SECS page
 324  * @trapnr:     trap number injected to guest in case of ECREATE error
 325  *
 326  * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
 327  * of enforcing policies of guest's enclaves, and return the trap number
 328  * which should be injected to guest in case of any ECREATE error.
 329  *
 330  * Return:
 331  * -  0:        ECREATE was successful.
 332  * - <0:        on error.
 333  */
 334 int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
 335                      int *trapnr)
 336 {
 337         int ret;
 338
 339         /*
 340          * @secs is an untrusted, userspace-provided address.  It comes from
 341          * KVM and is assumed to be a valid pointer which points somewhere in
 342          * userspace.  This can fault and call SGX or other fault handlers when
 343          * userspace mapping @secs doesn't exist.
 344          *
 345          * Add a WARN() to make sure @secs is already valid userspace pointer
 346          * from caller (KVM), who should already have handled invalid pointer
 347          * case (for instance, made by malicious guest).  All other checks,
 348          * such as alignment of @secs, are deferred to ENCLS itself.
 349          */
 350         if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
 351                 return -EINVAL;
 352
 353         __uaccess_begin();
 354         ret = __ecreate(pageinfo, (void *)secs);
 355         __uaccess_end();
 356
 357         if (encls_faulted(ret)) {
 358                 *trapnr = ENCLS_TRAPNR(ret);
 359                 return -EFAULT;
 360         }
 361
 362         /* ECREATE doesn't return an error code, it faults or succeeds. */
 363         WARN_ON_ONCE(ret);
 364         return 0;
 365 }
 366 EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
 367
 368 static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
 369                             void __user *secs)
 370 {
 371         int ret;
 372
 373         /*
 374          * Make sure all userspace pointers from caller (KVM) are valid.
 375          * All other checks deferred to ENCLS itself.  Also see comment
 376          * for @secs in sgx_virt_ecreate().
 377          */
 378 #define SGX_EINITTOKEN_SIZE     304
 379         if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
 380                          !access_ok(token, SGX_EINITTOKEN_SIZE) ||
 381                          !access_ok(secs, PAGE_SIZE)))
 382                 return -EINVAL;
 383
 384         __uaccess_begin();
 385         ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
 386         __uaccess_end();
 387
 388         return ret;
 389 }
 390
 391 /**
 392  * sgx_virt_einit() - Run EINIT on behalf of guest
 393  * @sigstruct:          Userspace pointer to SIGSTRUCT structure
 394  * @token:              Userspace pointer to EINITTOKEN structure
 395  * @secs:               Userspace pointer to SECS page
 396  * @lepubkeyhash:       Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
 397  * @trapnr:             trap number injected to guest in case of EINIT error
 398  *
 399  * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
 400  * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
 401  * needs to update hardware values to guest's virtual MSR values in order to
 402  * ensure EINIT is executed with expected hardware values.
 403  *
 404  * Return:
 405  * -  0:        EINIT was successful.
 406  * - <0:        on error.
 407  */
 408 int sgx_virt_einit(void __user *sigstruct, void __user *token,
 409                    void __user *secs, u64 *lepubkeyhash, int *trapnr)
 410 {
 411         int ret;
 412
 413         if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
 414                 ret = __sgx_virt_einit(sigstruct, token, secs);
 415         } else {
 416                 preempt_disable();
 417
 418                 sgx_update_lepubkeyhash(lepubkeyhash);
 419
 420                 ret = __sgx_virt_einit(sigstruct, token, secs);
 421                 preempt_enable();
 422         }
 423
 424         /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
 425         if (ret == -EINVAL)
 426                 return ret;
 427
 428         if (encls_faulted(ret)) {
 429                 *trapnr = ENCLS_TRAPNR(ret);
 430                 return -EFAULT;
 431         }
 432
 433         return ret;
 434 }
 435 EXPORT_SYMBOL_GPL(sgx_virt_einit);