arch/x86/coco/tdx/tdx.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* Copyright (C) 2021-2022 Intel Corporation */
   3
   4 #undef pr_fmt
   5 #define pr_fmt(fmt)     "tdx: " fmt
   6
   7 #include <linux/cpufeature.h>
   8 #include <linux/export.h>
   9 #include <linux/io.h>
  10 #include <asm/coco.h>
  11 #include <asm/tdx.h>
  12 #include <asm/vmx.h>
  13 #include <asm/ia32.h>
  14 #include <asm/insn.h>
  15 #include <asm/insn-eval.h>
  16 #include <asm/pgtable.h>
  17
/*
 * MMIO direction: the direction argument of the MMIO emulation hypercall
 * (see mmio_read() and mmio_write()).
 */
#define EPT_READ        0
#define EPT_WRITE       1

/*
 * Port I/O direction: the direction argument of the port I/O emulation
 * hypercall (see handle_in() and handle_out()).
 */
#define PORT_READ       0
#define PORT_WRITE      1

/*
 * Decoders for the exit qualification reported with an I/O instruction #VE.
 * See Exit Qualification for I/O Instructions in VMX documentation.
 *
 *  - bit 3:       direction, non-zero for IN
 *  - bits 2:0:    access size in bytes, encoded as (size - 1)
 *  - bits 16..:   port number
 *  - bit 4:       non-zero for a string instruction
 */
#define VE_IS_IO_IN(e)          ((e) & BIT(3))
#define VE_GET_IO_SIZE(e)       (((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e)      ((e) >> 16)
#define VE_IS_IO_STRING(e)      ((e) & BIT(4))

/* TD attribute bits, tested against the RDX output of TDG.VP.INFO */
#define ATTR_DEBUG              BIT(0)
#define ATTR_SEPT_VE_DISABLE    BIT(28)

/* TDX Module call error codes: the code lives in the upper 32 bits of the result */
#define TDCALL_RETURN_CODE(a)   ((a) >> 32)
#define TDCALL_INVALID_OPERAND  0xc0000100

/* Report subtype passed in R8 to TDG.MR.REPORT by tdx_mcall_get_report0() */
#define TDREPORT_SUBTYPE_0      0
  40
/*
 * Called from __tdx_hypercall() for unrecoverable failure.
 *
 * The function is noinstr, so instrumentation is explicitly allowed with
 * instrumentation_begin() before panic() is called. The function is marked
 * __noreturn, hence there is no matching instrumentation_end().
 */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
        instrumentation_begin();
        panic("TDVMCALL failed. TDX module bug?");
}
  47
  48 #ifdef CONFIG_KVM_GUEST
  49 long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
  50                        unsigned long p3, unsigned long p4)
  51 {
  52         struct tdx_module_args args = {
  53                 .r10 = nr,
  54                 .r11 = p1,
  55                 .r12 = p2,
  56                 .r13 = p3,
  57                 .r14 = p4,
  58         };
  59
  60         return __tdx_hypercall(&args);
  61 }
  62 EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
  63 #endif
  64
  65 /*
  66  * Used for TDX guests to make calls directly to the TD module.  This
  67  * should only be used for calls that have no legitimate reason to fail
  68  * or where the kernel can not survive the call failing.
  69  */
  70 static inline void tdcall(u64 fn, struct tdx_module_args *args)
  71 {
  72         if (__tdcall_ret(fn, args))
  73                 panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
  74 }
  75
  76 /**
  77  * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
  78  *                           subtype 0) using TDG.MR.REPORT TDCALL.
  79  * @reportdata: Address of the input buffer which contains user-defined
  80  *              REPORTDATA to be included into TDREPORT.
  81  * @tdreport: Address of the output buffer to store TDREPORT.
  82  *
  83  * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
  84  * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
  85  * It is used in the TDX guest driver module to get the TDREPORT0.
  86  *
  87  * Return 0 on success, -EINVAL for invalid operands, or -EIO on
  88  * other TDCALL failures.
  89  */
  90 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
  91 {
  92         struct tdx_module_args args = {
  93                 .rcx = virt_to_phys(tdreport),
  94                 .rdx = virt_to_phys(reportdata),
  95                 .r8 = TDREPORT_SUBTYPE_0,
  96         };
  97         u64 ret;
  98
  99         ret = __tdcall(TDG_MR_REPORT, &args);
 100         if (ret) {
 101                 if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
 102                         return -EINVAL;
 103                 return -EIO;
 104         }
 105
 106         return 0;
 107 }
 108 EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
 109
 110 /**
 111  * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
 112  *                         hypercall.
 113  * @buf: Address of the directly mapped shared kernel buffer which
 114  *       contains TDREPORT. The same buffer will be used by VMM to
 115  *       store the generated TD Quote output.
 116  * @size: size of the tdquote buffer (4KB-aligned).
 117  *
 118  * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
 119  * v1.0 specification for more information on GetQuote hypercall.
 120  * It is used in the TDX guest driver module to get the TD Quote.
 121  *
 122  * Return 0 on success or error code on failure.
 123  */
 124 u64 tdx_hcall_get_quote(u8 *buf, size_t size)
 125 {
 126         /* Since buf is a shared memory, set the shared (decrypted) bits */
 127         return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
 128 }
 129 EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
 130
 131 static void __noreturn tdx_panic(const char *msg)
 132 {
 133         struct tdx_module_args args = {
 134                 .r10 = TDX_HYPERCALL_STANDARD,
 135                 .r11 = TDVMCALL_REPORT_FATAL_ERROR,
 136                 .r12 = 0, /* Error code: 0 is Panic */
 137         };
 138         union {
 139                 /* Define register order according to the GHCI */
 140                 struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
 141
 142                 char str[64];
 143         } message;
 144
 145         /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
 146         strtomem_pad(message.str, msg, '\0');
 147
 148         args.r8  = message.r8;
 149         args.r9  = message.r9;
 150         args.r14 = message.r14;
 151         args.r15 = message.r15;
 152         args.rdi = message.rdi;
 153         args.rsi = message.rsi;
 154         args.rbx = message.rbx;
 155         args.rdx = message.rdx;
 156
 157         /*
 158          * This hypercall should never return and it is not safe
 159          * to keep the guest running. Call it forever if it
 160          * happens to return.
 161          */
 162         while (1)
 163                 __tdx_hypercall(&args);
 164 }
 165
 166 static void tdx_parse_tdinfo(u64 *cc_mask)
 167 {
 168         struct tdx_module_args args = {};
 169         unsigned int gpa_width;
 170         u64 td_attr;
 171
 172         /*
 173          * TDINFO TDX module call is used to get the TD execution environment
 174          * information like GPA width, number of available vcpus, debug mode
 175          * information, etc. More details about the ABI can be found in TDX
 176          * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
 177          * [TDG.VP.INFO].
 178          */
 179         tdcall(TDG_VP_INFO, &args);
 180
 181         /*
 182          * The highest bit of a guest physical address is the "sharing" bit.
 183          * Set it for shared pages and clear it for private pages.
 184          *
 185          * The GPA width that comes out of this call is critical. TDX guests
 186          * can not meaningfully run without it.
 187          */
 188         gpa_width = args.rcx & GENMASK(5, 0);
 189         *cc_mask = BIT_ULL(gpa_width - 1);
 190
 191         /*
 192          * The kernel can not handle #VE's when accessing normal kernel
 193          * memory.  Ensure that no #VE will be delivered for accesses to
 194          * TD-private memory.  Only VMM-shared memory (MMIO) will #VE.
 195          */
 196         td_attr = args.rdx;
 197         if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
 198                 const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
 199
 200                 /* Relax SEPT_VE_DISABLE check for debug TD. */
 201                 if (td_attr & ATTR_DEBUG)
 202                         pr_warn("%s\n", msg);
 203                 else
 204                         tdx_panic(msg);
 205         }
 206 }
 207
 208 /*
 209  * The TDX module spec states that #VE may be injected for a limited set of
 210  * reasons:
 211  *
 212  *  - Emulation of the architectural #VE injection on EPT violation;
 213  *
 214  *  - As a result of guest TD execution of a disallowed instruction,
 215  *    a disallowed MSR access, or CPUID virtualization;
 216  *
 217  *  - A notification to the guest TD about anomalous behavior;
 218  *
 219  * The last one is opt-in and is not used by the kernel.
 220  *
 221  * The Intel Software Developer's Manual describes cases when instruction
 222  * length field can be used in section "Information for VM Exits Due to
 223  * Instruction Execution".
 224  *
 225  * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
 226  * information if #VE occurred due to instruction execution, but not for EPT
 227  * violations.
 228  */
 229 static int ve_instr_len(struct ve_info *ve)
 230 {
 231         switch (ve->exit_reason) {
 232         case EXIT_REASON_HLT:
 233         case EXIT_REASON_MSR_READ:
 234         case EXIT_REASON_MSR_WRITE:
 235         case EXIT_REASON_CPUID:
 236         case EXIT_REASON_IO_INSTRUCTION:
 237                 /* It is safe to use ve->instr_len for #VE due instructions */
 238                 return ve->instr_len;
 239         case EXIT_REASON_EPT_VIOLATION:
 240                 /*
 241                  * For EPT violations, ve->insn_len is not defined. For those,
 242                  * the kernel must decode instructions manually and should not
 243                  * be using this function.
 244                  */
 245                 WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
 246                 return 0;
 247         default:
 248                 WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
 249                 return ve->instr_len;
 250         }
 251 }
 252
 253 static u64 __cpuidle __halt(const bool irq_disabled)
 254 {
 255         struct tdx_module_args args = {
 256                 .r10 = TDX_HYPERCALL_STANDARD,
 257                 .r11 = hcall_func(EXIT_REASON_HLT),
 258                 .r12 = irq_disabled,
 259         };
 260
 261         /*
 262          * Emulate HLT operation via hypercall. More info about ABI
 263          * can be found in TDX Guest-Host-Communication Interface
 264          * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
 265          *
 266          * The VMM uses the "IRQ disabled" param to understand IRQ
 267          * enabled status (RFLAGS.IF) of the TD guest and to determine
 268          * whether or not it should schedule the halted vCPU if an
 269          * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
 270          * can keep the vCPU in virtual HLT, even if an IRQ is
 271          * pending, without hanging/breaking the guest.
 272          */
 273         return __tdx_hypercall(&args);
 274 }
 275
 276 static int handle_halt(struct ve_info *ve)
 277 {
 278         const bool irq_disabled = irqs_disabled();
 279
 280         if (__halt(irq_disabled))
 281                 return -EIO;
 282
 283         return ve_instr_len(ve);
 284 }
 285
 286 void __cpuidle tdx_safe_halt(void)
 287 {
 288         const bool irq_disabled = false;
 289
 290         /*
 291          * Use WARN_ONCE() to report the failure.
 292          */
 293         if (__halt(irq_disabled))
 294                 WARN_ONCE(1, "HLT instruction emulation failed\n");
 295 }
 296
 297 static int read_msr(struct pt_regs *regs, struct ve_info *ve)
 298 {
 299         struct tdx_module_args args = {
 300                 .r10 = TDX_HYPERCALL_STANDARD,
 301                 .r11 = hcall_func(EXIT_REASON_MSR_READ),
 302                 .r12 = regs->cx,
 303         };
 304
 305         /*
 306          * Emulate the MSR read via hypercall. More info about ABI
 307          * can be found in TDX Guest-Host-Communication Interface
 308          * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
 309          */
 310         if (__tdx_hypercall(&args))
 311                 return -EIO;
 312
 313         regs->ax = lower_32_bits(args.r11);
 314         regs->dx = upper_32_bits(args.r11);
 315         return ve_instr_len(ve);
 316 }
 317
 318 static int write_msr(struct pt_regs *regs, struct ve_info *ve)
 319 {
 320         struct tdx_module_args args = {
 321                 .r10 = TDX_HYPERCALL_STANDARD,
 322                 .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
 323                 .r12 = regs->cx,
 324                 .r13 = (u64)regs->dx << 32 | regs->ax,
 325         };
 326
 327         /*
 328          * Emulate the MSR write via hypercall. More info about ABI
 329          * can be found in TDX Guest-Host-Communication Interface
 330          * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
 331          */
 332         if (__tdx_hypercall(&args))
 333                 return -EIO;
 334
 335         return ve_instr_len(ve);
 336 }
 337
 338 static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
 339 {
 340         struct tdx_module_args args = {
 341                 .r10 = TDX_HYPERCALL_STANDARD,
 342                 .r11 = hcall_func(EXIT_REASON_CPUID),
 343                 .r12 = regs->ax,
 344                 .r13 = regs->cx,
 345         };
 346
 347         /*
 348          * Only allow VMM to control range reserved for hypervisor
 349          * communication.
 350          *
 351          * Return all-zeros for any CPUID outside the range. It matches CPU
 352          * behaviour for non-supported leaf.
 353          */
 354         if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
 355                 regs->ax = regs->bx = regs->cx = regs->dx = 0;
 356                 return ve_instr_len(ve);
 357         }
 358
 359         /*
 360          * Emulate the CPUID instruction via a hypercall. More info about
 361          * ABI can be found in TDX Guest-Host-Communication Interface
 362          * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
 363          */
 364         if (__tdx_hypercall(&args))
 365                 return -EIO;
 366
 367         /*
 368          * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
 369          * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
 370          * So copy the register contents back to pt_regs.
 371          */
 372         regs->ax = args.r12;
 373         regs->bx = args.r13;
 374         regs->cx = args.r14;
 375         regs->dx = args.r15;
 376
 377         return ve_instr_len(ve);
 378 }
 379
 380 static bool mmio_read(int size, unsigned long addr, unsigned long *val)
 381 {
 382         struct tdx_module_args args = {
 383                 .r10 = TDX_HYPERCALL_STANDARD,
 384                 .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
 385                 .r12 = size,
 386                 .r13 = EPT_READ,
 387                 .r14 = addr,
 388                 .r15 = *val,
 389         };
 390
 391         if (__tdx_hypercall(&args))
 392                 return false;
 393
 394         *val = args.r11;
 395         return true;
 396 }
 397
 398 static bool mmio_write(int size, unsigned long addr, unsigned long val)
 399 {
 400         return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
 401                                EPT_WRITE, addr, val);
 402 }
 403
 404 static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 405 {
 406         unsigned long *reg, val, vaddr;
 407         char buffer[MAX_INSN_SIZE];
 408         enum insn_mmio_type mmio;
 409         struct insn insn = {};
 410         int size, extend_size;
 411         u8 extend_val = 0;
 412
 413         /* Only in-kernel MMIO is supported */
 414         if (WARN_ON_ONCE(user_mode(regs)))
 415                 return -EFAULT;
 416
 417         if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
 418                 return -EFAULT;
 419
 420         if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
 421                 return -EINVAL;
 422
 423         mmio = insn_decode_mmio(&insn, &size);
 424         if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
 425                 return -EINVAL;
 426
 427         if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
 428                 reg = insn_get_modrm_reg_ptr(&insn, regs);
 429                 if (!reg)
 430                         return -EINVAL;
 431         }
 432
 433         /*
 434          * Reject EPT violation #VEs that split pages.
 435          *
 436          * MMIO accesses are supposed to be naturally aligned and therefore
 437          * never cross page boundaries. Seeing split page accesses indicates
 438          * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
 439          *
 440          * load_unaligned_zeropad() will recover using exception fixups.
 441          */
 442         vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
 443         if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
 444                 return -EFAULT;
 445
 446         /* Handle writes first */
 447         switch (mmio) {
 448         case INSN_MMIO_WRITE:
 449                 memcpy(&val, reg, size);
 450                 if (!mmio_write(size, ve->gpa, val))
 451                         return -EIO;
 452                 return insn.length;
 453         case INSN_MMIO_WRITE_IMM:
 454                 val = insn.immediate.value;
 455                 if (!mmio_write(size, ve->gpa, val))
 456                         return -EIO;
 457                 return insn.length;
 458         case INSN_MMIO_READ:
 459         case INSN_MMIO_READ_ZERO_EXTEND:
 460         case INSN_MMIO_READ_SIGN_EXTEND:
 461                 /* Reads are handled below */
 462                 break;
 463         case INSN_MMIO_MOVS:
 464         case INSN_MMIO_DECODE_FAILED:
 465                 /*
 466                  * MMIO was accessed with an instruction that could not be
 467                  * decoded or handled properly. It was likely not using io.h
 468                  * helpers or accessed MMIO accidentally.
 469                  */
 470                 return -EINVAL;
 471         default:
 472                 WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
 473                 return -EINVAL;
 474         }
 475
 476         /* Handle reads */
 477         if (!mmio_read(size, ve->gpa, &val))
 478                 return -EIO;
 479
 480         switch (mmio) {
 481         case INSN_MMIO_READ:
 482                 /* Zero-extend for 32-bit operation */
 483                 extend_size = size == 4 ? sizeof(*reg) : 0;
 484                 break;
 485         case INSN_MMIO_READ_ZERO_EXTEND:
 486                 /* Zero extend based on operand size */
 487                 extend_size = insn.opnd_bytes;
 488                 break;
 489         case INSN_MMIO_READ_SIGN_EXTEND:
 490                 /* Sign extend based on operand size */
 491                 extend_size = insn.opnd_bytes;
 492                 if (size == 1 && val & BIT(7))
 493                         extend_val = 0xFF;
 494                 else if (size > 1 && val & BIT(15))
 495                         extend_val = 0xFF;
 496                 break;
 497         default:
 498                 /* All other cases has to be covered with the first switch() */
 499                 WARN_ON_ONCE(1);
 500                 return -EINVAL;
 501         }
 502
 503         if (extend_size)
 504                 memset(reg, extend_val, extend_size);
 505         memcpy(reg, &val, size);
 506         return insn.length;
 507 }
 508
 509 static bool handle_in(struct pt_regs *regs, int size, int port)
 510 {
 511         struct tdx_module_args args = {
 512                 .r10 = TDX_HYPERCALL_STANDARD,
 513                 .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
 514                 .r12 = size,
 515                 .r13 = PORT_READ,
 516                 .r14 = port,
 517         };
 518         u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
 519         bool success;
 520
 521         /*
 522          * Emulate the I/O read via hypercall. More info about ABI can be found
 523          * in TDX Guest-Host-Communication Interface (GHCI) section titled
 524          * "TDG.VP.VMCALL<Instruction.IO>".
 525          */
 526         success = !__tdx_hypercall(&args);
 527
 528         /* Update part of the register affected by the emulated instruction */
 529         regs->ax &= ~mask;
 530         if (success)
 531                 regs->ax |= args.r11 & mask;
 532
 533         return success;
 534 }
 535
 536 static bool handle_out(struct pt_regs *regs, int size, int port)
 537 {
 538         u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
 539
 540         /*
 541          * Emulate the I/O write via hypercall. More info about ABI can be found
 542          * in TDX Guest-Host-Communication Interface (GHCI) section titled
 543          * "TDG.VP.VMCALL<Instruction.IO>".
 544          */
 545         return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
 546                                PORT_WRITE, port, regs->ax & mask);
 547 }
 548
 549 /*
 550  * Emulate I/O using hypercall.
 551  *
 552  * Assumes the IO instruction was using ax, which is enforced
 553  * by the standard io.h macros.
 554  *
 555  * Return True on success or False on failure.
 556  */
 557 static int handle_io(struct pt_regs *regs, struct ve_info *ve)
 558 {
 559         u32 exit_qual = ve->exit_qual;
 560         int size, port;
 561         bool in, ret;
 562
 563         if (VE_IS_IO_STRING(exit_qual))
 564                 return -EIO;
 565
 566         in   = VE_IS_IO_IN(exit_qual);
 567         size = VE_GET_IO_SIZE(exit_qual);
 568         port = VE_GET_PORT_NUM(exit_qual);
 569
 570
 571         if (in)
 572                 ret = handle_in(regs, size, port);
 573         else
 574                 ret = handle_out(regs, size, port);
 575         if (!ret)
 576                 return -EIO;
 577
 578         return ve_instr_len(ve);
 579 }
 580
 581 /*
 582  * Early #VE exception handler. Only handles a subset of port I/O.
 583  * Intended only for earlyprintk. If failed, return false.
 584  */
 585 __init bool tdx_early_handle_ve(struct pt_regs *regs)
 586 {
 587         struct ve_info ve;
 588         int insn_len;
 589
 590         tdx_get_ve_info(&ve);
 591
 592         if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
 593                 return false;
 594
 595         insn_len = handle_io(regs, &ve);
 596         if (insn_len < 0)
 597                 return false;
 598
 599         regs->ip += insn_len;
 600         return true;
 601 }
 602
 603 void tdx_get_ve_info(struct ve_info *ve)
 604 {
 605         struct tdx_module_args args = {};
 606
 607         /*
 608          * Called during #VE handling to retrieve the #VE info from the
 609          * TDX module.
 610          *
 611          * This has to be called early in #VE handling.  A "nested" #VE which
 612          * occurs before this will raise a #DF and is not recoverable.
 613          *
 614          * The call retrieves the #VE info from the TDX module, which also
 615          * clears the "#VE valid" flag. This must be done before anything else
 616          * because any #VE that occurs while the valid flag is set will lead to
 617          * #DF.
 618          *
 619          * Note, the TDX module treats virtual NMIs as inhibited if the #VE
 620          * valid flag is set. It means that NMI=>#VE will not result in a #DF.
 621          */
 622         tdcall(TDG_VP_VEINFO_GET, &args);
 623
 624         /* Transfer the output parameters */
 625         ve->exit_reason = args.rcx;
 626         ve->exit_qual   = args.rdx;
 627         ve->gla         = args.r8;
 628         ve->gpa         = args.r9;
 629         ve->instr_len   = lower_32_bits(args.r10);
 630         ve->instr_info  = upper_32_bits(args.r10);
 631 }
 632
 633 /*
 634  * Handle the user initiated #VE.
 635  *
 636  * On success, returns the number of bytes RIP should be incremented (>=0)
 637  * or -errno on error.
 638  */
 639 static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
 640 {
 641         switch (ve->exit_reason) {
 642         case EXIT_REASON_CPUID:
 643                 return handle_cpuid(regs, ve);
 644         default:
 645                 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
 646                 return -EIO;
 647         }
 648 }
 649
 650 static inline bool is_private_gpa(u64 gpa)
 651 {
 652         return gpa == cc_mkenc(gpa);
 653 }
 654
 655 /*
 656  * Handle the kernel #VE.
 657  *
 658  * On success, returns the number of bytes RIP should be incremented (>=0)
 659  * or -errno on error.
 660  */
 661 static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
 662 {
 663         switch (ve->exit_reason) {
 664         case EXIT_REASON_HLT:
 665                 return handle_halt(ve);
 666         case EXIT_REASON_MSR_READ:
 667                 return read_msr(regs, ve);
 668         case EXIT_REASON_MSR_WRITE:
 669                 return write_msr(regs, ve);
 670         case EXIT_REASON_CPUID:
 671                 return handle_cpuid(regs, ve);
 672         case EXIT_REASON_EPT_VIOLATION:
 673                 if (is_private_gpa(ve->gpa))
 674                         panic("Unexpected EPT-violation on private memory.");
 675                 return handle_mmio(regs, ve);
 676         case EXIT_REASON_IO_INSTRUCTION:
 677                 return handle_io(regs, ve);
 678         default:
 679                 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
 680                 return -EIO;
 681         }
 682 }
 683
 684 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
 685 {
 686         int insn_len;
 687
 688         if (user_mode(regs))
 689                 insn_len = virt_exception_user(regs, ve);
 690         else
 691                 insn_len = virt_exception_kernel(regs, ve);
 692         if (insn_len < 0)
 693                 return false;
 694
 695         /* After successful #VE handling, move the IP */
 696         regs->ip += insn_len;
 697
 698         return true;
 699 }
 700
 701 static bool tdx_tlb_flush_required(bool private)
 702 {
 703         /*
 704          * TDX guest is responsible for flushing TLB on private->shared
 705          * transition. VMM is responsible for flushing on shared->private.
 706          *
 707          * The VMM _can't_ flush private addresses as it can't generate PAs
 708          * with the guest's HKID.  Shared memory isn't subject to integrity
 709          * checking, i.e. the VMM doesn't need to flush for its own protection.
 710          *
 711          * There's no need to flush when converting from shared to private,
 712          * as flushing is the VMM's responsibility in this case, e.g. it must
 713          * flush to avoid integrity failures in the face of a buggy or
 714          * malicious guest.
 715          */
 716         return !private;
 717 }
 718
 719 static bool tdx_cache_flush_required(void)
 720 {
 721         /*
 722          * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
 723          * TDX doesn't have such capability.
 724          *
 725          * Flush cache unconditionally.
 726          */
 727         return true;
 728 }
 729
 730 /*
 731  * Notify the VMM about page mapping conversion. More info about ABI
 732  * can be found in TDX Guest-Host-Communication Interface (GHCI),
 733  * section "TDG.VP.VMCALL<MapGPA>".
 734  */
 735 static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
 736 {
 737         /* Retrying the hypercall a second time should succeed; use 3 just in case */
 738         const int max_retries_per_page = 3;
 739         int retry_count = 0;
 740
 741         if (!enc) {
 742                 /* Set the shared (decrypted) bits: */
 743                 start |= cc_mkdec(0);
 744                 end   |= cc_mkdec(0);
 745         }
 746
 747         while (retry_count < max_retries_per_page) {
 748                 struct tdx_module_args args = {
 749                         .r10 = TDX_HYPERCALL_STANDARD,
 750                         .r11 = TDVMCALL_MAP_GPA,
 751                         .r12 = start,
 752                         .r13 = end - start };
 753
 754                 u64 map_fail_paddr;
 755                 u64 ret = __tdx_hypercall(&args);
 756
 757                 if (ret != TDVMCALL_STATUS_RETRY)
 758                         return !ret;
 759                 /*
 760                  * The guest must retry the operation for the pages in the
 761                  * region starting at the GPA specified in R11. R11 comes
 762                  * from the untrusted VMM. Sanity check it.
 763                  */
 764                 map_fail_paddr = args.r11;
 765                 if (map_fail_paddr < start || map_fail_paddr >= end)
 766                         return false;
 767
 768                 /* "Consume" a retry without forward progress */
 769                 if (map_fail_paddr == start) {
 770                         retry_count++;
 771                         continue;
 772                 }
 773
 774                 start = map_fail_paddr;
 775                 retry_count = 0;
 776         }
 777
 778         return false;
 779 }
 780
 781 /*
 782  * Inform the VMM of the guest's intent for this physical page: shared with
 783  * the VMM or private to the guest.  The VMM is expected to change its mapping
 784  * of the page in response.
 785  */
 786 static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
 787 {
 788         phys_addr_t start = __pa(vaddr);
 789         phys_addr_t end   = __pa(vaddr + numpages * PAGE_SIZE);
 790
 791         if (!tdx_map_gpa(start, end, enc))
 792                 return false;
 793
 794         /* shared->private conversion requires memory to be accepted before use */
 795         if (enc)
 796                 return tdx_accept_memory(start, end);
 797
 798         return true;
 799 }
 800
 801 static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
 802                                           bool enc)
 803 {
 804         /*
 805          * Only handle shared->private conversion here.
 806          * See the comment in tdx_early_init().
 807          */
 808         if (enc)
 809                 return tdx_enc_status_changed(vaddr, numpages, enc);
 810         return true;
 811 }
 812
 813 static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
 814                                          bool enc)
 815 {
 816         /*
 817          * Only handle private->shared conversion here.
 818          * See the comment in tdx_early_init().
 819          */
 820         if (!enc)
 821                 return tdx_enc_status_changed(vaddr, numpages, enc);
 822         return true;
 823 }
 824
 825 void __init tdx_early_init(void)
 826 {
 827         struct tdx_module_args args = {
 828                 .rdx = TDCS_NOTIFY_ENABLES,
 829                 .r9 = -1ULL,
 830         };
 831         u64 cc_mask;
 832         u32 eax, sig[3];
 833
 834         cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2],  &sig[1]);
 835
 836         if (memcmp(TDX_IDENT, sig, sizeof(sig)))
 837                 return;
 838
 839         setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
 840
 841         /* TSC is the only reliable clock in TDX guest */
 842         setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
 843
 844         cc_vendor = CC_VENDOR_INTEL;
 845         tdx_parse_tdinfo(&cc_mask);
 846         cc_set_mask(cc_mask);
 847
 848         /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
 849         tdcall(TDG_VM_WR, &args);
 850
 851         /*
 852          * All bits above GPA width are reserved and kernel treats shared bit
 853          * as flag, not as part of physical address.
 854          *
 855          * Adjust physical mask to only cover valid GPA bits.
 856          */
 857         physical_mask &= cc_mask - 1;
 858
 859         /*
 860          * The kernel mapping should match the TDX metadata for the page.
 861          * load_unaligned_zeropad() can touch memory *adjacent* to that which is
 862          * owned by the caller and can catch even _momentary_ mismatches.  Bad
 863          * things happen on mismatch:
 864          *
 865          *   - Private mapping => Shared Page  == Guest shutdown
 866          *   - Shared mapping  => Private Page == Recoverable #VE
 867          *
 868          * guest.enc_status_change_prepare() converts the page from
 869          * shared=>private before the mapping becomes private.
 870          *
 871          * guest.enc_status_change_finish() converts the page from
 872          * private=>shared after the mapping becomes private.
 873          *
 874          * In both cases there is a temporary shared mapping to a private page,
 875          * which can result in a #VE.  But, there is never a private mapping to
 876          * a shared page.
 877          */
 878         x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
 879         x86_platform.guest.enc_status_change_finish  = tdx_enc_status_change_finish;
 880
 881         x86_platform.guest.enc_cache_flush_required  = tdx_cache_flush_required;
 882         x86_platform.guest.enc_tlb_flush_required    = tdx_tlb_flush_required;
 883
 884         /*
 885          * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
 886          * bringup low level code. That raises #VE which cannot be handled
 887          * there.
 888          *
 889          * Intel-TDX has a secure RDMSR hypercall, but that needs to be
 890          * implemented separately in the low level startup ASM code.
 891          * Until that is in place, disable parallel bringup for TDX.
 892          */
 893         x86_cpuinit.parallel_bringup = false;
 894
 895         pr_info("Guest detected\n");
 896 }