arch/x86/kvm/xen.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
4  * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5  *
6  * KVM Xen emulation
7  */
8
9 #include "x86.h"
10 #include "xen.h"
11 #include "hyperv.h"
12
13 #include <linux/kvm_host.h>
14 #include <linux/sched/stat.h>
15
16 #include <trace/events/kvm.h>
17 #include <xen/interface/xen.h>
18 #include <xen/interface/vcpu.h>
19 #include <xen/interface/event_channel.h>
20
21 #include "trace.h"
22
23 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
24
25 static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
26 {
27         struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
28         struct pvclock_wall_clock *wc;
29         gpa_t gpa = gfn_to_gpa(gfn);
30         u32 *wc_sec_hi;
31         u32 wc_version;
32         u64 wall_nsec;
33         int ret = 0;
34         int idx = srcu_read_lock(&kvm->srcu);
35
36         if (gfn == GPA_INVALID) {
37                 kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
38                 goto out;
39         }
40
41         do {
42                 ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN,
43                                                 gpa, PAGE_SIZE);
44                 if (ret)
45                         goto out;
46
47                 /*
48                  * This code mirrors kvm_write_wall_clock() except that it writes
49                  * directly through the pfn cache and doesn't mark the page dirty.
50                  */
51                 wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
52
53                 /* It could be invalid again already, so we need to check */
54                 read_lock_irq(&gpc->lock);
55
56                 if (gpc->valid)
57                         break;
58
59                 read_unlock_irq(&gpc->lock);
60         } while (1);
61
62         /* Paranoia checks on the 32-bit struct layout */
63         BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
64         BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
65         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
66
67 #ifdef CONFIG_X86_64
68         /* Paranoia checks on the 64-bit struct layout */
69         BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
70         BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
71
72         if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
73                 struct shared_info *shinfo = gpc->khva;
74
75                 wc_sec_hi = &shinfo->wc_sec_hi;
76                 wc = &shinfo->wc;
77         } else
78 #endif
79         {
80                 struct compat_shared_info *shinfo = gpc->khva;
81
82                 wc_sec_hi = &shinfo->arch.wc_sec_hi;
83                 wc = &shinfo->wc;
84         }
85
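        /*
         * Illustrative note (a sketch of the convention, not new behaviour):
         * wc->version follows the usual pvclock seqcount pattern.  It is
         * made odd while sec/nsec/sec_hi are being rewritten below, and
         * bumped back to an even value afterwards, so a guest retries its
         * read if it observes an odd or changing version.
         */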
86         /* Increment and ensure an odd value */
87         wc_version = wc->version = (wc->version + 1) | 1;
88         smp_wmb();
89
90         wc->nsec = do_div(wall_nsec, 1000000000);
91         wc->sec = (u32)wall_nsec;
92         *wc_sec_hi = wall_nsec >> 32;
93         smp_wmb();
94
95         wc->version = wc_version + 1;
96         read_unlock_irq(&gpc->lock);
97
98         kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
99
100 out:
101         srcu_read_unlock(&kvm->srcu, idx);
102         return ret;
103 }
104
105 static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
106 {
107         struct kvm_vcpu_xen *vx = &v->arch.xen;
108         u64 now = get_kvmclock_ns(v->kvm);
109         u64 delta_ns = now - vx->runstate_entry_time;
110         u64 run_delay = current->sched_info.run_delay;
111
112         if (unlikely(!vx->runstate_entry_time))
113                 vx->current_runstate = RUNSTATE_offline;
114
115         /*
116          * Time waiting for the scheduler isn't "stolen" if the
117          * vCPU wasn't running anyway.
118          */
119         if (vx->current_runstate == RUNSTATE_running) {
120                 u64 steal_ns = run_delay - vx->last_steal;
121
122                 delta_ns -= steal_ns;
123
124                 vx->runstate_times[RUNSTATE_runnable] += steal_ns;
125         }
126         vx->last_steal = run_delay;
127
128         vx->runstate_times[vx->current_runstate] += delta_ns;
129         vx->current_runstate = state;
130         vx->runstate_entry_time = now;
131 }
132
133 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
134 {
135         struct kvm_vcpu_xen *vx = &v->arch.xen;
136         struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
137         struct kvm_memslots *slots = kvm_memslots(v->kvm);
138         bool atomic = (state == RUNSTATE_runnable);
139         uint64_t state_entry_time;
140         int __user *user_state;
141         uint64_t __user *user_times;
142
143         kvm_xen_update_runstate(v, state);
144
145         if (!vx->runstate_set)
146                 return;
147
148         if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
149             kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
150                 return;
151
152         /* We made sure it fits in a single page */
153         BUG_ON(!ghc->memslot);
154
155         if (atomic)
156                 pagefault_disable();
157
158         /*
159          * The only difference between 32-bit and 64-bit versions of the
160          * runstate struct is the alignment of uint64_t in 32-bit, which
161          * means that the 64-bit version has an additional 4 bytes of
162          * padding after the first field 'state'.
163          *
164          * So we use 'int __user *user_state' to point to the state field,
165          * and 'uint64_t __user *user_times' for runstate_entry_time, so
166          * the actual array of time[] in each state starts at user_times[1].
167          */
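        /*
         * As an illustrative sketch (the offsets follow from the assertions
         * below; the real definitions live in the Xen interface headers):
         *
         *   compat (32-bit)                     64-bit
         *   0x00 int      state                 0x00 int      state
         *   0x04 uint64_t state_entry_time      0x04 (4 bytes of padding)
         *   0x0c uint64_t time[4]               0x08 uint64_t state_entry_time
         *   0x2c (total size)                   0x10 uint64_t time[4]
         */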
168         BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
169         BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
170         user_state = (int __user *)ghc->hva;
171
172         BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
173
174         user_times = (uint64_t __user *)(ghc->hva +
175                                          offsetof(struct compat_vcpu_runstate_info,
176                                                   state_entry_time));
177 #ifdef CONFIG_X86_64
178         BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
179                      offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
180         BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
181                      offsetof(struct compat_vcpu_runstate_info, time) + 4);
182
183         if (v->kvm->arch.xen.long_mode)
184                 user_times = (uint64_t __user *)(ghc->hva +
185                                                  offsetof(struct vcpu_runstate_info,
186                                                           state_entry_time));
187 #endif
188         /*
189          * First write the updated state_entry_time at the appropriate
190          * location, as pointed to by 'user_times'.
191          */
192         state_entry_time = vx->runstate_entry_time;
193         state_entry_time |= XEN_RUNSTATE_UPDATE;
194
195         BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
196                      sizeof(state_entry_time));
197         BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
198                      sizeof(state_entry_time));
199
200         if (__put_user(state_entry_time, user_times))
201                 goto out;
202         smp_wmb();
203
204         /*
205          * Next, write the new runstate. This is in the *same* place
206          * for 32-bit and 64-bit guests, asserted here for paranoia.
207          */
208         BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
209                      offsetof(struct compat_vcpu_runstate_info, state));
210         BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
211                      sizeof(vx->current_runstate));
212         BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
213                      sizeof(vx->current_runstate));
214
215         if (__put_user(vx->current_runstate, user_state))
216                 goto out;
217
218         /*
219          * Write the actual runstate times immediately after the
220          * runstate_entry_time.
221          */
222         BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
223                      offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
224         BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
225                      offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
226         BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
227                      sizeof_field(struct compat_vcpu_runstate_info, time));
228         BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
229                      sizeof(vx->runstate_times));
230
231         if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
232                 goto out;
233         smp_wmb();
234
235         /*
236          * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
237          * runstate_entry_time field.
238          */
239         state_entry_time &= ~XEN_RUNSTATE_UPDATE;
240         __put_user(state_entry_time, user_times);
241         smp_wmb();
242
243  out:
244         mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
245
246         if (atomic)
247                 pagefault_enable();
248 }
249
250 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
251 {
252         unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
253         bool atomic = in_atomic() || !task_is_running(current);
254         int err;
255         u8 rc = 0;
256
257         /*
258          * If the global upcall vector (HVMIRQ_callback_vector) is set and
259          * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
260          */
261         struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
262         struct kvm_memslots *slots = kvm_memslots(v->kvm);
263         bool ghc_valid = slots->generation == ghc->generation &&
264                 !kvm_is_error_hva(ghc->hva) && ghc->memslot;
265
266         unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
267
268         /* No need for compat handling here */
269         BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
270                      offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
271         BUILD_BUG_ON(sizeof(rc) !=
272                      sizeof_field(struct vcpu_info, evtchn_upcall_pending));
273         BUILD_BUG_ON(sizeof(rc) !=
274                      sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
275
276         /*
277          * For efficiency, this mirrors the checks for using the valid
278          * cache in kvm_read_guest_offset_cached(), but just uses
279          * __get_user() instead, falling back to the slow path on failure.
280          */
281         if (!evtchn_pending_sel && ghc_valid) {
282                 /* Fast path */
283                 pagefault_disable();
284                 err = __get_user(rc, (u8 __user *)ghc->hva + offset);
285                 pagefault_enable();
286                 if (!err)
287                         return rc;
288         }
289
290         /* Slow path */
291
292         /*
293          * This function gets called from kvm_vcpu_block() after setting the
294          * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
295          * from a HLT. So we really mustn't sleep. If the page ended up absent
296          * at that point, just return 1 in order to trigger an immediate wake,
297          * and we'll end up getting called again from a context where we *can*
298          * fault in the page and wait for it.
299          */
300         if (atomic)
301                 return 1;
302
303         if (!ghc_valid) {
304                 err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
305                 if (err || !ghc->memslot) {
306                         /*
307                          * If this failed, userspace has screwed up the
308                          * vcpu_info mapping. No interrupts for you.
309                          */
310                         return 0;
311                 }
312         }
313
314         /*
315          * Now we have a valid (protected by srcu) userspace HVA in
316          * ghc->hva which points to the struct vcpu_info. If there
317          * are any bits in the in-kernel evtchn_pending_sel then
318          * we need to write those to the guest vcpu_info and set
319          * its evtchn_upcall_pending flag. If there aren't any bits
320          * to add, we only want to *check* evtchn_upcall_pending.
321          */
322         if (evtchn_pending_sel) {
323                 bool long_mode = v->kvm->arch.xen.long_mode;
324
325                 if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
326                         return 0;
327
328                 if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
329                         struct vcpu_info __user *vi = (void __user *)ghc->hva;
330
331                         /* Attempt to set the evtchn_pending_sel bits in the
332                          * guest, and if that succeeds then clear the same
333                          * bits in the in-kernel version. */
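                        /*
                         * Roughly equivalent, ignoring atomicity and the
                         * fault handling supplied by the extable entry (a
                         * sketch for clarity only):
                         *   vi->evtchn_pending_sel |= evtchn_pending_sel;
                         *   v->arch.xen.evtchn_pending_sel &= ~evtchn_pending_sel;
                         */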
334                         asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
335                                      "\tnotq %0\n"
336                                      "\t" LOCK_PREFIX "andq %0, %2\n"
337                                      "2:\n"
338                                      _ASM_EXTABLE_UA(1b, 2b)
339                                      : "=r" (evtchn_pending_sel),
340                                        "+m" (vi->evtchn_pending_sel),
341                                        "+m" (v->arch.xen.evtchn_pending_sel)
342                                      : "0" (evtchn_pending_sel));
343                 } else {
344                         struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
345                         u32 evtchn_pending_sel32 = evtchn_pending_sel;
346
347                         /* Attempt to set the evtchn_pending_sel bits in the
348                          * guest, and if that succeeds then clear the same
349                          * bits in the in-kernel version. */
350                         asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
351                                      "\tnotl %0\n"
352                                      "\t" LOCK_PREFIX "andl %0, %2\n"
353                                      "2:\n"
354                                      _ASM_EXTABLE_UA(1b, 2b)
355                                      : "=r" (evtchn_pending_sel32),
356                                        "+m" (vi->evtchn_pending_sel),
357                                        "+m" (v->arch.xen.evtchn_pending_sel)
358                                      : "0" (evtchn_pending_sel32));
359                 }
360                 rc = 1;
361                 unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
362
363         err:
364                 user_access_end();
365
366                 mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
367         } else {
368                 __get_user(rc, (u8 __user *)ghc->hva + offset);
369         }
370
371         return rc;
372 }
373
374 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
375 {
376         int r = -ENOENT;
377
378         mutex_lock(&kvm->lock);
379
380         switch (data->type) {
381         case KVM_XEN_ATTR_TYPE_LONG_MODE:
382                 if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
383                         r = -EINVAL;
384                 } else {
385                         kvm->arch.xen.long_mode = !!data->u.long_mode;
386                         r = 0;
387                 }
388                 break;
389
390         case KVM_XEN_ATTR_TYPE_SHARED_INFO:
391                 r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
392                 break;
393
394         case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
395                 if (data->u.vector && data->u.vector < 0x10)
396                         r = -EINVAL;
397                 else {
398                         kvm->arch.xen.upcall_vector = data->u.vector;
399                         r = 0;
400                 }
401                 break;
402
403         default:
404                 break;
405         }
406
407         mutex_unlock(&kvm->lock);
408         return r;
409 }
410
411 int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
412 {
413         int r = -ENOENT;
414
415         mutex_lock(&kvm->lock);
416
417         switch (data->type) {
418         case KVM_XEN_ATTR_TYPE_LONG_MODE:
419                 data->u.long_mode = kvm->arch.xen.long_mode;
420                 r = 0;
421                 break;
422
423         case KVM_XEN_ATTR_TYPE_SHARED_INFO:
424                 if (kvm->arch.xen.shinfo_cache.active)
425                         data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
426                 else
427                         data->u.shared_info.gfn = GPA_INVALID;
428                 r = 0;
429                 break;
430
431         case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
432                 data->u.vector = kvm->arch.xen.upcall_vector;
433                 r = 0;
434                 break;
435
436         default:
437                 break;
438         }
439
440         mutex_unlock(&kvm->lock);
441         return r;
442 }
443
444 int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
445 {
446         int idx, r = -ENOENT;
447
448         mutex_lock(&vcpu->kvm->lock);
449         idx = srcu_read_lock(&vcpu->kvm->srcu);
450
451         switch (data->type) {
452         case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
453                 /* No compat necessary here. */
454                 BUILD_BUG_ON(sizeof(struct vcpu_info) !=
455                              sizeof(struct compat_vcpu_info));
456                 BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
457                              offsetof(struct compat_vcpu_info, time));
458
459                 if (data->u.gpa == GPA_INVALID) {
460                         vcpu->arch.xen.vcpu_info_set = false;
461                         r = 0;
462                         break;
463                 }
464
465                 /* It must fit within a single page */
466                 if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
467                         r = -EINVAL;
468                         break;
469                 }
470
471                 r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
472                                               &vcpu->arch.xen.vcpu_info_cache,
473                                               data->u.gpa,
474                                               sizeof(struct vcpu_info));
475                 if (!r) {
476                         vcpu->arch.xen.vcpu_info_set = true;
477                         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
478                 }
479                 break;
480
481         case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
482                 if (data->u.gpa == GPA_INVALID) {
483                         vcpu->arch.xen.vcpu_time_info_set = false;
484                         r = 0;
485                         break;
486                 }
487
488                 /* It must fit within a single page */
489                 if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
490                         r = -EINVAL;
491                         break;
492                 }
493
494                 r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
495                                               &vcpu->arch.xen.vcpu_time_info_cache,
496                                               data->u.gpa,
497                                               sizeof(struct pvclock_vcpu_time_info));
498                 if (!r) {
499                         vcpu->arch.xen.vcpu_time_info_set = true;
500                         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
501                 }
502                 break;
503
504         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
505                 if (!sched_info_on()) {
506                         r = -EOPNOTSUPP;
507                         break;
508                 }
509                 if (data->u.gpa == GPA_INVALID) {
510                         vcpu->arch.xen.runstate_set = false;
511                         r = 0;
512                         break;
513                 }
514
515                 /* It must fit within a single page */
516                 if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
517                         r = -EINVAL;
518                         break;
519                 }
520
521                 r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
522                                               &vcpu->arch.xen.runstate_cache,
523                                               data->u.gpa,
524                                               sizeof(struct vcpu_runstate_info));
525                 if (!r) {
526                         vcpu->arch.xen.runstate_set = true;
527                 }
528                 break;
529
530         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
531                 if (!sched_info_on()) {
532                         r = -EOPNOTSUPP;
533                         break;
534                 }
535                 if (data->u.runstate.state > RUNSTATE_offline) {
536                         r = -EINVAL;
537                         break;
538                 }
539
540                 kvm_xen_update_runstate(vcpu, data->u.runstate.state);
541                 r = 0;
542                 break;
543
544         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
545                 if (!sched_info_on()) {
546                         r = -EOPNOTSUPP;
547                         break;
548                 }
549                 if (data->u.runstate.state > RUNSTATE_offline) {
550                         r = -EINVAL;
551                         break;
552                 }
553                 if (data->u.runstate.state_entry_time !=
554                     (data->u.runstate.time_running +
555                      data->u.runstate.time_runnable +
556                      data->u.runstate.time_blocked +
557                      data->u.runstate.time_offline)) {
558                         r = -EINVAL;
559                         break;
560                 }
561                 if (get_kvmclock_ns(vcpu->kvm) <
562                     data->u.runstate.state_entry_time) {
563                         r = -EINVAL;
564                         break;
565                 }
566
567                 vcpu->arch.xen.current_runstate = data->u.runstate.state;
568                 vcpu->arch.xen.runstate_entry_time =
569                         data->u.runstate.state_entry_time;
570                 vcpu->arch.xen.runstate_times[RUNSTATE_running] =
571                         data->u.runstate.time_running;
572                 vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
573                         data->u.runstate.time_runnable;
574                 vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
575                         data->u.runstate.time_blocked;
576                 vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
577                         data->u.runstate.time_offline;
578                 vcpu->arch.xen.last_steal = current->sched_info.run_delay;
579                 r = 0;
580                 break;
581
582         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
583                 if (!sched_info_on()) {
584                         r = -EOPNOTSUPP;
585                         break;
586                 }
587                 if (data->u.runstate.state > RUNSTATE_offline &&
588                     data->u.runstate.state != (u64)-1) {
589                         r = -EINVAL;
590                         break;
591                 }
592                 /* The adjustment must add up */
593                 if (data->u.runstate.state_entry_time !=
594                     (data->u.runstate.time_running +
595                      data->u.runstate.time_runnable +
596                      data->u.runstate.time_blocked +
597                      data->u.runstate.time_offline)) {
598                         r = -EINVAL;
599                         break;
600                 }
601
602                 if (get_kvmclock_ns(vcpu->kvm) <
603                     (vcpu->arch.xen.runstate_entry_time +
604                      data->u.runstate.state_entry_time)) {
605                         r = -EINVAL;
606                         break;
607                 }
608
609                 vcpu->arch.xen.runstate_entry_time +=
610                         data->u.runstate.state_entry_time;
611                 vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
612                         data->u.runstate.time_running;
613                 vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
614                         data->u.runstate.time_runnable;
615                 vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
616                         data->u.runstate.time_blocked;
617                 vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
618                         data->u.runstate.time_offline;
619
620                 if (data->u.runstate.state <= RUNSTATE_offline)
621                         kvm_xen_update_runstate(vcpu, data->u.runstate.state);
622                 r = 0;
623                 break;
624
625         default:
626                 break;
627         }
628
629         srcu_read_unlock(&vcpu->kvm->srcu, idx);
630         mutex_unlock(&vcpu->kvm->lock);
631         return r;
632 }
633
634 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
635 {
636         int r = -ENOENT;
637
638         mutex_lock(&vcpu->kvm->lock);
639
640         switch (data->type) {
641         case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
642                 if (vcpu->arch.xen.vcpu_info_set)
643                         data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
644                 else
645                         data->u.gpa = GPA_INVALID;
646                 r = 0;
647                 break;
648
649         case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
650                 if (vcpu->arch.xen.vcpu_time_info_set)
651                         data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
652                 else
653                         data->u.gpa = GPA_INVALID;
654                 r = 0;
655                 break;
656
657         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
658                 if (!sched_info_on()) {
659                         r = -EOPNOTSUPP;
660                         break;
661                 }
662                 if (vcpu->arch.xen.runstate_set) {
663                         data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
664                         r = 0;
665                 }
666                 break;
667
668         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
669                 if (!sched_info_on()) {
670                         r = -EOPNOTSUPP;
671                         break;
672                 }
673                 data->u.runstate.state = vcpu->arch.xen.current_runstate;
674                 r = 0;
675                 break;
676
677         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
678                 if (!sched_info_on()) {
679                         r = -EOPNOTSUPP;
680                         break;
681                 }
682                 data->u.runstate.state = vcpu->arch.xen.current_runstate;
683                 data->u.runstate.state_entry_time =
684                         vcpu->arch.xen.runstate_entry_time;
685                 data->u.runstate.time_running =
686                         vcpu->arch.xen.runstate_times[RUNSTATE_running];
687                 data->u.runstate.time_runnable =
688                         vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
689                 data->u.runstate.time_blocked =
690                         vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
691                 data->u.runstate.time_offline =
692                         vcpu->arch.xen.runstate_times[RUNSTATE_offline];
693                 r = 0;
694                 break;
695
696         case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
697                 r = -EINVAL;
698                 break;
699
700         default:
701                 break;
702         }
703
704         mutex_unlock(&vcpu->kvm->lock);
705         return r;
706 }
707
708 int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
709 {
710         struct kvm *kvm = vcpu->kvm;
711         u32 page_num = data & ~PAGE_MASK;
712         u64 page_addr = data & PAGE_MASK;
713         bool lm = is_long_mode(vcpu);
714
715         /* Latch long_mode for shared_info pages etc. */
716         vcpu->kvm->arch.xen.long_mode = lm;
717
718         /*
719          * If Xen hypercall intercept is enabled, fill the hypercall
720          * page with VMCALL/VMMCALL instructions since that's what
721          * we catch. Else the VMM has provided the hypercall pages
722          * with instructions of its own choosing, so use those.
723          */
724         if (kvm_xen_hypercall_enabled(kvm)) {
725                 u8 instructions[32];
726                 int i;
727
728                 if (page_num)
729                         return 1;
730
731                 /* mov imm32, %eax */
732                 instructions[0] = 0xb8;
733
734                 /* vmcall / vmmcall */
735                 static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
736
737                 /* ret */
738                 instructions[8] = 0xc3;
739
740                 /* int3 to pad */
741                 memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
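                /*
                 * Rough layout of each 32-byte slot i written below (vmcall
                 * vs. vmmcall is whatever 3-byte sequence the patch_hypercall
                 * hook emits for the host CPU):
                 *
                 *   0x00  b8 <i>           mov  $i, %eax   (imm32 set per slot)
                 *   0x05  vmcall/vmmcall
                 *   0x08  c3               ret
                 *   0x09  cc .. cc         int3 padding to 32 bytes
                 */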
742
743                 for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
744                         *(u32 *)&instructions[1] = i;
745                         if (kvm_vcpu_write_guest(vcpu,
746                                                  page_addr + (i * sizeof(instructions)),
747                                                  instructions, sizeof(instructions)))
748                                 return 1;
749                 }
750         } else {
751                 /*
752                  * Note, truncation is a non-issue as 'lm' is guaranteed to be
753                  * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
754                  */
755                 hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
756                                      : kvm->arch.xen_hvm_config.blob_addr_32;
757                 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
758                                   : kvm->arch.xen_hvm_config.blob_size_32;
759                 u8 *page;
760
761                 if (page_num >= blob_size)
762                         return 1;
763
764                 blob_addr += page_num * PAGE_SIZE;
765
766                 page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
767                 if (IS_ERR(page))
768                         return PTR_ERR(page);
769
770                 if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
771                         kfree(page);
772                         return 1;
773                 }
774         }
775         return 0;
776 }
777
778 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
779 {
780         if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
781                 return -EINVAL;
782
783         /*
784          * With hypercall interception the kernel generates its own
785          * hypercall page so it must not be provided.
786          */
787         if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
788             (xhc->blob_addr_32 || xhc->blob_addr_64 ||
789              xhc->blob_size_32 || xhc->blob_size_64))
790                 return -EINVAL;
791
792         mutex_lock(&kvm->lock);
793
794         if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
795                 static_branch_inc(&kvm_xen_enabled.key);
796         else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
797                 static_branch_slow_dec_deferred(&kvm_xen_enabled);
798
799         memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
800
801         mutex_unlock(&kvm->lock);
802         return 0;
803 }
804
805 void kvm_xen_init_vm(struct kvm *kvm)
806 {
807 }
808
809 void kvm_xen_destroy_vm(struct kvm *kvm)
810 {
811         kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
812
813         if (kvm->arch.xen_hvm_config.msr)
814                 static_branch_slow_dec_deferred(&kvm_xen_enabled);
815 }
816
817 static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
818 {
819         kvm_rax_write(vcpu, result);
820         return kvm_skip_emulated_instruction(vcpu);
821 }
822
823 static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
824 {
825         struct kvm_run *run = vcpu->run;
826
827         if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
828                 return 1;
829
830         return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
831 }
832
833 int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
834 {
835         bool longmode;
836         u64 input, params[6];
837
838         input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
839
840         /* Hyper-V hypercalls get bit 31 set in EAX */
841         if ((input & 0x80000000) &&
842             kvm_hv_hypercall_enabled(vcpu))
843                 return kvm_hv_hypercall(vcpu);
844
845         longmode = is_64_bit_hypercall(vcpu);
846         if (!longmode) {
847                 params[0] = (u32)kvm_rbx_read(vcpu);
848                 params[1] = (u32)kvm_rcx_read(vcpu);
849                 params[2] = (u32)kvm_rdx_read(vcpu);
850                 params[3] = (u32)kvm_rsi_read(vcpu);
851                 params[4] = (u32)kvm_rdi_read(vcpu);
852                 params[5] = (u32)kvm_rbp_read(vcpu);
853         }
854 #ifdef CONFIG_X86_64
855         else {
856                 params[0] = (u64)kvm_rdi_read(vcpu);
857                 params[1] = (u64)kvm_rsi_read(vcpu);
858                 params[2] = (u64)kvm_rdx_read(vcpu);
859                 params[3] = (u64)kvm_r10_read(vcpu);
860                 params[4] = (u64)kvm_r8_read(vcpu);
861                 params[5] = (u64)kvm_r9_read(vcpu);
862         }
863 #endif
864         trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
865                                 params[3], params[4], params[5]);
866
867         vcpu->run->exit_reason = KVM_EXIT_XEN;
868         vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
869         vcpu->run->xen.u.hcall.longmode = longmode;
870         vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
871         vcpu->run->xen.u.hcall.input = input;
872         vcpu->run->xen.u.hcall.params[0] = params[0];
873         vcpu->run->xen.u.hcall.params[1] = params[1];
874         vcpu->run->xen.u.hcall.params[2] = params[2];
875         vcpu->run->xen.u.hcall.params[3] = params[3];
876         vcpu->run->xen.u.hcall.params[4] = params[4];
877         vcpu->run->xen.u.hcall.params[5] = params[5];
878         vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
879         vcpu->arch.complete_userspace_io =
880                 kvm_xen_hypercall_complete_userspace;
881
882         return 0;
883 }
884
885 static inline int max_evtchn_port(struct kvm *kvm)
886 {
887         if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
888                 return EVTCHN_2L_NR_CHANNELS;
889         else
890                 return COMPAT_EVTCHN_2L_NR_CHANNELS;
891 }
892
893 /*
894  * This follows the kvm_set_irq() API, so it returns:
895  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
896  *  = 0   Interrupt was coalesced (previous irq is still pending)
897  *  > 0   Number of CPUs interrupt was delivered to
898  */
899 int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
900                             struct kvm *kvm)
901 {
902         struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
903         struct kvm_vcpu *vcpu;
904         unsigned long *pending_bits, *mask_bits;
905         unsigned long flags;
906         int port_word_bit;
907         bool kick_vcpu = false;
908         int idx;
909         int rc;
910
911         vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
912         if (!vcpu)
913                 return -1;
914
915         if (!vcpu->arch.xen.vcpu_info_set)
916                 return -1;
917
918         if (e->xen_evtchn.port >= max_evtchn_port(kvm))
919                 return -1;
920
921         rc = -EWOULDBLOCK;
922         read_lock_irqsave(&gpc->lock, flags);
923
924         idx = srcu_read_lock(&kvm->srcu);
925         if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
926                 goto out_rcu;
927
928         if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
929                 struct shared_info *shinfo = gpc->khva;
930                 pending_bits = (unsigned long *)&shinfo->evtchn_pending;
931                 mask_bits = (unsigned long *)&shinfo->evtchn_mask;
932                 port_word_bit = e->xen_evtchn.port / 64;
933         } else {
934                 struct compat_shared_info *shinfo = gpc->khva;
935                 pending_bits = (unsigned long *)&shinfo->evtchn_pending;
936                 mask_bits = (unsigned long *)&shinfo->evtchn_mask;
937                 port_word_bit = e->xen_evtchn.port / 32;
938         }
939
940         /*
941          * If this port wasn't already set, and if it isn't masked, then
942          * we try to set the corresponding bit in the in-kernel shadow of
943          * evtchn_pending_sel for the target vCPU. And if *that* wasn't
944          * already set, then we kick the vCPU in question to write to the
945          * *real* evtchn_pending_sel in its own guest vcpu_info struct.
946          */
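        /*
         * A rough sketch of the resulting 2-level delivery path:
         *   1. the port's bit in shared_info->evtchn_pending[] (set here),
         *   2. the matching word's bit in the vCPU's evtchn_pending_sel
         *      (via the in-kernel shadow, written out when the vCPU next
         *      processes the kick),
         *   3. vcpu_info->evtchn_upcall_pending, which ultimately raises
         *      the upcall vector in the guest.
         */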
947         if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
948                 rc = 0; /* It was already raised */
949         } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
950                 rc = -1; /* Masked */
951         } else {
952                 rc = 1; /* Delivered. But was the vCPU waking already? */
953                 if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
954                         kick_vcpu = true;
955         }
956
957  out_rcu:
958         srcu_read_unlock(&kvm->srcu, idx);
959         read_unlock_irqrestore(&gpc->lock, flags);
960
961         if (kick_vcpu) {
962                 kvm_make_request(KVM_REQ_EVENT, vcpu);
963                 kvm_vcpu_kick(vcpu);
964         }
965
966         return rc;
967 }
968
969 /* This is the version called from kvm_set_irq() as the .set function */
970 static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
971                          int irq_source_id, int level, bool line_status)
972 {
973         bool mm_borrowed = false;
974         int rc;
975
976         if (!level)
977                 return -1;
978
979         rc = kvm_xen_set_evtchn_fast(e, kvm);
980         if (rc != -EWOULDBLOCK)
981                 return rc;
982
983         if (current->mm != kvm->mm) {
984                 /*
985                  * If not on a thread which already belongs to this KVM,
986                  * we'd better be in the irqfd workqueue.
987                  */
988                 if (WARN_ON_ONCE(current->mm))
989                         return -EINVAL;
990
991                 kthread_use_mm(kvm->mm);
992                 mm_borrowed = true;
993         }
994
995         /*
996          * For the irqfd workqueue, using the main kvm->lock mutex is
997          * fine since this function is invoked from kvm_set_irq() with
998          * no other lock held, no srcu. In future, if it is called
999          * directly from a vCPU thread (e.g. on hypercall for an IPI)
1000          * then it may need to switch to using a leaf-node mutex for
1001          * serializing the shared_info mapping.
1002          */
1003         mutex_lock(&kvm->lock);
1004
1005         /*
1006          * It is theoretically possible for the page to be unmapped
1007          * and the MMU notifier to invalidate the shared_info before
1008          * we even get to use it. In that case, this looks like an
1009          * infinite loop. It was tempting to do it via the userspace
1010          * HVA instead... but that just *hides* the fact that it's
1011          * an infinite loop, because if a fault occurs and it waits
1012          * for the page to come back, it can *still* immediately
1013          * fault and have to wait again, repeatedly.
1014          *
1015          * Conversely, the page could also have been reinstated by
1016          * another thread before we even obtain the mutex above, so
1017          * check again *first* before remapping it.
1018          */
1019         do {
1020                 struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1021                 int idx;
1022
1023                 rc = kvm_xen_set_evtchn_fast(e, kvm);
1024                 if (rc != -EWOULDBLOCK)
1025                         break;
1026
1027                 idx = srcu_read_lock(&kvm->srcu);
1028                 rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
1029                 srcu_read_unlock(&kvm->srcu, idx);
1030         } while (!rc);
1031
1032         mutex_unlock(&kvm->lock);
1033
1034         if (mm_borrowed)
1035                 kthread_unuse_mm(kvm->mm);
1036
1037         return rc;
1038 }
1039
1040 int kvm_xen_setup_evtchn(struct kvm *kvm,
1041                          struct kvm_kernel_irq_routing_entry *e,
1042                          const struct kvm_irq_routing_entry *ue)
1043
1044 {
1045         if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
1046                 return -EINVAL;
1047
1048         /* We only support 2 level event channels for now */
1049         if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1050                 return -EINVAL;
1051
1052         e->xen_evtchn.port = ue->u.xen_evtchn.port;
1053         e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
1054         e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
1055         e->set = evtchn_set_fn;
1056
1057         return 0;
1058 }