1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * AMD SVM support
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Authors:
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *   Avi Kivity   <avi@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #define pr_fmt(fmt) "SVM: " fmt
19
20 #include <linux/kvm_host.h>
21
22 #include "irq.h"
23 #include "mmu.h"
24 #include "kvm_cache_regs.h"
25 #include "x86.h"
26 #include "cpuid.h"
27 #include "pmu.h"
28
29 #include <linux/module.h>
30 #include <linux/mod_devicetable.h>
31 #include <linux/kernel.h>
32 #include <linux/vmalloc.h>
33 #include <linux/highmem.h>
34 #include <linux/sched.h>
35 #include <linux/trace_events.h>
36 #include <linux/slab.h>
37
38 #include <asm/apic.h>
39 #include <asm/perf_event.h>
40 #include <asm/tlbflush.h>
41 #include <asm/desc.h>
42 #include <asm/debugreg.h>
43 #include <asm/kvm_para.h>
44
45 #include <asm/virtext.h>
46 #include "trace.h"
47
48 #define __ex(x) __kvm_handle_fault_on_reboot(x)
49
50 MODULE_AUTHOR("Qumranet");
51 MODULE_LICENSE("GPL");
52
53 static const struct x86_cpu_id svm_cpu_id[] = {
54         X86_FEATURE_MATCH(X86_FEATURE_SVM),
55         {}
56 };
57 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
58
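/*
 * The SVM I/O permission map is 12 KiB and the MSR permission map is 8 KiB;
 * an order-2 (16 KiB) and an order-1 (8 KiB) page allocation cover them.
 */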
59 #define IOPM_ALLOC_ORDER 2
60 #define MSRPM_ALLOC_ORDER 1
61
62 #define SEG_TYPE_LDT 2
63 #define SEG_TYPE_BUSY_TSS16 3
64
65 #define SVM_FEATURE_NPT            (1 <<  0)
66 #define SVM_FEATURE_LBRV           (1 <<  1)
67 #define SVM_FEATURE_SVML           (1 <<  2)
68 #define SVM_FEATURE_NRIP           (1 <<  3)
69 #define SVM_FEATURE_TSC_RATE       (1 <<  4)
70 #define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
71 #define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
72 #define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
73 #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
74
75 #define SVM_AVIC_DOORBELL       0xc001011b
76
77 #define NESTED_EXIT_HOST        0       /* Exit handled on host level */
78 #define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
79 #define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
80
81 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
82
83 #define TSC_RATIO_RSVD          0xffffff0000000000ULL
84 #define TSC_RATIO_MIN           0x0000000000000001ULL
85 #define TSC_RATIO_MAX           0x000000ffffffffffULL
86
87 #define AVIC_HPA_MASK   ~((0xFFFULL << 52) | 0xFFF)
88
89 /*
90  * 0xff is broadcast, so the max index allowed for physical APIC ID
91  * table is 0xfe.  APIC IDs above 0xff are reserved.
92  */
93 #define AVIC_MAX_PHYSICAL_ID_COUNT      255
94
95 #define AVIC_UNACCEL_ACCESS_WRITE_MASK          1
96 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK         0xFF0
97 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK         0xFFFFFFFF
98
99 static bool erratum_383_found __read_mostly;
100
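/*
 * Host MSR values that are clobbered while the guest runs; they are saved
 * in svm_vcpu_load() and restored in svm_vcpu_put().
 */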
101 static const u32 host_save_user_msrs[] = {
102 #ifdef CONFIG_X86_64
103         MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
104         MSR_FS_BASE,
105 #endif
106         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
107         MSR_TSC_AUX,
108 };
109
110 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
111
112 struct kvm_vcpu;
113
114 struct nested_state {
115         struct vmcb *hsave;
116         u64 hsave_msr;
117         u64 vm_cr_msr;
118         u64 vmcb;
119
120         /* These are the merged vectors */
121         u32 *msrpm;
122
123         /* gpa pointers to the real vectors */
124         u64 vmcb_msrpm;
125         u64 vmcb_iopm;
126
127         /* A VMEXIT is required but not yet emulated */
128         bool exit_required;
129
130         /* cache for intercepts of the guest */
131         u32 intercept_cr;
132         u32 intercept_dr;
133         u32 intercept_exceptions;
134         u64 intercept;
135
136         /* Nested Paging related state */
137         u64 nested_cr3;
138 };
139
140 #define MSRPM_OFFSETS   16
141 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
142
143 /*
144  * Set osvw_len to a higher value when updated Revision Guides
145  * are published and we know what the new status bits are
146  */
147 static uint64_t osvw_len = 4, osvw_status;
148
149 struct vcpu_svm {
150         struct kvm_vcpu vcpu;
151         struct vmcb *vmcb;
152         unsigned long vmcb_pa;
153         struct svm_cpu_data *svm_data;
154         uint64_t asid_generation;
155         uint64_t sysenter_esp;
156         uint64_t sysenter_eip;
157         uint64_t tsc_aux;
158
159         u64 next_rip;
160
161         u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
162         struct {
163                 u16 fs;
164                 u16 gs;
165                 u16 ldt;
166                 u64 gs_base;
167         } host;
168
169         u32 *msrpm;
170
171         ulong nmi_iret_rip;
172
173         struct nested_state nested;
174
175         bool nmi_singlestep;
176
177         unsigned int3_injected;
178         unsigned long int3_rip;
179         u32 apf_reason;
180
181         /* cached guest cpuid flags for faster access */
182         bool nrips_enabled      : 1;
183
184         u32 ldr_reg;
185         struct page *avic_backing_page;
186         u64 *avic_physical_id_cache;
187         bool avic_is_running;
188 };
189
190 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK    (0xFF)
191 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK                (1 << 31)
192
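/*
 * Layout of a 64-bit entry in the AVIC physical APIC ID table: host
 * physical APIC ID in bits 7:0, host physical address of the vAPIC
 * backing page in bits 51:12, is-running flag in bit 62 and valid flag
 * in bit 63.
 */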
193 #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK    (0xFFULL)
194 #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK        (0xFFFFFFFFFFULL << 12)
195 #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK          (1ULL << 62)
196 #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK               (1ULL << 63)
197
198 static DEFINE_PER_CPU(u64, current_tsc_ratio);
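/*
 * MSR_AMD64_TSC_RATIO holds a fixed-point multiplier with 32 fractional
 * bits, so 0x0100000000 is a ratio of 1.0 (no TSC scaling).
 */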
199 #define TSC_RATIO_DEFAULT       0x0100000000ULL
200
201 #define MSR_INVALID                     0xffffffffU
202
203 static const struct svm_direct_access_msrs {
204         u32 index;   /* Index of the MSR */
205         bool always; /* True if intercept is always on */
206 } direct_access_msrs[] = {
207         { .index = MSR_STAR,                            .always = true  },
208         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
209 #ifdef CONFIG_X86_64
210         { .index = MSR_GS_BASE,                         .always = true  },
211         { .index = MSR_FS_BASE,                         .always = true  },
212         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
213         { .index = MSR_LSTAR,                           .always = true  },
214         { .index = MSR_CSTAR,                           .always = true  },
215         { .index = MSR_SYSCALL_MASK,                    .always = true  },
216 #endif
217         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
218         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
219         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
220         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
221         { .index = MSR_INVALID,                         .always = false },
222 };
223
224 /* enable NPT for AMD64 and X86 with PAE */
225 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
226 static bool npt_enabled = true;
227 #else
228 static bool npt_enabled;
229 #endif
230
231 /* allow nested paging (virtualized MMU) for all guests */
232 static int npt = true;
233 module_param(npt, int, S_IRUGO);
234
235 /* allow nested virtualization in KVM/SVM */
236 static int nested = true;
237 module_param(nested, int, S_IRUGO);
238
239 /* enable / disable AVIC */
240 static int avic;
241 #ifdef CONFIG_X86_LOCAL_APIC
242 module_param(avic, int, S_IRUGO);
243 #endif
244
245 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
246 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
247 static void svm_complete_interrupts(struct vcpu_svm *svm);
248
249 static int nested_svm_exit_handled(struct vcpu_svm *svm);
250 static int nested_svm_intercept(struct vcpu_svm *svm);
251 static int nested_svm_vmexit(struct vcpu_svm *svm);
252 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
253                                       bool has_error_code, u32 error_code);
254
255 enum {
256         VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
257                             pause filter count */
258         VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
259         VMCB_ASID,       /* ASID */
260         VMCB_INTR,       /* int_ctl, int_vector */
261         VMCB_NPT,        /* npt_en, nCR3, gPAT */
262         VMCB_CR,         /* CR0, CR3, CR4, EFER */
263         VMCB_DR,         /* DR6, DR7 */
264         VMCB_DT,         /* GDT, IDT */
265         VMCB_SEG,        /* CS, DS, SS, ES, CPL */
266         VMCB_CR2,        /* CR2 only */
267         VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
268         VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
269                           * AVIC PHYSICAL_TABLE pointer,
270                           * AVIC LOGICAL_TABLE pointer
271                           */
272         VMCB_DIRTY_MAX,
273 };
274
275 /* TPR and CR2 are always written before VMRUN */
276 #define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
277
278 #define VMCB_AVIC_APIC_BAR_MASK         0xFFFFFFFFFF000ULL
279
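/*
 * VMCB clean bits tell the CPU which parts of the VMCB it may reuse from
 * its internal cache on the next VMRUN.  Whenever software modifies a
 * field, the corresponding clean bit must be cleared ("dirtied") so the
 * CPU reloads that state.
 */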
280 static inline void mark_all_dirty(struct vmcb *vmcb)
281 {
282         vmcb->control.clean = 0;
283 }
284
285 static inline void mark_all_clean(struct vmcb *vmcb)
286 {
287         vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
288                                & ~VMCB_ALWAYS_DIRTY_MASK;
289 }
290
291 static inline void mark_dirty(struct vmcb *vmcb, int bit)
292 {
293         vmcb->control.clean &= ~(1 << bit);
294 }
295
296 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
297 {
298         return container_of(vcpu, struct vcpu_svm, vcpu);
299 }
300
301 static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
302 {
303         svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
304         mark_dirty(svm->vmcb, VMCB_AVIC);
305 }
306
307 static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
308 {
309         struct vcpu_svm *svm = to_svm(vcpu);
310         u64 *entry = svm->avic_physical_id_cache;
311
312         if (!entry)
313                 return false;
314
315         return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
316 }
317
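/*
 * While a nested guest (L2) is running, the active VMCB must trigger on
 * everything that either KVM or the L1 hypervisor wants intercepted, so
 * the host (hsave) and nested intercept masks are OR'ed together.
 */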
318 static void recalc_intercepts(struct vcpu_svm *svm)
319 {
320         struct vmcb_control_area *c, *h;
321         struct nested_state *g;
322
323         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
324
325         if (!is_guest_mode(&svm->vcpu))
326                 return;
327
328         c = &svm->vmcb->control;
329         h = &svm->nested.hsave->control;
330         g = &svm->nested;
331
332         c->intercept_cr = h->intercept_cr | g->intercept_cr;
333         c->intercept_dr = h->intercept_dr | g->intercept_dr;
334         c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
335         c->intercept = h->intercept | g->intercept;
336 }
337
338 static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
339 {
340         if (is_guest_mode(&svm->vcpu))
341                 return svm->nested.hsave;
342         else
343                 return svm->vmcb;
344 }
345
346 static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
347 {
348         struct vmcb *vmcb = get_host_vmcb(svm);
349
350         vmcb->control.intercept_cr |= (1U << bit);
351
352         recalc_intercepts(svm);
353 }
354
355 static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
356 {
357         struct vmcb *vmcb = get_host_vmcb(svm);
358
359         vmcb->control.intercept_cr &= ~(1U << bit);
360
361         recalc_intercepts(svm);
362 }
363
364 static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
365 {
366         struct vmcb *vmcb = get_host_vmcb(svm);
367
368         return vmcb->control.intercept_cr & (1U << bit);
369 }
370
371 static inline void set_dr_intercepts(struct vcpu_svm *svm)
372 {
373         struct vmcb *vmcb = get_host_vmcb(svm);
374
375         vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
376                 | (1 << INTERCEPT_DR1_READ)
377                 | (1 << INTERCEPT_DR2_READ)
378                 | (1 << INTERCEPT_DR3_READ)
379                 | (1 << INTERCEPT_DR4_READ)
380                 | (1 << INTERCEPT_DR5_READ)
381                 | (1 << INTERCEPT_DR6_READ)
382                 | (1 << INTERCEPT_DR7_READ)
383                 | (1 << INTERCEPT_DR0_WRITE)
384                 | (1 << INTERCEPT_DR1_WRITE)
385                 | (1 << INTERCEPT_DR2_WRITE)
386                 | (1 << INTERCEPT_DR3_WRITE)
387                 | (1 << INTERCEPT_DR4_WRITE)
388                 | (1 << INTERCEPT_DR5_WRITE)
389                 | (1 << INTERCEPT_DR6_WRITE)
390                 | (1 << INTERCEPT_DR7_WRITE);
391
392         recalc_intercepts(svm);
393 }
394
395 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
396 {
397         struct vmcb *vmcb = get_host_vmcb(svm);
398
399         vmcb->control.intercept_dr = 0;
400
401         recalc_intercepts(svm);
402 }
403
404 static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
405 {
406         struct vmcb *vmcb = get_host_vmcb(svm);
407
408         vmcb->control.intercept_exceptions |= (1U << bit);
409
410         recalc_intercepts(svm);
411 }
412
413 static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
414 {
415         struct vmcb *vmcb = get_host_vmcb(svm);
416
417         vmcb->control.intercept_exceptions &= ~(1U << bit);
418
419         recalc_intercepts(svm);
420 }
421
422 static inline void set_intercept(struct vcpu_svm *svm, int bit)
423 {
424         struct vmcb *vmcb = get_host_vmcb(svm);
425
426         vmcb->control.intercept |= (1ULL << bit);
427
428         recalc_intercepts(svm);
429 }
430
431 static inline void clr_intercept(struct vcpu_svm *svm, int bit)
432 {
433         struct vmcb *vmcb = get_host_vmcb(svm);
434
435         vmcb->control.intercept &= ~(1ULL << bit);
436
437         recalc_intercepts(svm);
438 }
439
440 static inline void enable_gif(struct vcpu_svm *svm)
441 {
442         svm->vcpu.arch.hflags |= HF_GIF_MASK;
443 }
444
445 static inline void disable_gif(struct vcpu_svm *svm)
446 {
447         svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
448 }
449
450 static inline bool gif_set(struct vcpu_svm *svm)
451 {
452         return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
453 }
454
455 static unsigned long iopm_base;
456
457 struct kvm_ldttss_desc {
458         u16 limit0;
459         u16 base0;
460         unsigned base1:8, type:5, dpl:2, p:1;
461         unsigned limit1:4, zero0:3, g:1, base2:8;
462         u32 base3;
463         u32 zero1;
464 } __attribute__((packed));
465
466 struct svm_cpu_data {
467         int cpu;
468
469         u64 asid_generation;
470         u32 max_asid;
471         u32 next_asid;
472         struct kvm_ldttss_desc *tss_desc;
473
474         struct page *save_area;
475 };
476
477 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
478
479 struct svm_init_data {
480         int cpu;
481         int r;
482 };
483
484 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
485
486 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
487 #define MSRS_RANGE_SIZE 2048
488 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
489
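/*
 * The MSR permission map dedicates two bits to every MSR (read intercept
 * and write intercept) and one 2 KiB region to each of the three ranges
 * above, i.e. MSRS_IN_RANGE = 2048 * 8 / 2 = 8192 MSRs per range.
 * Return the u32 offset of the word covering the MSR; for example
 * MSR_STAR (0xc0000081) lands at byte 2048 + 0x81 / 4 = 2080, u32
 * offset 520.
 */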
490 static u32 svm_msrpm_offset(u32 msr)
491 {
492         u32 offset;
493         int i;
494
495         for (i = 0; i < NUM_MSR_MAPS; i++) {
496                 if (msr < msrpm_ranges[i] ||
497                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
498                         continue;
499
500                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
501                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
502
503                 /* Now we have the u8 offset - but need the u32 offset */
504                 return offset / 4;
505         }
506
507         /* MSR not in any range */
508         return MSR_INVALID;
509 }
510
511 #define MAX_INST_SIZE 15
512
513 static inline void clgi(void)
514 {
515         asm volatile (__ex(SVM_CLGI));
516 }
517
518 static inline void stgi(void)
519 {
520         asm volatile (__ex(SVM_STGI));
521 }
522
523 static inline void invlpga(unsigned long addr, u32 asid)
524 {
525         asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
526 }
527
528 static int get_npt_level(void)
529 {
530 #ifdef CONFIG_X86_64
531         return PT64_ROOT_LEVEL;
532 #else
533         return PT32E_ROOT_LEVEL;
534 #endif
535 }
536
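/*
 * VMRUN requires EFER.SVME to be set in the VMCB, so it is forced on
 * here; the value the guest requested is kept in vcpu->arch.efer.
 */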
537 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
538 {
539         vcpu->arch.efer = efer;
540         if (!npt_enabled && !(efer & EFER_LMA))
541                 efer &= ~EFER_LME;
542
543         to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
544         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
545 }
546
547 static int is_external_interrupt(u32 info)
548 {
549         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
550         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
551 }
552
553 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
554 {
555         struct vcpu_svm *svm = to_svm(vcpu);
556         u32 ret = 0;
557
558         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
559                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
560         return ret;
561 }
562
563 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
564 {
565         struct vcpu_svm *svm = to_svm(vcpu);
566
567         if (mask == 0)
568                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
569         else
570                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
571
572 }
573
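/*
 * With the NextRIP (NRIPS) feature the CPU reports the address of the
 * instruction following the intercepted one in the VMCB, so skipping it
 * is a simple RIP write; without NRIPS the instruction has to be decoded
 * by the emulator (EMULTYPE_SKIP).
 */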
574 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
575 {
576         struct vcpu_svm *svm = to_svm(vcpu);
577
578         if (svm->vmcb->control.next_rip != 0) {
579                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
580                 svm->next_rip = svm->vmcb->control.next_rip;
581         }
582
583         if (!svm->next_rip) {
584                 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
585                                 EMULATE_DONE)
586                         printk(KERN_DEBUG "%s: NOP\n", __func__);
587                 return;
588         }
589         if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
590                 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
591                        __func__, kvm_rip_read(vcpu), svm->next_rip);
592
593         kvm_rip_write(vcpu, svm->next_rip);
594         svm_set_interrupt_shadow(vcpu, 0);
595 }
596
597 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
598                                 bool has_error_code, u32 error_code,
599                                 bool reinject)
600 {
601         struct vcpu_svm *svm = to_svm(vcpu);
602
603         /*
604          * If we are within a nested VM we'd better #VMEXIT and let the guest
605          * handle the exception
606          */
607         if (!reinject &&
608             nested_svm_check_exception(svm, nr, has_error_code, error_code))
609                 return;
610
611         if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
612                 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
613
614                 /*
615                  * For guest debugging where we have to reinject #BP if some
616                  * INT3 is guest-owned:
617                  * Emulate nRIP by moving RIP forward. Will fail if injection
618                  * raises a fault that is not intercepted. Still better than
619                  * failing in all cases.
620                  */
621                 skip_emulated_instruction(&svm->vcpu);
622                 rip = kvm_rip_read(&svm->vcpu);
623                 svm->int3_rip = rip + svm->vmcb->save.cs.base;
624                 svm->int3_injected = rip - old_rip;
625         }
626
627         svm->vmcb->control.event_inj = nr
628                 | SVM_EVTINJ_VALID
629                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
630                 | SVM_EVTINJ_TYPE_EXEPT;
631         svm->vmcb->control.event_inj_err = error_code;
632 }
633
634 static void svm_init_erratum_383(void)
635 {
636         u32 low, high;
637         int err;
638         u64 val;
639
640         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
641                 return;
642
643         /* Use _safe variants to not break nested virtualization */
644         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
645         if (err)
646                 return;
647
648         val |= (1ULL << 47);
649
650         low  = lower_32_bits(val);
651         high = upper_32_bits(val);
652
653         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
654
655         erratum_383_found = true;
656 }
657
658 static void svm_init_osvw(struct kvm_vcpu *vcpu)
659 {
660         /*
661          * Guests should see errata 400 and 415 as fixed (assuming that
662          * HLT and IO instructions are intercepted).
663          */
664         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
665         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
666
667         /*
668          * By increasing VCPU's osvw.length to 3 we are telling the guest that
669          * all osvw.status bits inside that length, including bit 0 (which is
670          * reserved for erratum 298), are valid. However, if the host processor's
671          * osvw_len is 0 then osvw_status[0] carries no information. We need to
672          * be conservative here and therefore we tell the guest that erratum 298
673          * is present (because we really don't know).
674          */
675         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
676                 vcpu->arch.osvw.status |= 1;
677 }
678
679 static int has_svm(void)
680 {
681         const char *msg;
682
683         if (!cpu_has_svm(&msg)) {
684                 printk(KERN_INFO "has_svm: %s\n", msg);
685                 return 0;
686         }
687
688         return 1;
689 }
690
691 static void svm_hardware_disable(void)
692 {
693         /* Make sure we clean up behind us */
694         if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
695                 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
696
697         cpu_svm_disable();
698
699         amd_pmu_disable_virt();
700 }
701
702 static int svm_hardware_enable(void)
703 {
704
705         struct svm_cpu_data *sd;
706         uint64_t efer;
707         struct desc_ptr gdt_descr;
708         struct desc_struct *gdt;
709         int me = raw_smp_processor_id();
710
711         rdmsrl(MSR_EFER, efer);
712         if (efer & EFER_SVME)
713                 return -EBUSY;
714
715         if (!has_svm()) {
716                 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
717                 return -EINVAL;
718         }
719         sd = per_cpu(svm_data, me);
720         if (!sd) {
721                 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
722                 return -EINVAL;
723         }
724
725         sd->asid_generation = 1;
726         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
727         sd->next_asid = sd->max_asid + 1;
728
729         native_store_gdt(&gdt_descr);
730         gdt = (struct desc_struct *)gdt_descr.address;
731         sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
732
733         wrmsrl(MSR_EFER, efer | EFER_SVME);
734
735         wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
736
737         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
738                 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
739                 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
740         }
741
742
743         /*
744          * Get OSVW bits.
745          *
746          * Note that it is possible to have a system with mixed processor
747          * revisions and therefore different OSVW bits. If bits are not the same
748          * on different processors then choose the worst case (i.e. if erratum
749          * is present on one processor and not on another then assume that the
750          * erratum is present everywhere).
751          */
752         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
753                 uint64_t len, status = 0;
754                 int err;
755
756                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
757                 if (!err)
758                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
759                                                       &err);
760
761                 if (err)
762                         osvw_status = osvw_len = 0;
763                 else {
764                         if (len < osvw_len)
765                                 osvw_len = len;
766                         osvw_status |= status;
767                         osvw_status &= (1ULL << osvw_len) - 1;
768                 }
769         } else
770                 osvw_status = osvw_len = 0;
771
772         svm_init_erratum_383();
773
774         amd_pmu_enable_virt();
775
776         return 0;
777 }
778
779 static void svm_cpu_uninit(int cpu)
780 {
781         struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
782
783         if (!sd)
784                 return;
785
786         per_cpu(svm_data, raw_smp_processor_id()) = NULL;
787         __free_page(sd->save_area);
788         kfree(sd);
789 }
790
791 static int svm_cpu_init(int cpu)
792 {
793         struct svm_cpu_data *sd;
794         int r;
795
796         sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
797         if (!sd)
798                 return -ENOMEM;
799         sd->cpu = cpu;
800         sd->save_area = alloc_page(GFP_KERNEL);
801         r = -ENOMEM;
802         if (!sd->save_area)
803                 goto err_1;
804
805         per_cpu(svm_data, cpu) = sd;
806
807         return 0;
808
809 err_1:
810         kfree(sd);
811         return r;
812
813 }
814
815 static bool valid_msr_intercept(u32 index)
816 {
817         int i;
818
819         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
820                 if (direct_access_msrs[i].index == index)
821                         return true;
822
823         return false;
824 }
825
826 static void set_msr_interception(u32 *msrpm, unsigned msr,
827                                  int read, int write)
828 {
829         u8 bit_read, bit_write;
830         unsigned long tmp;
831         u32 offset;
832
833         /*
834  * If this warning triggers, extend the direct_access_msrs list at the
835          * beginning of the file
836          */
837         WARN_ON(!valid_msr_intercept(msr));
838
839         offset    = svm_msrpm_offset(msr);
840         bit_read  = 2 * (msr & 0x0f);
841         bit_write = 2 * (msr & 0x0f) + 1;
842         tmp       = msrpm[offset];
843
844         BUG_ON(offset == MSR_INVALID);
845
846         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
847         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
848
849         msrpm[offset] = tmp;
850 }
851
852 static void svm_vcpu_init_msrpm(u32 *msrpm)
853 {
854         int i;
855
856         memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
857
858         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
859                 if (!direct_access_msrs[i].always)
860                         continue;
861
862                 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
863         }
864 }
865
866 static void add_msr_offset(u32 offset)
867 {
868         int i;
869
870         for (i = 0; i < MSRPM_OFFSETS; ++i) {
871
872                 /* Offset already in list? */
873                 if (msrpm_offsets[i] == offset)
874                         return;
875
876                 /* Slot used by another offset? */
877                 if (msrpm_offsets[i] != MSR_INVALID)
878                         continue;
879
880                 /* Add offset to list */
881                 msrpm_offsets[i] = offset;
882
883                 return;
884         }
885
886         /*
887  * If this BUG triggers, the msrpm_offsets table has overflowed. Just
888          * increase MSRPM_OFFSETS in this case.
889          */
890         BUG();
891 }
892
893 static void init_msrpm_offsets(void)
894 {
895         int i;
896
897         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
898
899         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
900                 u32 offset;
901
902                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
903                 BUG_ON(offset == MSR_INVALID);
904
905                 add_msr_offset(offset);
906         }
907 }
908
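/*
 * With LBR virtualization enabled the CPU saves and restores DEBUGCTL and
 * the last-branch/last-exception MSRs on VMRUN/#VMEXIT, so the guest can
 * also be given direct (non-intercepted) access to them.
 */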
909 static void svm_enable_lbrv(struct vcpu_svm *svm)
910 {
911         u32 *msrpm = svm->msrpm;
912
913         svm->vmcb->control.lbr_ctl = 1;
914         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
915         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
916         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
917         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
918 }
919
920 static void svm_disable_lbrv(struct vcpu_svm *svm)
921 {
922         u32 *msrpm = svm->msrpm;
923
924         svm->vmcb->control.lbr_ctl = 0;
925         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
926         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
927         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
928         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
929 }
930
931 static __init int svm_hardware_setup(void)
932 {
933         int cpu;
934         struct page *iopm_pages;
935         void *iopm_va;
936         int r;
937
938         iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
939
940         if (!iopm_pages)
941                 return -ENOMEM;
942
943         iopm_va = page_address(iopm_pages);
944         memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
945         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
946
947         init_msrpm_offsets();
948
949         if (boot_cpu_has(X86_FEATURE_NX))
950                 kvm_enable_efer_bits(EFER_NX);
951
952         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
953                 kvm_enable_efer_bits(EFER_FFXSR);
954
955         if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
956                 kvm_has_tsc_control = true;
957                 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
958                 kvm_tsc_scaling_ratio_frac_bits = 32;
959         }
960
961         if (nested) {
962                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
963                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
964         }
965
966         for_each_possible_cpu(cpu) {
967                 r = svm_cpu_init(cpu);
968                 if (r)
969                         goto err;
970         }
971
972         if (!boot_cpu_has(X86_FEATURE_NPT))
973                 npt_enabled = false;
974
975         if (npt_enabled && !npt) {
976                 printk(KERN_INFO "kvm: Nested Paging disabled\n");
977                 npt_enabled = false;
978         }
979
980         if (npt_enabled) {
981                 printk(KERN_INFO "kvm: Nested Paging enabled\n");
982                 kvm_enable_tdp();
983         } else
984                 kvm_disable_tdp();
985
986         if (avic) {
987                 if (!npt_enabled ||
988                     !boot_cpu_has(X86_FEATURE_AVIC) ||
989                     !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
990                         avic = false;
991                 else
992                         pr_info("AVIC enabled\n");
993         }
994
995         return 0;
996
997 err:
998         __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
999         iopm_base = 0;
1000         return r;
1001 }
1002
1003 static __exit void svm_hardware_unsetup(void)
1004 {
1005         int cpu;
1006
1007         for_each_possible_cpu(cpu)
1008                 svm_cpu_uninit(cpu);
1009
1010         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1011         iopm_base = 0;
1012 }
1013
1014 static void init_seg(struct vmcb_seg *seg)
1015 {
1016         seg->selector = 0;
1017         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1018                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1019         seg->limit = 0xffff;
1020         seg->base = 0;
1021 }
1022
1023 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1024 {
1025         seg->selector = 0;
1026         seg->attrib = SVM_SELECTOR_P_MASK | type;
1027         seg->limit = 0xffff;
1028         seg->base = 0;
1029 }
1030
1031 static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1032 {
1033         struct vcpu_svm *svm = to_svm(vcpu);
1034
1035         return svm->vmcb->control.tsc_offset;
1036 }
1037
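/*
 * While L2 runs, vmcb->control.tsc_offset holds the host offset plus the
 * offset L1 asked for; preserve that delta (g_tsc_offset) so that only
 * the host-controlled part changes.
 */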
1038 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1039 {
1040         struct vcpu_svm *svm = to_svm(vcpu);
1041         u64 g_tsc_offset = 0;
1042
1043         if (is_guest_mode(vcpu)) {
1044                 g_tsc_offset = svm->vmcb->control.tsc_offset -
1045                                svm->nested.hsave->control.tsc_offset;
1046                 svm->nested.hsave->control.tsc_offset = offset;
1047         } else
1048                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1049                                            svm->vmcb->control.tsc_offset,
1050                                            offset);
1051
1052         svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1053
1054         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1055 }
1056
1057 static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
1058 {
1059         struct vcpu_svm *svm = to_svm(vcpu);
1060
1061         svm->vmcb->control.tsc_offset += adjustment;
1062         if (is_guest_mode(vcpu))
1063                 svm->nested.hsave->control.tsc_offset += adjustment;
1064         else
1065                 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1066                                      svm->vmcb->control.tsc_offset - adjustment,
1067                                      svm->vmcb->control.tsc_offset);
1068
1069         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1070 }
1071
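/*
 * Point the VMCB at this vCPU's APIC backing page (the page backing the
 * emulated APIC registers) and at the per-VM physical/logical APIC ID
 * tables, then enable AVIC in int_ctl.
 */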
1072 static void avic_init_vmcb(struct vcpu_svm *svm)
1073 {
1074         struct vmcb *vmcb = svm->vmcb;
1075         struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
1076         phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
1077         phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
1078         phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
1079
1080         vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1081         vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
1082         vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
1083         vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
1084         vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1085         svm->vcpu.arch.apicv_active = true;
1086 }
1087
1088 static void init_vmcb(struct vcpu_svm *svm)
1089 {
1090         struct vmcb_control_area *control = &svm->vmcb->control;
1091         struct vmcb_save_area *save = &svm->vmcb->save;
1092
1093         svm->vcpu.fpu_active = 1;
1094         svm->vcpu.arch.hflags = 0;
1095
1096         set_cr_intercept(svm, INTERCEPT_CR0_READ);
1097         set_cr_intercept(svm, INTERCEPT_CR3_READ);
1098         set_cr_intercept(svm, INTERCEPT_CR4_READ);
1099         set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1100         set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1101         set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1102         if (!kvm_vcpu_apicv_active(&svm->vcpu))
1103                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1104
1105         set_dr_intercepts(svm);
1106
1107         set_exception_intercept(svm, PF_VECTOR);
1108         set_exception_intercept(svm, UD_VECTOR);
1109         set_exception_intercept(svm, MC_VECTOR);
1110         set_exception_intercept(svm, AC_VECTOR);
1111         set_exception_intercept(svm, DB_VECTOR);
1112
1113         set_intercept(svm, INTERCEPT_INTR);
1114         set_intercept(svm, INTERCEPT_NMI);
1115         set_intercept(svm, INTERCEPT_SMI);
1116         set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1117         set_intercept(svm, INTERCEPT_RDPMC);
1118         set_intercept(svm, INTERCEPT_CPUID);
1119         set_intercept(svm, INTERCEPT_INVD);
1120         set_intercept(svm, INTERCEPT_HLT);
1121         set_intercept(svm, INTERCEPT_INVLPG);
1122         set_intercept(svm, INTERCEPT_INVLPGA);
1123         set_intercept(svm, INTERCEPT_IOIO_PROT);
1124         set_intercept(svm, INTERCEPT_MSR_PROT);
1125         set_intercept(svm, INTERCEPT_TASK_SWITCH);
1126         set_intercept(svm, INTERCEPT_SHUTDOWN);
1127         set_intercept(svm, INTERCEPT_VMRUN);
1128         set_intercept(svm, INTERCEPT_VMMCALL);
1129         set_intercept(svm, INTERCEPT_VMLOAD);
1130         set_intercept(svm, INTERCEPT_VMSAVE);
1131         set_intercept(svm, INTERCEPT_STGI);
1132         set_intercept(svm, INTERCEPT_CLGI);
1133         set_intercept(svm, INTERCEPT_SKINIT);
1134         set_intercept(svm, INTERCEPT_WBINVD);
1135         set_intercept(svm, INTERCEPT_MONITOR);
1136         set_intercept(svm, INTERCEPT_MWAIT);
1137         set_intercept(svm, INTERCEPT_XSETBV);
1138
1139         control->iopm_base_pa = iopm_base;
1140         control->msrpm_base_pa = __pa(svm->msrpm);
1141         control->int_ctl = V_INTR_MASKING_MASK;
1142
1143         init_seg(&save->es);
1144         init_seg(&save->ss);
1145         init_seg(&save->ds);
1146         init_seg(&save->fs);
1147         init_seg(&save->gs);
1148
1149         save->cs.selector = 0xf000;
1150         save->cs.base = 0xffff0000;
1151         /* Executable/Readable Code Segment */
1152         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1153                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1154         save->cs.limit = 0xffff;
1155
1156         save->gdtr.limit = 0xffff;
1157         save->idtr.limit = 0xffff;
1158
1159         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1160         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1161
1162         svm_set_efer(&svm->vcpu, 0);
1163         save->dr6 = 0xffff0ff0;
1164         kvm_set_rflags(&svm->vcpu, 2);
1165         save->rip = 0x0000fff0;
1166         svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1167
1168         /*
1169          * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1170          * It also updates the guest-visible cr0 value.
1171          */
1172         svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1173         kvm_mmu_reset_context(&svm->vcpu);
1174
1175         save->cr4 = X86_CR4_PAE;
1176         /* rdx = ?? */
1177
1178         if (npt_enabled) {
1179                 /* Setup VMCB for Nested Paging */
1180                 control->nested_ctl = 1;
1181                 clr_intercept(svm, INTERCEPT_INVLPG);
1182                 clr_exception_intercept(svm, PF_VECTOR);
1183                 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1184                 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1185                 save->g_pat = svm->vcpu.arch.pat;
1186                 save->cr3 = 0;
1187                 save->cr4 = 0;
1188         }
1189         svm->asid_generation = 0;
1190
1191         svm->nested.vmcb = 0;
1192         svm->vcpu.arch.hflags = 0;
1193
1194         if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1195                 control->pause_filter_count = 3000;
1196                 set_intercept(svm, INTERCEPT_PAUSE);
1197         }
1198
1199         if (avic)
1200                 avic_init_vmcb(svm);
1201
1202         mark_all_dirty(svm->vmcb);
1203
1204         enable_gif(svm);
1205
1206 }
1207
1208 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
1209 {
1210         u64 *avic_physical_id_table;
1211         struct kvm_arch *vm_data = &vcpu->kvm->arch;
1212
1213         if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1214                 return NULL;
1215
1216         avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
1217
1218         return &avic_physical_id_table[index];
1219 }
1220
1221 /**
1222  * Note:
1223  * AVIC hardware walks the nested page table to check permissions,
1224  * but does not use the SPA address specified in the leaf page
1225  * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
1226  * field of the VMCB. Therefore, we set up the
1227  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
1228  */
1229 static int avic_init_access_page(struct kvm_vcpu *vcpu)
1230 {
1231         struct kvm *kvm = vcpu->kvm;
1232         int ret;
1233
1234         if (kvm->arch.apic_access_page_done)
1235                 return 0;
1236
1237         ret = x86_set_memory_region(kvm,
1238                                     APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
1239                                     APIC_DEFAULT_PHYS_BASE,
1240                                     PAGE_SIZE);
1241         if (ret)
1242                 return ret;
1243
1244         kvm->arch.apic_access_page_done = true;
1245         return 0;
1246 }
1247
1248 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1249 {
1250         int ret;
1251         u64 *entry, new_entry;
1252         int id = vcpu->vcpu_id;
1253         struct vcpu_svm *svm = to_svm(vcpu);
1254
1255         ret = avic_init_access_page(vcpu);
1256         if (ret)
1257                 return ret;
1258
1259         if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
1260                 return -EINVAL;
1261
1262         if (!svm->vcpu.arch.apic->regs)
1263                 return -EINVAL;
1264
1265         svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
1266
1267         /* Setting AVIC backing page address in the phy APIC ID table */
1268         entry = avic_get_physical_id_entry(vcpu, id);
1269         if (!entry)
1270                 return -EINVAL;
1271
1272         new_entry = READ_ONCE(*entry);
1273         new_entry = (page_to_phys(svm->avic_backing_page) &
1274                      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1275                      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
1276         WRITE_ONCE(*entry, new_entry);
1277
1278         svm->avic_physical_id_cache = entry;
1279
1280         return 0;
1281 }
1282
1283 static void avic_vm_destroy(struct kvm *kvm)
1284 {
1285         struct kvm_arch *vm_data = &kvm->arch;
1286
1287         if (vm_data->avic_logical_id_table_page)
1288                 __free_page(vm_data->avic_logical_id_table_page);
1289         if (vm_data->avic_physical_id_table_page)
1290                 __free_page(vm_data->avic_physical_id_table_page);
1291 }
1292
1293 static int avic_vm_init(struct kvm *kvm)
1294 {
1295         int err = -ENOMEM;
1296         struct kvm_arch *vm_data = &kvm->arch;
1297         struct page *p_page;
1298         struct page *l_page;
1299
1300         if (!avic)
1301                 return 0;
1302
1303         /* Allocating physical APIC ID table (4KB) */
1304         p_page = alloc_page(GFP_KERNEL);
1305         if (!p_page)
1306                 goto free_avic;
1307
1308         vm_data->avic_physical_id_table_page = p_page;
1309         clear_page(page_address(p_page));
1310
1311         /* Allocating logical APIC ID table (4KB) */
1312         l_page = alloc_page(GFP_KERNEL);
1313         if (!l_page)
1314                 goto free_avic;
1315
1316         vm_data->avic_logical_id_table_page = l_page;
1317         clear_page(page_address(l_page));
1318
1319         return 0;
1320
1321 free_avic:
1322         avic_vm_destroy(kvm);
1323         return err;
1324 }
1325
1326 /**
1327  * This function is called during VCPU halt/unhalt.
1328  */
1329 static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
1330 {
1331         u64 entry;
1332         int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
1333         struct vcpu_svm *svm = to_svm(vcpu);
1334
1335         if (!kvm_vcpu_apicv_active(vcpu))
1336                 return;
1337
1338         svm->avic_is_running = is_run;
1339
1340         /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1341         if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
1342                 return;
1343
1344         entry = READ_ONCE(*(svm->avic_physical_id_cache));
1345         WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
1346
1347         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1348         if (is_run)
1349                 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1350         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1351 }
1352
1353 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1354 {
1355         u64 entry;
1356         /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1357         int h_physical_id = kvm_cpu_get_apicid(cpu);
1358         struct vcpu_svm *svm = to_svm(vcpu);
1359
1360         if (!kvm_vcpu_apicv_active(vcpu))
1361                 return;
1362
1363         if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
1364                 return;
1365
1366         entry = READ_ONCE(*(svm->avic_physical_id_cache));
1367         WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1368
1369         entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
1370         entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1371
1372         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1373         if (svm->avic_is_running)
1374                 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1375
1376         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1377 }
1378
1379 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
1380 {
1381         u64 entry;
1382         struct vcpu_svm *svm = to_svm(vcpu);
1383
1384         if (!kvm_vcpu_apicv_active(vcpu))
1385                 return;
1386
1387         entry = READ_ONCE(*(svm->avic_physical_id_cache));
1388         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1389         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1390 }
1391
1392 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1393 {
1394         struct vcpu_svm *svm = to_svm(vcpu);
1395         u32 dummy;
1396         u32 eax = 1;
1397
1398         if (!init_event) {
1399                 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1400                                            MSR_IA32_APICBASE_ENABLE;
1401                 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1402                         svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1403         }
1404         init_vmcb(svm);
1405
1406         kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1407         kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1408
1409         if (kvm_vcpu_apicv_active(vcpu) && !init_event)
1410                 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1411 }
1412
1413 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1414 {
1415         struct vcpu_svm *svm;
1416         struct page *page;
1417         struct page *msrpm_pages;
1418         struct page *hsave_page;
1419         struct page *nested_msrpm_pages;
1420         int err;
1421
1422         svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1423         if (!svm) {
1424                 err = -ENOMEM;
1425                 goto out;
1426         }
1427
1428         err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1429         if (err)
1430                 goto free_svm;
1431
1432         err = -ENOMEM;
1433         page = alloc_page(GFP_KERNEL);
1434         if (!page)
1435                 goto uninit;
1436
1437         msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1438         if (!msrpm_pages)
1439                 goto free_page1;
1440
1441         nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1442         if (!nested_msrpm_pages)
1443                 goto free_page2;
1444
1445         hsave_page = alloc_page(GFP_KERNEL);
1446         if (!hsave_page)
1447                 goto free_page3;
1448
1449         if (avic) {
1450                 err = avic_init_backing_page(&svm->vcpu);
1451                 if (err)
1452                         goto free_page4;
1453         }
1454
1455         /* Initialize this flag to true so that the is_running bit is
1456          * set the first time the vcpu is loaded.
1457          */
1458         svm->avic_is_running = true;
1459
1460         svm->nested.hsave = page_address(hsave_page);
1461
1462         svm->msrpm = page_address(msrpm_pages);
1463         svm_vcpu_init_msrpm(svm->msrpm);
1464
1465         svm->nested.msrpm = page_address(nested_msrpm_pages);
1466         svm_vcpu_init_msrpm(svm->nested.msrpm);
1467
1468         svm->vmcb = page_address(page);
1469         clear_page(svm->vmcb);
1470         svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1471         svm->asid_generation = 0;
1472         init_vmcb(svm);
1473
1474         svm_init_osvw(&svm->vcpu);
1475
1476         return &svm->vcpu;
1477
1478 free_page4:
1479         __free_page(hsave_page);
1480 free_page3:
1481         __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1482 free_page2:
1483         __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1484 free_page1:
1485         __free_page(page);
1486 uninit:
1487         kvm_vcpu_uninit(&svm->vcpu);
1488 free_svm:
1489         kmem_cache_free(kvm_vcpu_cache, svm);
1490 out:
1491         return ERR_PTR(err);
1492 }
1493
1494 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1495 {
1496         struct vcpu_svm *svm = to_svm(vcpu);
1497
1498         __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1499         __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1500         __free_page(virt_to_page(svm->nested.hsave));
1501         __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1502         kvm_vcpu_uninit(vcpu);
1503         kmem_cache_free(kvm_vcpu_cache, svm);
1504 }
1505
1506 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1507 {
1508         struct vcpu_svm *svm = to_svm(vcpu);
1509         int i;
1510
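        /*
         * On migration to a different physical CPU the cached VMCB state
         * and the ASID are stale: force a new ASID and mark the whole
         * VMCB dirty so nothing is reused from the CPU's VMCB cache.
         */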
1511         if (unlikely(cpu != vcpu->cpu)) {
1512                 svm->asid_generation = 0;
1513                 mark_all_dirty(svm->vmcb);
1514         }
1515
1516 #ifdef CONFIG_X86_64
1517         rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1518 #endif
1519         savesegment(fs, svm->host.fs);
1520         savesegment(gs, svm->host.gs);
1521         svm->host.ldt = kvm_read_ldt();
1522
1523         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1524                 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1525
1526         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1527                 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1528                 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1529                         __this_cpu_write(current_tsc_ratio, tsc_ratio);
1530                         wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1531                 }
1532         }
1533         /* This assumes that the kernel never uses MSR_TSC_AUX */
1534         if (static_cpu_has(X86_FEATURE_RDTSCP))
1535                 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1536
1537         avic_vcpu_load(vcpu, cpu);
1538 }
1539
1540 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1541 {
1542         struct vcpu_svm *svm = to_svm(vcpu);
1543         int i;
1544
1545         avic_vcpu_put(vcpu);
1546
1547         ++vcpu->stat.host_state_reload;
1548         kvm_load_ldt(svm->host.ldt);
1549 #ifdef CONFIG_X86_64
1550         loadsegment(fs, svm->host.fs);
1551         wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
1552         load_gs_index(svm->host.gs);
1553 #else
1554 #ifdef CONFIG_X86_32_LAZY_GS
1555         loadsegment(gs, svm->host.gs);
1556 #endif
1557 #endif
1558         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1559                 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1560 }
1561
1562 static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
1563 {
1564         avic_set_running(vcpu, false);
1565 }
1566
1567 static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
1568 {
1569         avic_set_running(vcpu, true);
1570 }
1571
1572 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1573 {
1574         return to_svm(vcpu)->vmcb->save.rflags;
1575 }
1576
1577 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1578 {
1579        /*
1580         * Any change of EFLAGS.VM is accompanied by a reload of SS
1581         * (caused by either a task switch or an inter-privilege IRET),
1582         * so we do not need to update the CPL here.
1583         */
1584         to_svm(vcpu)->vmcb->save.rflags = rflags;
1585 }
1586
1587 static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
1588 {
1589         return 0;
1590 }
1591
1592 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1593 {
1594         switch (reg) {
1595         case VCPU_EXREG_PDPTR:
1596                 BUG_ON(!npt_enabled);
1597                 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1598                 break;
1599         default:
1600                 BUG();
1601         }
1602 }
1603
1604 static void svm_set_vintr(struct vcpu_svm *svm)
1605 {
1606         set_intercept(svm, INTERCEPT_VINTR);
1607 }
1608
1609 static void svm_clear_vintr(struct vcpu_svm *svm)
1610 {
1611         clr_intercept(svm, INTERCEPT_VINTR);
1612 }
1613
1614 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1615 {
1616         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1617
1618         switch (seg) {
1619         case VCPU_SREG_CS: return &save->cs;
1620         case VCPU_SREG_DS: return &save->ds;
1621         case VCPU_SREG_ES: return &save->es;
1622         case VCPU_SREG_FS: return &save->fs;
1623         case VCPU_SREG_GS: return &save->gs;
1624         case VCPU_SREG_SS: return &save->ss;
1625         case VCPU_SREG_TR: return &save->tr;
1626         case VCPU_SREG_LDTR: return &save->ldtr;
1627         }
1628         BUG();
1629         return NULL;
1630 }
1631
1632 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1633 {
1634         struct vmcb_seg *s = svm_seg(vcpu, seg);
1635
1636         return s->base;
1637 }
1638
1639 static void svm_get_segment(struct kvm_vcpu *vcpu,
1640                             struct kvm_segment *var, int seg)
1641 {
1642         struct vmcb_seg *s = svm_seg(vcpu, seg);
1643
1644         var->base = s->base;
1645         var->limit = s->limit;
1646         var->selector = s->selector;
1647         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1648         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1649         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1650         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1651         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1652         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1653         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1654
1655         /*
1656          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1657          * However, the SVM spec states that the G bit is not observed by the
1658          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1659          * So let's synthesize a legal G bit for all segments, this helps
1660          * running KVM nested. It also helps cross-vendor migration, because
1661          * Intel's vmentry has a check on the 'G' bit.
1662          */
1663         var->g = s->limit > 0xfffff;
1664
1665         /*
1666          * AMD's VMCB does not have an explicit unusable field, so emulate it
1667          * for cross-vendor migration purposes by treating "not present" as unusable
1668          */
1669         var->unusable = !var->present || (var->type == 0);
1670
1671         switch (seg) {
1672         case VCPU_SREG_TR:
1673                 /*
1674                  * Work around a bug where the busy flag in the tr selector
1675                  * isn't exposed
1676                  */
1677                 var->type |= 0x2;
1678                 break;
1679         case VCPU_SREG_DS:
1680         case VCPU_SREG_ES:
1681         case VCPU_SREG_FS:
1682         case VCPU_SREG_GS:
1683                 /*
1684                  * The accessed bit must always be set in the segment
1685                  * descriptor cache; even if it is cleared in the descriptor
1686                  * itself, the cached bit remains 1. Since Intel's vmentry
1687                  * checks this bit, set it here to support cross-vendor
1688                  * migration.
1689                  */
1690                 if (!var->unusable)
1691                         var->type |= 0x1;
1692                 break;
1693         case VCPU_SREG_SS:
1694                 /*
1695                  * On AMD CPUs sometimes the DB bit in the segment
1696                  * descriptor is left as 1, although the whole segment has
1697                  * been made unusable. Clear it here to pass an Intel VMX
1698                  * entry check when cross vendor migrating.
1699                  */
1700                 if (var->unusable)
1701                         var->db = 0;
1702                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1703                 break;
1704         }
1705 }
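
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * The VMCB packs the hidden descriptor attributes into a 12-bit field:
 * type in bits 0-3, S in bit 4, DPL in bits 5-6, P in bit 7, then AVL, L,
 * DB and G in bits 8-11 (the SVM_SELECTOR_* shifts used above).  As a
 * worked example, a flat 64-bit code segment is commonly encoded as
 * attrib = 0x29b: type = 0xb, S = 1, DPL = 0, P = 1, L = 1, DB = 0, with
 * G synthesized from the limit by svm_get_segment() above.  The helper
 * name below is hypothetical.
 */
#if 0   /* illustration only, not compiled */
static void example_decode_attrib(u16 attrib)
{
        pr_debug("type=%x s=%d dpl=%d p=%d avl=%d l=%d db=%d\n",
                 attrib & SVM_SELECTOR_TYPE_MASK,
                 (attrib >> SVM_SELECTOR_S_SHIFT) & 1,
                 (attrib >> SVM_SELECTOR_DPL_SHIFT) & 3,
                 (attrib >> SVM_SELECTOR_P_SHIFT) & 1,
                 (attrib >> SVM_SELECTOR_AVL_SHIFT) & 1,
                 (attrib >> SVM_SELECTOR_L_SHIFT) & 1,
                 (attrib >> SVM_SELECTOR_DB_SHIFT) & 1);
}
#endif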
1706
1707 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1708 {
1709         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1710
1711         return save->cpl;
1712 }
1713
1714 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1715 {
1716         struct vcpu_svm *svm = to_svm(vcpu);
1717
1718         dt->size = svm->vmcb->save.idtr.limit;
1719         dt->address = svm->vmcb->save.idtr.base;
1720 }
1721
1722 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1723 {
1724         struct vcpu_svm *svm = to_svm(vcpu);
1725
1726         svm->vmcb->save.idtr.limit = dt->size;
1727         svm->vmcb->save.idtr.base = dt->address;
1728         mark_dirty(svm->vmcb, VMCB_DT);
1729 }
1730
1731 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1732 {
1733         struct vcpu_svm *svm = to_svm(vcpu);
1734
1735         dt->size = svm->vmcb->save.gdtr.limit;
1736         dt->address = svm->vmcb->save.gdtr.base;
1737 }
1738
1739 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1740 {
1741         struct vcpu_svm *svm = to_svm(vcpu);
1742
1743         svm->vmcb->save.gdtr.limit = dt->size;
1744         svm->vmcb->save.gdtr.base = dt->address;
1745         mark_dirty(svm->vmcb, VMCB_DT);
1746 }
1747
1748 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1749 {
1750 }
1751
1752 static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1753 {
1754 }
1755
1756 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1757 {
1758 }
1759
1760 static void update_cr0_intercept(struct vcpu_svm *svm)
1761 {
1762         ulong gcr0 = svm->vcpu.arch.cr0;
1763         u64 *hcr0 = &svm->vmcb->save.cr0;
1764
1765         if (!svm->vcpu.fpu_active)
1766                 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
1767         else
1768                 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1769                         | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1770
1771         mark_dirty(svm->vmcb, VMCB_CR);
1772
1773         if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1774                 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1775                 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1776         } else {
1777                 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1778                 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1779         }
1780 }
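
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * update_cr0_intercept() above drops the CR0 read/write intercepts only
 * when the guest-visible CR0 and the CR0 actually loaded into the VMCB
 * agree and the guest owns the FPU; otherwise every CR0 access must trap
 * so the difference (e.g. a hidden TS bit) can be emulated.  A
 * hypothetical predicate restating that decision:
 */
#if 0   /* illustration only, not compiled */
static bool example_need_cr0_intercepts(unsigned long gcr0, u64 hcr0,
                                        bool fpu_active)
{
        return !(gcr0 == hcr0 && fpu_active);
}
#endif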
1781
1782 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1783 {
1784         struct vcpu_svm *svm = to_svm(vcpu);
1785
1786 #ifdef CONFIG_X86_64
1787         if (vcpu->arch.efer & EFER_LME) {
1788                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1789                         vcpu->arch.efer |= EFER_LMA;
1790                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1791                 }
1792
1793                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1794                         vcpu->arch.efer &= ~EFER_LMA;
1795                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1796                 }
1797         }
1798 #endif
1799         vcpu->arch.cr0 = cr0;
1800
1801         if (!npt_enabled)
1802                 cr0 |= X86_CR0_PG | X86_CR0_WP;
1803
1804         if (!vcpu->fpu_active)
1805                 cr0 |= X86_CR0_TS;
1806         /*
1807          * re-enable caching here because the QEMU BIOS
1808          * does not do it - leaving the cache disabled causes a
1809          * noticeable delay at reboot
1810          */
1811         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1812                 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1813         svm->vmcb->save.cr0 = cr0;
1814         mark_dirty(svm->vmcb, VMCB_CR);
1815         update_cr0_intercept(svm);
1816 }
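
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * The #ifdef CONFIG_X86_64 block above implements the long-mode rule that
 * EFER.LMA follows CR0.PG while EFER.LME is set: enabling paging activates
 * long mode, disabling paging deactivates it (the real code also mirrors
 * the change into the VMCB's EFER copy).  A hypothetical helper:
 */
#if 0   /* illustration only, not compiled */
static u64 example_update_lma(u64 efer, bool was_paging, bool now_paging)
{
        if (!(efer & EFER_LME))
                return efer;
        if (!was_paging && now_paging)
                return efer | EFER_LMA;         /* entering long mode */
        if (was_paging && !now_paging)
                return efer & ~EFER_LMA;        /* leaving long mode */
        return efer;
}
#endif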
1817
1818 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1819 {
1820         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1821         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1822
1823         if (cr4 & X86_CR4_VMXE)
1824                 return 1;
1825
1826         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1827                 svm_flush_tlb(vcpu);
1828
1829         vcpu->arch.cr4 = cr4;
1830         if (!npt_enabled)
1831                 cr4 |= X86_CR4_PAE;
1832         cr4 |= host_cr4_mce;
1833         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1834         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1835         return 0;
1836 }
1837
1838 static void svm_set_segment(struct kvm_vcpu *vcpu,
1839                             struct kvm_segment *var, int seg)
1840 {
1841         struct vcpu_svm *svm = to_svm(vcpu);
1842         struct vmcb_seg *s = svm_seg(vcpu, seg);
1843
1844         s->base = var->base;
1845         s->limit = var->limit;
1846         s->selector = var->selector;
1847         if (var->unusable)
1848                 s->attrib = 0;
1849         else {
1850                 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1851                 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1852                 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1853                 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1854                 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1855                 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1856                 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1857                 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1858         }
1859
1860         /*
1861          * This is always accurate, except if SYSRET returned to a segment
1862          * with SS.DPL != 3.  Intel does not have this quirk, and always
1863          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1864          * would entail passing the CPL to userspace and back.
1865          */
1866         if (seg == VCPU_SREG_SS)
1867                 svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1868
1869         mark_dirty(svm->vmcb, VMCB_SEG);
1870 }
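
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * As the comment above explains, save.cpl is refreshed from SS whenever SS
 * is reloaded, so outside of the SYSRET quirk the CPL always equals the
 * DPL stored in the packed SS attributes.  A hypothetical sanity check:
 */
#if 0   /* illustration only, not compiled */
static void example_check_cpl_tracks_ss(struct vcpu_svm *svm)
{
        u8 ss_dpl = (svm->vmcb->save.ss.attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;

        WARN_ON_ONCE(svm->vmcb->save.cpl != ss_dpl);
}
#endif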
1871
1872 static void update_bp_intercept(struct kvm_vcpu *vcpu)
1873 {
1874         struct vcpu_svm *svm = to_svm(vcpu);
1875
1876         clr_exception_intercept(svm, BP_VECTOR);
1877
1878         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1879                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1880                         set_exception_intercept(svm, BP_VECTOR);
1881         } else
1882                 vcpu->guest_debug = 0;
1883 }
1884
1885 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1886 {
1887         if (sd->next_asid > sd->max_asid) {
1888                 ++sd->asid_generation;
1889                 sd->next_asid = 1;
1890                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1891         }
1892
1893         svm->asid_generation = sd->asid_generation;
1894         svm->vmcb->control.asid = sd->next_asid++;
1895
1896         mark_dirty(svm->vmcb, VMCB_ASID);
1897 }
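
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * new_asid() above hands out ASIDs from a per-CPU counter; when the pool
 * is exhausted it bumps the generation, restarts at 1 (ASID 0 is reserved
 * for the host) and requests a full ASID flush so translations tagged with
 * recycled ASIDs cannot be reused.  A hypothetical model of the allocator:
 */
#if 0   /* illustration only, not compiled */
static u32 example_alloc_asid(struct svm_cpu_data *sd, bool *need_flush)
{
        *need_flush = false;
        if (sd->next_asid > sd->max_asid) {
                ++sd->asid_generation;
                sd->next_asid = 1;      /* ASID 0 is reserved for the host */
                *need_flush = true;
        }
        return sd->next_asid++;
}
#endif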
1898
1899 static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
1900 {
1901         return to_svm(vcpu)->vmcb->save.dr6;
1902 }
1903
1904 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1905 {
1906         struct vcpu_svm *svm = to_svm(vcpu);
1907
1908         svm->vmcb->save.dr6 = value;
1909         mark_dirty(svm->vmcb, VMCB_DR);
1910 }
1911
1912 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1913 {
1914         struct vcpu_svm *svm = to_svm(vcpu);
1915
1916         get_debugreg(vcpu->arch.db[0], 0);
1917         get_debugreg(vcpu->arch.db[1], 1);
1918         get_debugreg(vcpu->arch.db[2], 2);
1919         get_debugreg(vcpu->arch.db[3], 3);
1920         vcpu->arch.dr6 = svm_get_dr6(vcpu);
1921         vcpu->arch.dr7 = svm->vmcb->save.dr7;
1922
1923         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1924         set_dr_intercepts(svm);
1925 }
1926
1927 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1928 {
1929         struct vcpu_svm *svm = to_svm(vcpu);
1930
1931         svm->vmcb->save.dr7 = value;
1932         mark_dirty(svm->vmcb, VMCB_DR);
1933 }
1934
1935 static int pf_interception(struct vcpu_svm *svm)
1936 {
1937         u64 fault_address = svm->vmcb->control.exit_info_2;
1938         u32 error_code;
1939         int r = 1;
1940
1941         switch (svm->apf_reason) {
1942         default:
1943                 error_code = svm->vmcb->control.exit_info_1;
1944
1945                 trace_kvm_page_fault(fault_address, error_code);
1946                 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1947                         kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1948                 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1949                         svm->vmcb->control.insn_bytes,
1950                         svm->vmcb->control.insn_len);
1951                 break;
1952         case KVM_PV_REASON_PAGE_NOT_PRESENT:
1953                 svm->apf_reason = 0;
1954                 local_irq_disable();
1955                 kvm_async_pf_task_wait(fault_address);
1956                 local_irq_enable();
1957                 break;
1958         case KVM_PV_REASON_PAGE_READY:
1959                 svm->apf_reason = 0;
1960                 local_irq_disable();
1961                 kvm_async_pf_task_wake(fault_address);
1962                 local_irq_enable();
1963                 break;
1964         }
1965         return r;
1966 }
1967
1968 static int db_interception(struct vcpu_svm *svm)
1969 {
1970         struct kvm_run *kvm_run = svm->vcpu.run;
1971
1972         if (!(svm->vcpu.guest_debug &
1973               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1974                 !svm->nmi_singlestep) {
1975                 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1976                 return 1;
1977         }
1978
1979         if (svm->nmi_singlestep) {
1980                 svm->nmi_singlestep = false;
1981                 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1982                         svm->vmcb->save.rflags &=
1983                                 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1984         }
1985
1986         if (svm->vcpu.guest_debug &
1987             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1988                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1989                 kvm_run->debug.arch.pc =
1990                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1991                 kvm_run->debug.arch.exception = DB_VECTOR;
1992                 return 0;
1993         }
1994
1995         return 1;
1996 }
1997
1998 static int bp_interception(struct vcpu_svm *svm)
1999 {
2000         struct kvm_run *kvm_run = svm->vcpu.run;
2001
2002         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2003         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2004         kvm_run->debug.arch.exception = BP_VECTOR;
2005         return 0;
2006 }
2007
2008 static int ud_interception(struct vcpu_svm *svm)
2009 {
2010         int er;
2011
2012         er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
2013         if (er != EMULATE_DONE)
2014                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2015         return 1;
2016 }
2017
2018 static int ac_interception(struct vcpu_svm *svm)
2019 {
2020         kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
2021         return 1;
2022 }
2023
2024 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
2025 {
2026         struct vcpu_svm *svm = to_svm(vcpu);
2027
2028         clr_exception_intercept(svm, NM_VECTOR);
2029
2030         svm->vcpu.fpu_active = 1;
2031         update_cr0_intercept(svm);
2032 }
2033
2034 static int nm_interception(struct vcpu_svm *svm)
2035 {
2036         svm_fpu_activate(&svm->vcpu);
2037         return 1;
2038 }
2039
2040 static bool is_erratum_383(void)
2041 {
2042         int err, i;
2043         u64 value;
2044
2045         if (!erratum_383_found)
2046                 return false;
2047
2048         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2049         if (err)
2050                 return false;
2051
2052         /* Bit 62 may or may not be set for this mce */
2053         value &= ~(1ULL << 62);
2054
2055         if (value != 0xb600000000010015ULL)
2056                 return false;
2057
2058         /* Clear MCi_STATUS registers */
2059         for (i = 0; i < 6; ++i)
2060                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2061
2062         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2063         if (!err) {
2064                 u32 low, high;
2065
2066                 value &= ~(1ULL << 2);
2067                 low    = lower_32_bits(value);
2068                 high   = upper_32_bits(value);
2069
2070                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2071         }
2072
2073         /* Flush tlb to evict multi-match entries */
2074         __flush_tlb_all();
2075
2076         return true;
2077 }
2078
2079 static void svm_handle_mce(struct vcpu_svm *svm)
2080 {
2081         if (is_erratum_383()) {
2082                 /*
2083                  * Erratum 383 triggered. Guest state is corrupt so kill the
2084                  * guest.
2085                  */
2086                 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2087
2088                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
2089
2090                 return;
2091         }
2092
2093         /*
2094          * On an #MC intercept the MCE handler is not called automatically in
2095          * the host. So do it by hand here.
2096          */
2097         asm volatile (
2098                 "int $0x12\n");
2099         /* not sure if we ever come back to this point */
2100
2101         return;
2102 }
2103
2104 static int mc_interception(struct vcpu_svm *svm)
2105 {
2106         return 1;
2107 }
2108
2109 static int shutdown_interception(struct vcpu_svm *svm)
2110 {
2111         struct kvm_run *kvm_run = svm->vcpu.run;
2112
2113         /*
2114          * VMCB is undefined after a SHUTDOWN intercept
2115          * so reinitialize it.
2116          */
2117         clear_page(svm->vmcb);
2118         init_vmcb(svm);
2119
2120         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2121         return 0;
2122 }
2123
2124 static int io_interception(struct vcpu_svm *svm)
2125 {
2126         struct kvm_vcpu *vcpu = &svm->vcpu;
2127         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2128         int size, in, string;
2129         unsigned port;
2130
2131         ++svm->vcpu.stat.io_exits;
2132         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2133         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2134         if (string || in)
2135                 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
2136
2137         port = io_info >> 16;
2138         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2139         svm->next_rip = svm->vmcb->control.exit_info_2;
2140         skip_emulated_instruction(&svm->vcpu);
2141
2142         return kvm_fast_pio_out(vcpu, size, port);
2143 }
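
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * io_interception() above decodes exit_info_1 with the SVM_IOIO_* masks:
 * bit 0 is the direction (1 = IN), bit 2 marks string instructions,
 * bits 4-6 hold the one-hot operand size (1/2/4 bytes) and bits 16-31
 * hold the port number.  A hypothetical decode helper:
 */
#if 0   /* illustration only, not compiled */
static void example_decode_ioio(u32 io_info)
{
        bool in     = io_info & SVM_IOIO_TYPE_MASK;
        bool string = io_info & SVM_IOIO_STR_MASK;
        int  size   = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
        u16  port   = io_info >> 16;

        pr_debug("IOIO: %s port 0x%x, size %d, string %d\n",
                 in ? "in" : "out", port, size, string);
}
#endif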
2144
2145 static int nmi_interception(struct vcpu_svm *svm)
2146 {
2147         return 1;
2148 }
2149
2150 static int intr_interception(struct vcpu_svm *svm)
2151 {
2152         ++svm->vcpu.stat.irq_exits;
2153         return 1;
2154 }
2155
2156 static int nop_on_interception(struct vcpu_svm *svm)
2157 {
2158         return 1;
2159 }
2160
2161 static int halt_interception(struct vcpu_svm *svm)
2162 {
2163         svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
2164         return kvm_emulate_halt(&svm->vcpu);
2165 }
2166
2167 static int vmmcall_interception(struct vcpu_svm *svm)
2168 {
2169         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2170         return kvm_emulate_hypercall(&svm->vcpu);
2171 }
2172
2173 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
2174 {
2175         struct vcpu_svm *svm = to_svm(vcpu);
2176
2177         return svm->nested.nested_cr3;
2178 }
2179
2180 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2181 {
2182         struct vcpu_svm *svm = to_svm(vcpu);
2183         u64 cr3 = svm->nested.nested_cr3;
2184         u64 pdpte;
2185         int ret;
2186
2187         ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
2188                                        offset_in_page(cr3) + index * 8, 8);
2189         if (ret)
2190                 return 0;
2191         return pdpte;
2192 }
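
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * With nested NPT in PAE mode the four PDPTEs live in guest memory at the
 * (32-byte aligned) address in nested_cr3, one 8-byte entry per index, so
 * the read above targets offset_in_page(cr3) + index * 8 within that page.
 * Ignoring the PWT/PCD flag bits of CR3, the guest-physical address is:
 */
#if 0   /* illustration only, not compiled */
static u64 example_pdpte_gpa(u64 nested_cr3, int index)
{
        return nested_cr3 + index * 8;  /* each PDPTE is 8 bytes */
}
#endif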
2193
2194 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2195                                    unsigned long root)
2196 {
2197         struct vcpu_svm *svm = to_svm(vcpu);
2198
2199         svm->vmcb->control.nested_cr3 = root;
2200         mark_dirty(svm->vmcb, VMCB_NPT);
2201         svm_flush_tlb(vcpu);
2202 }
2203
2204 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2205                                        struct x86_exception *fault)
2206 {
2207         struct vcpu_svm *svm = to_svm(vcpu);
2208
2209         if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
2210                 /*
2211                  * TODO: track the cause of the nested page fault, and
2212                  * correctly fill in the high bits of exit_info_1.
2213                  */
2214                 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
2215                 svm->vmcb->control.exit_code_hi = 0;
2216                 svm->vmcb->control.exit_info_1 = (1ULL << 32);
2217                 svm->vmcb->control.exit_info_2 = fault->address;
2218         }
2219
2220         svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
2221         svm->vmcb->control.exit_info_1 |= fault->error_code;
2222
2223         /*
2224          * The present bit is always zero for page structure faults on real
2225          * hardware.
2226          */
2227         if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
2228                 svm->vmcb->control.exit_info_1 &= ~1;
2229
2230         nested_svm_vmexit(svm);
2231 }
2232
2233 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2234 {
2235         WARN_ON(mmu_is_nested(vcpu));
2236         kvm_init_shadow_mmu(vcpu);
2237         vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
2238         vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
2239         vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
2240         vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2241         vcpu->arch.mmu.shadow_root_level = get_npt_level();
2242         reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
2243         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2244 }
2245
2246 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2247 {
2248         vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2249 }
2250
2251 static int nested_svm_check_permissions(struct vcpu_svm *svm)
2252 {
2253         if (!(svm->vcpu.arch.efer & EFER_SVME)
2254             || !is_paging(&svm->vcpu)) {
2255                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2256                 return 1;
2257         }
2258
2259         if (svm->vmcb->save.cpl) {
2260                 kvm_inject_gp(&svm->vcpu, 0);
2261                 return 1;
2262         }
2263
2264         return 0;
2265 }
2266
2267 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2268                                       bool has_error_code, u32 error_code)
2269 {
2270         int vmexit;
2271
2272         if (!is_guest_mode(&svm->vcpu))
2273                 return 0;
2274
2275         svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2276         svm->vmcb->control.exit_code_hi = 0;
2277         svm->vmcb->control.exit_info_1 = error_code;
2278         svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2279
2280         vmexit = nested_svm_intercept(svm);
2281         if (vmexit == NESTED_EXIT_DONE)
2282                 svm->nested.exit_required = true;
2283
2284         return vmexit;
2285 }
2286
2287 /* This function returns true if it is safe to enable the irq window */
2288 static inline bool nested_svm_intr(struct vcpu_svm *svm)
2289 {
2290         if (!is_guest_mode(&svm->vcpu))
2291                 return true;
2292
2293         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2294                 return true;
2295
2296         if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2297                 return false;
2298
2299         /*
2300          * If a vmexit was already requested (e.g. by an intercepted
2301          * exception), do not overwrite it with an "external interrupt"
2302          * vmexit.
2303          */
2304         if (svm->nested.exit_required)
2305                 return false;
2306
2307         svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2308         svm->vmcb->control.exit_info_1 = 0;
2309         svm->vmcb->control.exit_info_2 = 0;
2310
2311         if (svm->nested.intercept & 1ULL) {
2312                 /*
2313                  * The #vmexit can't be emulated here directly because this
2314                  * code path runs with irqs and preemption disabled. A
2315                  * #vmexit emulation might sleep. Only signal request for
2316                  * the #vmexit here.
2317                  */
2318                 svm->nested.exit_required = true;
2319                 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2320                 return false;
2321         }
2322
2323         return true;
2324 }
2325
2326 /* This function returns true if it is safe to enable the nmi window */
2327 static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2328 {
2329         if (!is_guest_mode(&svm->vcpu))
2330                 return true;
2331
2332         if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2333                 return true;
2334
2335         svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2336         svm->nested.exit_required = true;
2337
2338         return false;
2339 }
2340
2341 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2342 {
2343         struct page *page;
2344
2345         might_sleep();
2346
2347         page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2348         if (is_error_page(page))
2349                 goto error;
2350
2351         *_page = page;
2352
2353         return kmap(page);
2354
2355 error:
2356         kvm_inject_gp(&svm->vcpu, 0);
2357
2358         return NULL;
2359 }
2360
2361 static void nested_svm_unmap(struct page *page)
2362 {
2363         kunmap(page);
2364         kvm_release_page_dirty(page);
2365 }
2366
2367 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2368 {
2369         unsigned port, size, iopm_len;
2370         u16 val, mask;
2371         u8 start_bit;
2372         u64 gpa;
2373
2374         if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2375                 return NESTED_EXIT_HOST;
2376
2377         port = svm->vmcb->control.exit_info_1 >> 16;
2378         size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2379                 SVM_IOIO_SIZE_SHIFT;
2380         gpa  = svm->nested.vmcb_iopm + (port / 8);
2381         start_bit = port % 8;
2382         iopm_len = (start_bit + size > 8) ? 2 : 1;
2383         mask = (0xf >> (4 - size)) << start_bit;
2384         val = 0;
2385
2386         if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2387                 return NESTED_EXIT_DONE;
2388
2389         return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2390 }
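
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * The nested IOPM has one bit per port; an access of 'size' bytes touches
 * ports [port, port + size), which may straddle a byte boundary, hence the
 * code above reads one or two bytes at iopm + port / 8 and tests a
 * contiguous 'size'-bit mask starting at port % 8.  A hypothetical
 * stand-alone version of that check:
 */
#if 0   /* illustration only, not compiled */
static bool example_iopm_intercepted(const u8 *iopm, u16 port, int size)
{
        unsigned int start_bit = port % 8;
        u16 bits = iopm[port / 8];

        if (start_bit + size > 8)
                bits |= (u16)iopm[port / 8 + 1] << 8;

        return bits & ((0xf >> (4 - size)) << start_bit);
}
#endif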
2391
2392 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2393 {
2394         u32 offset, msr, value;
2395         int write, mask;
2396
2397         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2398                 return NESTED_EXIT_HOST;
2399
2400         msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2401         offset = svm_msrpm_offset(msr);
2402         write  = svm->vmcb->control.exit_info_1 & 1;
2403         mask   = 1 << ((2 * (msr & 0xf)) + write);
2404
2405         if (offset == MSR_INVALID)
2406                 return NESTED_EXIT_DONE;
2407
2408         /* Offset is in 32 bit units but we need it in 8 bit units */
2409         offset *= 4;
2410
2411         if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2412                 return NESTED_EXIT_DONE;
2413
2414         return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2415 }
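
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * The MSR permission map uses two bits per MSR, the read intercept bit
 * followed by the write intercept bit, and each 32-bit word covers 16
 * consecutive MSRs; svm_msrpm_offset() picks the word and the expression
 * above picks the bit.  A hypothetical helper restating that computation:
 */
#if 0   /* illustration only, not compiled */
static u32 example_msrpm_bit(u32 msr, bool write)
{
        return 2 * (msr & 0xf) + write; /* bit index inside the 32-bit word */
}
#endif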
2416
2417 static int nested_svm_exit_special(struct vcpu_svm *svm)
2418 {
2419         u32 exit_code = svm->vmcb->control.exit_code;
2420
2421         switch (exit_code) {
2422         case SVM_EXIT_INTR:
2423         case SVM_EXIT_NMI:
2424         case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2425                 return NESTED_EXIT_HOST;
2426         case SVM_EXIT_NPF:
2427                 /* For now we always handle NPFs on the host when NPT is in use */
2428                 if (npt_enabled)
2429                         return NESTED_EXIT_HOST;
2430                 break;
2431         case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2432                 /* When we're shadowing, trap PFs, but not async PF */
2433                 if (!npt_enabled && svm->apf_reason == 0)
2434                         return NESTED_EXIT_HOST;
2435                 break;
2436         case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2437                 nm_interception(svm);
2438                 break;
2439         default:
2440                 break;
2441         }
2442
2443         return NESTED_EXIT_CONTINUE;
2444 }
2445
2446 /*
2447  * Returns NESTED_EXIT_DONE if the nested guest intercepts this #vmexit.
2448  */
2449 static int nested_svm_intercept(struct vcpu_svm *svm)
2450 {
2451         u32 exit_code = svm->vmcb->control.exit_code;
2452         int vmexit = NESTED_EXIT_HOST;
2453
2454         switch (exit_code) {
2455         case SVM_EXIT_MSR:
2456                 vmexit = nested_svm_exit_handled_msr(svm);
2457                 break;
2458         case SVM_EXIT_IOIO:
2459                 vmexit = nested_svm_intercept_ioio(svm);
2460                 break;
2461         case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2462                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2463                 if (svm->nested.intercept_cr & bit)
2464                         vmexit = NESTED_EXIT_DONE;
2465                 break;
2466         }
2467         case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2468                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2469                 if (svm->nested.intercept_dr & bit)
2470                         vmexit = NESTED_EXIT_DONE;
2471                 break;
2472         }
2473         case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2474                 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2475                 if (svm->nested.intercept_exceptions & excp_bits)
2476                         vmexit = NESTED_EXIT_DONE;
2477                 /* async page faults always cause a vmexit */
2478                 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2479                          svm->apf_reason != 0)
2480                         vmexit = NESTED_EXIT_DONE;
2481                 break;
2482         }
2483         case SVM_EXIT_ERR: {
2484                 vmexit = NESTED_EXIT_DONE;
2485                 break;
2486         }
2487         default: {
2488                 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2489                 if (svm->nested.intercept & exit_bits)
2490                         vmexit = NESTED_EXIT_DONE;
2491         }
2492         }
2493
2494         return vmexit;
2495 }
2496
2497 static int nested_svm_exit_handled(struct vcpu_svm *svm)
2498 {
2499         int vmexit;
2500
2501         vmexit = nested_svm_intercept(svm);
2502
2503         if (vmexit == NESTED_EXIT_DONE)
2504                 nested_svm_vmexit(svm);
2505
2506         return vmexit;
2507 }
2508
2509 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2510 {
2511         struct vmcb_control_area *dst  = &dst_vmcb->control;
2512         struct vmcb_control_area *from = &from_vmcb->control;
2513
2514         dst->intercept_cr         = from->intercept_cr;
2515         dst->intercept_dr         = from->intercept_dr;
2516         dst->intercept_exceptions = from->intercept_exceptions;
2517         dst->intercept            = from->intercept;
2518         dst->iopm_base_pa         = from->iopm_base_pa;
2519         dst->msrpm_base_pa        = from->msrpm_base_pa;
2520         dst->tsc_offset           = from->tsc_offset;
2521         dst->asid                 = from->asid;
2522         dst->tlb_ctl              = from->tlb_ctl;
2523         dst->int_ctl              = from->int_ctl;
2524         dst->int_vector           = from->int_vector;
2525         dst->int_state            = from->int_state;
2526         dst->exit_code            = from->exit_code;
2527         dst->exit_code_hi         = from->exit_code_hi;
2528         dst->exit_info_1          = from->exit_info_1;
2529         dst->exit_info_2          = from->exit_info_2;
2530         dst->exit_int_info        = from->exit_int_info;
2531         dst->exit_int_info_err    = from->exit_int_info_err;
2532         dst->nested_ctl           = from->nested_ctl;
2533         dst->event_inj            = from->event_inj;
2534         dst->event_inj_err        = from->event_inj_err;
2535         dst->nested_cr3           = from->nested_cr3;
2536         dst->lbr_ctl              = from->lbr_ctl;
2537 }
2538
2539 static int nested_svm_vmexit(struct vcpu_svm *svm)
2540 {
2541         struct vmcb *nested_vmcb;
2542         struct vmcb *hsave = svm->nested.hsave;
2543         struct vmcb *vmcb = svm->vmcb;
2544         struct page *page;
2545
2546         trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2547                                        vmcb->control.exit_info_1,
2548                                        vmcb->control.exit_info_2,
2549                                        vmcb->control.exit_int_info,
2550                                        vmcb->control.exit_int_info_err,
2551                                        KVM_ISA_SVM);
2552
2553         nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2554         if (!nested_vmcb)
2555                 return 1;
2556
2557         /* Exit Guest-Mode */
2558         leave_guest_mode(&svm->vcpu);
2559         svm->nested.vmcb = 0;
2560
2561         /* Give the current vmcb to the guest */
2562         disable_gif(svm);
2563
2564         nested_vmcb->save.es     = vmcb->save.es;
2565         nested_vmcb->save.cs     = vmcb->save.cs;
2566         nested_vmcb->save.ss     = vmcb->save.ss;
2567         nested_vmcb->save.ds     = vmcb->save.ds;
2568         nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2569         nested_vmcb->save.idtr   = vmcb->save.idtr;
2570         nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2571         nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2572         nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2573         nested_vmcb->save.cr2    = vmcb->save.cr2;
2574         nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2575         nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2576         nested_vmcb->save.rip    = vmcb->save.rip;
2577         nested_vmcb->save.rsp    = vmcb->save.rsp;
2578         nested_vmcb->save.rax    = vmcb->save.rax;
2579         nested_vmcb->save.dr7    = vmcb->save.dr7;
2580         nested_vmcb->save.dr6    = vmcb->save.dr6;
2581         nested_vmcb->save.cpl    = vmcb->save.cpl;
2582
2583         nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2584         nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2585         nested_vmcb->control.int_state         = vmcb->control.int_state;
2586         nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2587         nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2588         nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2589         nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2590         nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2591         nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2592
2593         if (svm->nrips_enabled)
2594                 nested_vmcb->control.next_rip  = vmcb->control.next_rip;
2595
2596         /*
2597          * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2598          * to make sure that we do not lose injected events. So check event_inj
2599          * here and copy it to exit_int_info if it is valid.
2600          * Exit_int_info and event_inj can't be both valid because the case
2601          * below only happens on a VMRUN instruction intercept which has
2602          * no valid exit_int_info set.
2603          */
2604         if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2605                 struct vmcb_control_area *nc = &nested_vmcb->control;
2606
2607                 nc->exit_int_info     = vmcb->control.event_inj;
2608                 nc->exit_int_info_err = vmcb->control.event_inj_err;
2609         }
2610
2611         nested_vmcb->control.tlb_ctl           = 0;
2612         nested_vmcb->control.event_inj         = 0;
2613         nested_vmcb->control.event_inj_err     = 0;
2614
2615         /* We always set V_INTR_MASKING and remember the old value in hflags */
2616         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2617                 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2618
2619         /* Restore the original control entries */
2620         copy_vmcb_control_area(vmcb, hsave);
2621
2622         kvm_clear_exception_queue(&svm->vcpu);
2623         kvm_clear_interrupt_queue(&svm->vcpu);
2624
2625         svm->nested.nested_cr3 = 0;
2626
2627         /* Restore selected save entries */
2628         svm->vmcb->save.es = hsave->save.es;
2629         svm->vmcb->save.cs = hsave->save.cs;
2630         svm->vmcb->save.ss = hsave->save.ss;
2631         svm->vmcb->save.ds = hsave->save.ds;
2632         svm->vmcb->save.gdtr = hsave->save.gdtr;
2633         svm->vmcb->save.idtr = hsave->save.idtr;
2634         kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2635         svm_set_efer(&svm->vcpu, hsave->save.efer);
2636         svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2637         svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2638         if (npt_enabled) {
2639                 svm->vmcb->save.cr3 = hsave->save.cr3;
2640                 svm->vcpu.arch.cr3 = hsave->save.cr3;
2641         } else {
2642                 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2643         }
2644         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2645         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2646         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2647         svm->vmcb->save.dr7 = 0;
2648         svm->vmcb->save.cpl = 0;
2649         svm->vmcb->control.exit_int_info = 0;
2650
2651         mark_all_dirty(svm->vmcb);
2652
2653         nested_svm_unmap(page);
2654
2655         nested_svm_uninit_mmu_context(&svm->vcpu);
2656         kvm_mmu_reset_context(&svm->vcpu);
2657         kvm_mmu_load(&svm->vcpu);
2658
2659         return 0;
2660 }
2661
2662 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2663 {
2664         /*
2665          * This function merges the msr permission bitmaps of kvm and the
2666          * nested vmcb. It is optimized in that it only merges the parts where
2667          * the kvm msr permission bitmap may contain zero bits
2668          */
2669         int i;
2670
2671         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2672                 return true;
2673
2674         for (i = 0; i < MSRPM_OFFSETS; i++) {
2675                 u32 value, p;
2676                 u64 offset;
2677
2678                 if (msrpm_offsets[i] == 0xffffffff)
2679                         break;
2680
2681                 p      = msrpm_offsets[i];
2682                 offset = svm->nested.vmcb_msrpm + (p * 4);
2683
2684                 if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
2685                         return false;
2686
2687                 svm->nested.msrpm[p] = svm->msrpm[p] | value;
2688         }
2689
2690         svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2691
2692         return true;
2693 }
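
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * In the permission bitmap a set bit means "intercept", so merging the
 * host and nested bitmaps is a simple OR: an MSR access is trapped if
 * either KVM or the L1 hypervisor wants it trapped.  Only the offsets in
 * msrpm_offsets[] are visited because everywhere else KVM's own bitmap is
 * already all ones.  A hypothetical per-word merge:
 */
#if 0   /* illustration only, not compiled */
static u32 example_merge_msrpm_word(u32 host_word, u32 nested_word)
{
        return host_word | nested_word; /* 1 = intercept wins */
}
#endif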
2694
2695 static bool nested_vmcb_checks(struct vmcb *vmcb)
2696 {
2697         if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2698                 return false;
2699
2700         if (vmcb->control.asid == 0)
2701                 return false;
2702
2703         if (vmcb->control.nested_ctl && !npt_enabled)
2704                 return false;
2705
2706         return true;
2707 }
2708
2709 static bool nested_svm_vmrun(struct vcpu_svm *svm)
2710 {
2711         struct vmcb *nested_vmcb;
2712         struct vmcb *hsave = svm->nested.hsave;
2713         struct vmcb *vmcb = svm->vmcb;
2714         struct page *page;
2715         u64 vmcb_gpa;
2716
2717         vmcb_gpa = svm->vmcb->save.rax;
2718
2719         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2720         if (!nested_vmcb)
2721                 return false;
2722
2723         if (!nested_vmcb_checks(nested_vmcb)) {
2724                 nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2725                 nested_vmcb->control.exit_code_hi = 0;
2726                 nested_vmcb->control.exit_info_1  = 0;
2727                 nested_vmcb->control.exit_info_2  = 0;
2728
2729                 nested_svm_unmap(page);
2730
2731                 return false;
2732         }
2733
2734         trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2735                                nested_vmcb->save.rip,
2736                                nested_vmcb->control.int_ctl,
2737                                nested_vmcb->control.event_inj,
2738                                nested_vmcb->control.nested_ctl);
2739
2740         trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2741                                     nested_vmcb->control.intercept_cr >> 16,
2742                                     nested_vmcb->control.intercept_exceptions,
2743                                     nested_vmcb->control.intercept);
2744
2745         /* Clear internal status */
2746         kvm_clear_exception_queue(&svm->vcpu);
2747         kvm_clear_interrupt_queue(&svm->vcpu);
2748
2749         /*
2750          * Save the old vmcb, so we don't need to pick which fields to save;
2751          * everything can be restored when a #VMEXIT occurs
2752          */
2753         hsave->save.es     = vmcb->save.es;
2754         hsave->save.cs     = vmcb->save.cs;
2755         hsave->save.ss     = vmcb->save.ss;
2756         hsave->save.ds     = vmcb->save.ds;
2757         hsave->save.gdtr   = vmcb->save.gdtr;
2758         hsave->save.idtr   = vmcb->save.idtr;
2759         hsave->save.efer   = svm->vcpu.arch.efer;
2760         hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2761         hsave->save.cr4    = svm->vcpu.arch.cr4;
2762         hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2763         hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2764         hsave->save.rsp    = vmcb->save.rsp;
2765         hsave->save.rax    = vmcb->save.rax;
2766         if (npt_enabled)
2767                 hsave->save.cr3    = vmcb->save.cr3;
2768         else
2769                 hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2770
2771         copy_vmcb_control_area(hsave, vmcb);
2772
2773         if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2774                 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2775         else
2776                 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2777
2778         if (nested_vmcb->control.nested_ctl) {
2779                 kvm_mmu_unload(&svm->vcpu);
2780                 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2781                 nested_svm_init_mmu_context(&svm->vcpu);
2782         }
2783
2784         /* Load the nested guest state */
2785         svm->vmcb->save.es = nested_vmcb->save.es;
2786         svm->vmcb->save.cs = nested_vmcb->save.cs;
2787         svm->vmcb->save.ss = nested_vmcb->save.ss;
2788         svm->vmcb->save.ds = nested_vmcb->save.ds;
2789         svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2790         svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2791         kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2792         svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2793         svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2794         svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2795         if (npt_enabled) {
2796                 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2797                 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2798         } else
2799                 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2800
2801         /* Guest paging mode is active - reset mmu */
2802         kvm_mmu_reset_context(&svm->vcpu);
2803
2804         svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2805         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2806         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2807         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2808
2809         /* In case we don't even reach vcpu_run, the fields are not updated */
2810         svm->vmcb->save.rax = nested_vmcb->save.rax;
2811         svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2812         svm->vmcb->save.rip = nested_vmcb->save.rip;
2813         svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2814         svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2815         svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2816
2817         svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2818         svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2819
2820         /* cache intercepts */
2821         svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2822         svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2823         svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2824         svm->nested.intercept            = nested_vmcb->control.intercept;
2825
2826         svm_flush_tlb(&svm->vcpu);
2827         svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2828         if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2829                 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2830         else
2831                 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2832
2833         if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2834                 /* We only want the cr8 intercept bits of the guest */
2835                 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2836                 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2837         }
2838
2839         /* We don't want to see VMMCALLs from a nested guest */
2840         clr_intercept(svm, INTERCEPT_VMMCALL);
2841
2842         svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2843         svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2844         svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2845         svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2846         svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2847         svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2848
2849         nested_svm_unmap(page);
2850
2851         /* Enter Guest-Mode */
2852         enter_guest_mode(&svm->vcpu);
2853
2854         /*
2855          * Merge guest and host intercepts - must be called with vcpu in
2856          * guest-mode to take effect here
2857          */
2858         recalc_intercepts(svm);
2859
2860         svm->nested.vmcb = vmcb_gpa;
2861
2862         enable_gif(svm);
2863
2864         mark_all_dirty(svm->vmcb);
2865
2866         return true;
2867 }
2868
2869 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2870 {
2871         to_vmcb->save.fs = from_vmcb->save.fs;
2872         to_vmcb->save.gs = from_vmcb->save.gs;
2873         to_vmcb->save.tr = from_vmcb->save.tr;
2874         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2875         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2876         to_vmcb->save.star = from_vmcb->save.star;
2877         to_vmcb->save.lstar = from_vmcb->save.lstar;
2878         to_vmcb->save.cstar = from_vmcb->save.cstar;
2879         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2880         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2881         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2882         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2883 }
2884
2885 static int vmload_interception(struct vcpu_svm *svm)
2886 {
2887         struct vmcb *nested_vmcb;
2888         struct page *page;
2889
2890         if (nested_svm_check_permissions(svm))
2891                 return 1;
2892
2893         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2894         if (!nested_vmcb)
2895                 return 1;
2896
2897         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2898         skip_emulated_instruction(&svm->vcpu);
2899
2900         nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2901         nested_svm_unmap(page);
2902
2903         return 1;
2904 }
2905
2906 static int vmsave_interception(struct vcpu_svm *svm)
2907 {
2908         struct vmcb *nested_vmcb;
2909         struct page *page;
2910
2911         if (nested_svm_check_permissions(svm))
2912                 return 1;
2913
2914         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2915         if (!nested_vmcb)
2916                 return 1;
2917
2918         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2919         skip_emulated_instruction(&svm->vcpu);
2920
2921         nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2922         nested_svm_unmap(page);
2923
2924         return 1;
2925 }
2926
2927 static int vmrun_interception(struct vcpu_svm *svm)
2928 {
2929         if (nested_svm_check_permissions(svm))
2930                 return 1;
2931
2932         /* Save rip after vmrun instruction */
2933         kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2934
2935         if (!nested_svm_vmrun(svm))
2936                 return 1;
2937
2938         if (!nested_svm_vmrun_msrpm(svm))
2939                 goto failed;
2940
2941         return 1;
2942
2943 failed:
2944
2945         svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2946         svm->vmcb->control.exit_code_hi = 0;
2947         svm->vmcb->control.exit_info_1  = 0;
2948         svm->vmcb->control.exit_info_2  = 0;
2949
2950         nested_svm_vmexit(svm);
2951
2952         return 1;
2953 }
2954
2955 static int stgi_interception(struct vcpu_svm *svm)
2956 {
2957         if (nested_svm_check_permissions(svm))
2958                 return 1;
2959
2960         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2961         skip_emulated_instruction(&svm->vcpu);
2962         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2963
2964         enable_gif(svm);
2965
2966         return 1;
2967 }
2968
2969 static int clgi_interception(struct vcpu_svm *svm)
2970 {
2971         if (nested_svm_check_permissions(svm))
2972                 return 1;
2973
2974         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2975         skip_emulated_instruction(&svm->vcpu);
2976
2977         disable_gif(svm);
2978
2979         /* After a CLGI no interrupts should come */
2980         if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
2981                 svm_clear_vintr(svm);
2982                 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2983                 mark_dirty(svm->vmcb, VMCB_INTR);
2984         }
2985
2986         return 1;
2987 }
2988
2989 static int invlpga_interception(struct vcpu_svm *svm)
2990 {
2991         struct kvm_vcpu *vcpu = &svm->vcpu;
2992
2993         trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
2994                           kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2995
2996         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2997         kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2998
2999         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3000         skip_emulated_instruction(&svm->vcpu);
3001         return 1;
3002 }
3003
3004 static int skinit_interception(struct vcpu_svm *svm)
3005 {
3006         trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3007
3008         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3009         return 1;
3010 }
3011
3012 static int wbinvd_interception(struct vcpu_svm *svm)
3013 {
3014         kvm_emulate_wbinvd(&svm->vcpu);
3015         return 1;
3016 }
3017
3018 static int xsetbv_interception(struct vcpu_svm *svm)
3019 {
3020         u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
3021         u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3022
3023         if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
3024                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3025                 skip_emulated_instruction(&svm->vcpu);
3026         }
3027
3028         return 1;
3029 }
3030
3031 static int task_switch_interception(struct vcpu_svm *svm)
3032 {
3033         u16 tss_selector;
3034         int reason;
3035         int int_type = svm->vmcb->control.exit_int_info &
3036                 SVM_EXITINTINFO_TYPE_MASK;
3037         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
3038         uint32_t type =
3039                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
3040         uint32_t idt_v =
3041                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
3042         bool has_error_code = false;
3043         u32 error_code = 0;
3044
3045         tss_selector = (u16)svm->vmcb->control.exit_info_1;
3046
3047         if (svm->vmcb->control.exit_info_2 &
3048             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
3049                 reason = TASK_SWITCH_IRET;
3050         else if (svm->vmcb->control.exit_info_2 &
3051                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
3052                 reason = TASK_SWITCH_JMP;
3053         else if (idt_v)
3054                 reason = TASK_SWITCH_GATE;
3055         else
3056                 reason = TASK_SWITCH_CALL;
3057
3058         if (reason == TASK_SWITCH_GATE) {
3059                 switch (type) {
3060                 case SVM_EXITINTINFO_TYPE_NMI:
3061                         svm->vcpu.arch.nmi_injected = false;
3062                         break;
3063                 case SVM_EXITINTINFO_TYPE_EXEPT:
3064                         if (svm->vmcb->control.exit_info_2 &
3065                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
3066                                 has_error_code = true;
3067                                 error_code =
3068                                         (u32)svm->vmcb->control.exit_info_2;
3069                         }
3070                         kvm_clear_exception_queue(&svm->vcpu);
3071                         break;
3072                 case SVM_EXITINTINFO_TYPE_INTR:
3073                         kvm_clear_interrupt_queue(&svm->vcpu);
3074                         break;
3075                 default:
3076                         break;
3077                 }
3078         }
3079
3080         if (reason != TASK_SWITCH_GATE ||
3081             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
3082             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
3083              (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
3084                 skip_emulated_instruction(&svm->vcpu);
3085
3086         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
3087                 int_vec = -1;
3088
3089         if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
3090                                 has_error_code, error_code) == EMULATE_FAIL) {
3091                 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3092                 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3093                 svm->vcpu.run->internal.ndata = 0;
3094                 return 0;
3095         }
3096         return 1;
3097 }
3098
3099 static int cpuid_interception(struct vcpu_svm *svm)
3100 {
3101         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3102         kvm_emulate_cpuid(&svm->vcpu);
3103         return 1;
3104 }
3105
3106 static int iret_interception(struct vcpu_svm *svm)
3107 {
3108         ++svm->vcpu.stat.nmi_window_exits;
3109         clr_intercept(svm, INTERCEPT_IRET);
3110         svm->vcpu.arch.hflags |= HF_IRET_MASK;
3111         svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
3112         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3113         return 1;
3114 }
3115
3116 static int invlpg_interception(struct vcpu_svm *svm)
3117 {
3118         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3119                 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3120
3121         kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
3122         skip_emulated_instruction(&svm->vcpu);
3123         return 1;
3124 }
3125
3126 static int emulate_on_interception(struct vcpu_svm *svm)
3127 {
3128         return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3129 }
3130
3131 static int rdpmc_interception(struct vcpu_svm *svm)
3132 {
3133         int err;
3134
3135         if (!static_cpu_has(X86_FEATURE_NRIPS))
3136                 return emulate_on_interception(svm);
3137
3138         err = kvm_rdpmc(&svm->vcpu);
3139         kvm_complete_insn_gp(&svm->vcpu, err);
3140
3141         return 1;
3142 }
3143
3144 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
3145                                             unsigned long val)
3146 {
3147         unsigned long cr0 = svm->vcpu.arch.cr0;
3148         bool ret = false;
3149         u64 intercept;
3150
3151         intercept = svm->nested.intercept;
3152
3153         if (!is_guest_mode(&svm->vcpu) ||
3154             (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
3155                 return false;
3156
3157         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
3158         val &= ~SVM_CR0_SELECTIVE_MASK;
3159
3160         if (cr0 ^ val) {
3161                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3162                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
3163         }
3164
3165         return ret;
3166 }
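
/*
 * Editor's note: illustrative sketch, not part of the original source.
 * The selective CR0 write intercept only fires for writes that change bits
 * outside SVM_CR0_SELECTIVE_MASK (CR0.TS and CR0.MP), which is why the
 * code above masks those bits off before comparing.  A hypothetical
 * predicate expressing the same condition:
 */
#if 0   /* illustration only, not compiled */
static bool example_cr0_write_is_selective(unsigned long old_cr0,
                                           unsigned long new_cr0)
{
        return (old_cr0 ^ new_cr0) & ~SVM_CR0_SELECTIVE_MASK;
}
#endif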
3167
3168 #define CR_VALID (1ULL << 63)
3169
3170 static int cr_interception(struct vcpu_svm *svm)
3171 {
3172         int reg, cr;
3173         unsigned long val;
3174         int err;
3175
3176         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3177                 return emulate_on_interception(svm);
3178
3179         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
3180                 return emulate_on_interception(svm);
3181
3182         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3183         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
3184                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
3185         else
3186                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
3187
3188         err = 0;
3189         if (cr >= 16) { /* mov to cr */
3190                 cr -= 16;
3191                 val = kvm_register_read(&svm->vcpu, reg);
3192                 switch (cr) {
3193                 case 0:
3194                         if (!check_selective_cr0_intercepted(svm, val))
3195                                 err = kvm_set_cr0(&svm->vcpu, val);
3196                         else
3197                                 return 1;
3198
3199                         break;
3200                 case 3:
3201                         err = kvm_set_cr3(&svm->vcpu, val);
3202                         break;
3203                 case 4:
3204                         err = kvm_set_cr4(&svm->vcpu, val);
3205                         break;
3206                 case 8:
3207                         err = kvm_set_cr8(&svm->vcpu, val);
3208                         break;
3209                 default:
3210                         WARN(1, "unhandled write to CR%d", cr);
3211                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3212                         return 1;
3213                 }
3214         } else { /* mov from cr */
3215                 switch (cr) {
3216                 case 0:
3217                         val = kvm_read_cr0(&svm->vcpu);
3218                         break;
3219                 case 2:
3220                         val = svm->vcpu.arch.cr2;
3221                         break;
3222                 case 3:
3223                         val = kvm_read_cr3(&svm->vcpu);
3224                         break;
3225                 case 4:
3226                         val = kvm_read_cr4(&svm->vcpu);
3227                         break;
3228                 case 8:
3229                         val = kvm_get_cr8(&svm->vcpu);
3230                         break;
3231                 default:
3232                         WARN(1, "unhandled read from CR%d", cr);
3233                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3234                         return 1;
3235                 }
3236                 kvm_register_write(&svm->vcpu, reg, val);
3237         }
3238         kvm_complete_insn_gp(&svm->vcpu, err);
3239
3240         return 1;
3241 }
3242
3243 static int dr_interception(struct vcpu_svm *svm)
3244 {
3245         int reg, dr;
3246         unsigned long val;
3247
3248         if (svm->vcpu.guest_debug == 0) {
3249                 /*
3250                  * No more DR vmexits; force a reload of the debug registers
3251                  * and reenter on this instruction.  The next vmexit will
3252                  * retrieve the full state of the debug registers.
3253                  */
3254                 clr_dr_intercepts(svm);
3255                 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3256                 return 1;
3257         }
3258
3259         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3260                 return emulate_on_interception(svm);
3261
3262         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3263         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3264
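        /*
         * As with the CR exit codes, SVM_EXIT_WRITE_DR0 starts 16 exit
         * codes after SVM_EXIT_READ_DR0, so dr >= 16 means a move into
         * a debug register.
         */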
3265         if (dr >= 16) { /* mov to DRn */
3266                 if (!kvm_require_dr(&svm->vcpu, dr - 16))
3267                         return 1;
3268                 val = kvm_register_read(&svm->vcpu, reg);
3269                 kvm_set_dr(&svm->vcpu, dr - 16, val);
3270         } else {
3271                 if (!kvm_require_dr(&svm->vcpu, dr))
3272                         return 1;
3273                 kvm_get_dr(&svm->vcpu, dr, &val);
3274                 kvm_register_write(&svm->vcpu, reg, val);
3275         }
3276
3277         skip_emulated_instruction(&svm->vcpu);
3278
3279         return 1;
3280 }
3281
3282 static int cr8_write_interception(struct vcpu_svm *svm)
3283 {
3284         struct kvm_run *kvm_run = svm->vcpu.run;
3285         int r;
3286
3287         u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3288         /* instruction emulation calls kvm_set_cr8() */
3289         r = cr_interception(svm);
3290         if (lapic_in_kernel(&svm->vcpu))
3291                 return r;
3292         if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3293                 return r;
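        /*
         * The TPR was lowered and the local APIC lives in userspace:
         * exit with KVM_EXIT_SET_TPR so userspace can reevaluate
         * pending interrupts.
         */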
3294         kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3295         return 0;
3296 }
3297
3298 static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3299 {
3300         struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3301         return vmcb->control.tsc_offset + host_tsc;
3302 }
3303
3304 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3305 {
3306         struct vcpu_svm *svm = to_svm(vcpu);
3307
3308         switch (msr_info->index) {
3309         case MSR_IA32_TSC: {
3310                 msr_info->data = svm->vmcb->control.tsc_offset +
3311                         kvm_scale_tsc(vcpu, rdtsc());
3312
3313                 break;
3314         }
3315         case MSR_STAR:
3316                 msr_info->data = svm->vmcb->save.star;
3317                 break;
3318 #ifdef CONFIG_X86_64
3319         case MSR_LSTAR:
3320                 msr_info->data = svm->vmcb->save.lstar;
3321                 break;
3322         case MSR_CSTAR:
3323                 msr_info->data = svm->vmcb->save.cstar;
3324                 break;
3325         case MSR_KERNEL_GS_BASE:
3326                 msr_info->data = svm->vmcb->save.kernel_gs_base;
3327                 break;
3328         case MSR_SYSCALL_MASK:
3329                 msr_info->data = svm->vmcb->save.sfmask;
3330                 break;
3331 #endif
3332         case MSR_IA32_SYSENTER_CS:
3333                 msr_info->data = svm->vmcb->save.sysenter_cs;
3334                 break;
3335         case MSR_IA32_SYSENTER_EIP:
3336                 msr_info->data = svm->sysenter_eip;
3337                 break;
3338         case MSR_IA32_SYSENTER_ESP:
3339                 msr_info->data = svm->sysenter_esp;
3340                 break;
3341         case MSR_TSC_AUX:
3342                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3343                         return 1;
3344                 msr_info->data = svm->tsc_aux;
3345                 break;
3346         /*
3347          * Nobody will change the following 5 values in the VMCB so we can
3348          * safely return them on rdmsr. They will always be 0 until LBRV is
3349          * implemented.
3350          */
3351         case MSR_IA32_DEBUGCTLMSR:
3352                 msr_info->data = svm->vmcb->save.dbgctl;
3353                 break;
3354         case MSR_IA32_LASTBRANCHFROMIP:
3355                 msr_info->data = svm->vmcb->save.br_from;
3356                 break;
3357         case MSR_IA32_LASTBRANCHTOIP:
3358                 msr_info->data = svm->vmcb->save.br_to;
3359                 break;
3360         case MSR_IA32_LASTINTFROMIP:
3361                 msr_info->data = svm->vmcb->save.last_excp_from;
3362                 break;
3363         case MSR_IA32_LASTINTTOIP:
3364                 msr_info->data = svm->vmcb->save.last_excp_to;
3365                 break;
3366         case MSR_VM_HSAVE_PA:
3367                 msr_info->data = svm->nested.hsave_msr;
3368                 break;
3369         case MSR_VM_CR:
3370                 msr_info->data = svm->nested.vm_cr_msr;
3371                 break;
3372         case MSR_IA32_UCODE_REV:
3373                 msr_info->data = 0x01000065;
3374                 break;
3375         case MSR_F15H_IC_CFG: {
3376
3377                 int family, model;
3378
3379                 family = guest_cpuid_family(vcpu);
3380                 model  = guest_cpuid_model(vcpu);
3381
3382                 if (family < 0 || model < 0)
3383                         return kvm_get_msr_common(vcpu, msr_info);
3384
3385                 msr_info->data = 0;
3386
3387                 if (family == 0x15 &&
3388                     (model >= 0x2 && model < 0x20))
3389                         msr_info->data = 0x1E;
3390                 }
3391                 break;
3392         default:
3393                 return kvm_get_msr_common(vcpu, msr_info);
3394         }
3395         return 0;
3396 }
3397
3398 static int rdmsr_interception(struct vcpu_svm *svm)
3399 {
3400         u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3401         struct msr_data msr_info;
3402
3403         msr_info.index = ecx;
3404         msr_info.host_initiated = false;
3405         if (svm_get_msr(&svm->vcpu, &msr_info)) {
3406                 trace_kvm_msr_read_ex(ecx);
3407                 kvm_inject_gp(&svm->vcpu, 0);
3408         } else {
3409                 trace_kvm_msr_read(ecx, msr_info.data);
3410
3411                 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3412                                    msr_info.data & 0xffffffff);
3413                 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3414                                    msr_info.data >> 32);
3415                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3416                 skip_emulated_instruction(&svm->vcpu);
3417         }
3418         return 1;
3419 }
3420
3421 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3422 {
3423         struct vcpu_svm *svm = to_svm(vcpu);
3424         int svm_dis, chg_mask;
3425
3426         if (data & ~SVM_VM_CR_VALID_MASK)
3427                 return 1;
3428
3429         chg_mask = SVM_VM_CR_VALID_MASK;
3430
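        /*
         * Once SVMDIS is set, the lock and disable bits can no longer be
         * changed; mask them out of the writable bits.
         */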
3431         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3432                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3433
3434         svm->nested.vm_cr_msr &= ~chg_mask;
3435         svm->nested.vm_cr_msr |= (data & chg_mask);
3436
3437         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3438
3439         /* check for svm_disable while efer.svme is set */
3440         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3441                 return 1;
3442
3443         return 0;
3444 }
3445
3446 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3447 {
3448         struct vcpu_svm *svm = to_svm(vcpu);
3449
3450         u32 ecx = msr->index;
3451         u64 data = msr->data;
3452         switch (ecx) {
3453         case MSR_IA32_TSC:
3454                 kvm_write_tsc(vcpu, msr);
3455                 break;
3456         case MSR_STAR:
3457                 svm->vmcb->save.star = data;
3458                 break;
3459 #ifdef CONFIG_X86_64
3460         case MSR_LSTAR:
3461                 svm->vmcb->save.lstar = data;
3462                 break;
3463         case MSR_CSTAR:
3464                 svm->vmcb->save.cstar = data;
3465                 break;
3466         case MSR_KERNEL_GS_BASE:
3467                 svm->vmcb->save.kernel_gs_base = data;
3468                 break;
3469         case MSR_SYSCALL_MASK:
3470                 svm->vmcb->save.sfmask = data;
3471                 break;
3472 #endif
3473         case MSR_IA32_SYSENTER_CS:
3474                 svm->vmcb->save.sysenter_cs = data;
3475                 break;
3476         case MSR_IA32_SYSENTER_EIP:
3477                 svm->sysenter_eip = data;
3478                 svm->vmcb->save.sysenter_eip = data;
3479                 break;
3480         case MSR_IA32_SYSENTER_ESP:
3481                 svm->sysenter_esp = data;
3482                 svm->vmcb->save.sysenter_esp = data;
3483                 break;
3484         case MSR_TSC_AUX:
3485                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3486                         return 1;
3487
3488                 /*
3489                  * This is rare, so we update the MSR here instead of using
3490                  * direct_access_msrs.  Doing that would require a rdmsr in
3491                  * svm_vcpu_put.
3492                  */
3493                 svm->tsc_aux = data;
3494                 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
3495                 break;
3496         case MSR_IA32_DEBUGCTLMSR:
3497                 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3498                         vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3499                                     __func__, data);
3500                         break;
3501                 }
3502                 if (data & DEBUGCTL_RESERVED_BITS)
3503                         return 1;
3504
3505                 svm->vmcb->save.dbgctl = data;
3506                 mark_dirty(svm->vmcb, VMCB_LBR);
3507                 if (data & (1ULL<<0))
3508                         svm_enable_lbrv(svm);
3509                 else
3510                         svm_disable_lbrv(svm);
3511                 break;
3512         case MSR_VM_HSAVE_PA:
3513                 svm->nested.hsave_msr = data;
3514                 break;
3515         case MSR_VM_CR:
3516                 return svm_set_vm_cr(vcpu, data);
3517         case MSR_VM_IGNNE:
3518                 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3519                 break;
3520         case MSR_IA32_APICBASE:
3521                 if (kvm_vcpu_apicv_active(vcpu))
3522                         avic_update_vapic_bar(to_svm(vcpu), data);
3523                 /* Fall through */
3524         default:
3525                 return kvm_set_msr_common(vcpu, msr);
3526         }
3527         return 0;
3528 }
3529
3530 static int wrmsr_interception(struct vcpu_svm *svm)
3531 {
3532         struct msr_data msr;
3533         u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3534         u64 data = kvm_read_edx_eax(&svm->vcpu);
3535
3536         msr.data = data;
3537         msr.index = ecx;
3538         msr.host_initiated = false;
3539
3540         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3541         if (kvm_set_msr(&svm->vcpu, &msr)) {
3542                 trace_kvm_msr_write_ex(ecx, data);
3543                 kvm_inject_gp(&svm->vcpu, 0);
3544         } else {
3545                 trace_kvm_msr_write(ecx, data);
3546                 skip_emulated_instruction(&svm->vcpu);
3547         }
3548         return 1;
3549 }
3550
3551 static int msr_interception(struct vcpu_svm *svm)
3552 {
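        /* exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */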
3553         if (svm->vmcb->control.exit_info_1)
3554                 return wrmsr_interception(svm);
3555         else
3556                 return rdmsr_interception(svm);
3557 }
3558
3559 static int interrupt_window_interception(struct vcpu_svm *svm)
3560 {
3561         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3562         svm_clear_vintr(svm);
3563         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3564         mark_dirty(svm->vmcb, VMCB_INTR);
3565         ++svm->vcpu.stat.irq_window_exits;
3566         return 1;
3567 }
3568
3569 static int pause_interception(struct vcpu_svm *svm)
3570 {
3571         kvm_vcpu_on_spin(&(svm->vcpu));
3572         return 1;
3573 }
3574
3575 static int nop_interception(struct vcpu_svm *svm)
3576 {
3577         skip_emulated_instruction(&(svm->vcpu));
3578         return 1;
3579 }
3580
3581 static int monitor_interception(struct vcpu_svm *svm)
3582 {
3583         printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3584         return nop_interception(svm);
3585 }
3586
3587 static int mwait_interception(struct vcpu_svm *svm)
3588 {
3589         printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3590         return nop_interception(svm);
3591 }
3592
3593 enum avic_ipi_failure_cause {
3594         AVIC_IPI_FAILURE_INVALID_INT_TYPE,
3595         AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
3596         AVIC_IPI_FAILURE_INVALID_TARGET,
3597         AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
3598 };
3599
3600 static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
3601 {
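        /*
         * exit_info_1 holds the guest's ICR write (ICRL in the low half,
         * ICRH in the high half); exit_info_2 holds the failure cause in
         * the high half and an index in the low bits.
         */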
3602         u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
3603         u32 icrl = svm->vmcb->control.exit_info_1;
3604         u32 id = svm->vmcb->control.exit_info_2 >> 32;
3605         u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
3606         struct kvm_lapic *apic = svm->vcpu.arch.apic;
3607
3608         trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
3609
3610         switch (id) {
3611         case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
3612                 /*
3613                  * AVIC hardware handles the generation of
3614                  * IPIs when the specified Message Type is Fixed
3615                  * (also known as fixed delivery mode) and
3616                  * the Trigger Mode is edge-triggered. The hardware
3617                  * also supports self and broadcast delivery modes
3618                  * specified via the Destination Shorthand (DSH)
3619                  * field of the ICRL. Logical and physical APIC ID
3620                  * formats are supported. All other IPI types cause
3621                  * a #VMEXIT, which needs to be emulated.
3622                  */
3623                 kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
3624                 kvm_lapic_reg_write(apic, APIC_ICR, icrl);
3625                 break;
3626         case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
3627                 int i;
3628                 struct kvm_vcpu *vcpu;
3629                 struct kvm *kvm = svm->vcpu.kvm;
3630                 struct kvm_lapic *apic = svm->vcpu.arch.apic;
3631
3632                 /*
3633                  * At this point, we expect that the AVIC HW has already
3634                  * set the appropriate IRR bits on the valid target
3635                  * vcpus. So, we just need to kick the appropriate vcpu.
3636                  */
3637                 kvm_for_each_vcpu(i, vcpu, kvm) {
3638                         bool m = kvm_apic_match_dest(vcpu, apic,
3639                                                      icrl & KVM_APIC_SHORT_MASK,
3640                                                      GET_APIC_DEST_FIELD(icrh),
3641                                                      icrl & KVM_APIC_DEST_MASK);
3642
3643                         if (m && !avic_vcpu_is_running(vcpu))
3644                                 kvm_vcpu_wake_up(vcpu);
3645                 }
3646                 break;
3647         }
3648         case AVIC_IPI_FAILURE_INVALID_TARGET:
3649                 break;
3650         case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
3651                 WARN_ONCE(1, "Invalid backing page\n");
3652                 break;
3653         default:
3654                 pr_err("Unknown IPI interception\n");
3655         }
3656
3657         return 1;
3658 }
3659
3660 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
3661 {
3662         struct kvm_arch *vm_data = &vcpu->kvm->arch;
3663         int index;
3664         u32 *logical_apic_id_table;
3665         int dlid = GET_APIC_LOGICAL_ID(ldr);
3666
3667         if (!dlid)
3668                 return NULL;
3669
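        /*
         * In flat mode the logical ID is a one-hot bitmap of up to eight
         * APICs, so the table index is the bit position.  In cluster mode
         * the high nibble selects the cluster and the low nibble is a
         * one-hot member mask, giving four table entries per cluster.
         */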
3670         if (flat) { /* flat */
3671                 index = ffs(dlid) - 1;
3672                 if (index > 7)
3673                         return NULL;
3674         } else { /* cluster */
3675                 int cluster = (dlid & 0xf0) >> 4;
3676                 int apic = ffs(dlid & 0x0f) - 1;
3677
3678                 if ((apic < 0) || (apic > 7) ||
3679                     (cluster >= 0xf))
3680                         return NULL;
3681                 index = (cluster << 2) + apic;
3682         }
3683
3684         logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
3685
3686         return &logical_apic_id_table[index];
3687 }
3688
3689 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
3690                           bool valid)
3691 {
3692         bool flat;
3693         u32 *entry, new_entry;
3694
3695         flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
3696         entry = avic_get_logical_id_entry(vcpu, ldr, flat);
3697         if (!entry)
3698                 return -EINVAL;
3699
3700         new_entry = READ_ONCE(*entry);
3701         new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
3702         new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
3703         if (valid)
3704                 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3705         else
3706                 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3707         WRITE_ONCE(*entry, new_entry);
3708
3709         return 0;
3710 }
3711
3712 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
3713 {
3714         int ret;
3715         struct vcpu_svm *svm = to_svm(vcpu);
3716         u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
3717
3718         if (!ldr)
3719                 return 1;
3720
3721         ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
3722         if (ret && svm->ldr_reg) {
3723                 avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
3724                 svm->ldr_reg = 0;
3725         } else {
3726                 svm->ldr_reg = ldr;
3727         }
3728         return ret;
3729 }
3730
3731 static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
3732 {
3733         u64 *old, *new;
3734         struct vcpu_svm *svm = to_svm(vcpu);
3735         u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
3736         u32 id = (apic_id_reg >> 24) & 0xff;
3737
3738         if (vcpu->vcpu_id == id)
3739                 return 0;
3740
3741         old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
3742         new = avic_get_physical_id_entry(vcpu, id);
3743         if (!new || !old)
3744                 return 1;
3745
3746         /* We need to move the physical_id_entry to its new offset */
3747         *new = *old;
3748         *old = 0ULL;
3749         to_svm(vcpu)->avic_physical_id_cache = new;
3750
3751         /*
3752          * Also update the guest physical APIC ID in the logical
3753          * APIC ID table entry if the LDR has already been set up.
3754          */
3755         if (svm->ldr_reg)
3756                 avic_handle_ldr_update(vcpu);
3757
3758         return 0;
3759 }
3760
3761 static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
3762 {
3763         struct vcpu_svm *svm = to_svm(vcpu);
3764         struct kvm_arch *vm_data = &vcpu->kvm->arch;
3765         u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
3766         u32 mod = (dfr >> 28) & 0xf;
3767
3768         /*
3769          * We assume that all local APICs are using the same addressing
3770          * mode (flat or cluster). If this changes, we need to flush
3771          * the AVIC logical APIC ID table.
3772          */
3773         if (vm_data->ldr_mode == mod)
3774                 return 0;
3775
3776         clear_page(page_address(vm_data->avic_logical_id_table_page));
3777         vm_data->ldr_mode = mod;
3778
3779         if (svm->ldr_reg)
3780                 avic_handle_ldr_update(vcpu);
3781         return 0;
3782 }
3783
3784 static int avic_unaccel_trap_write(struct vcpu_svm *svm)
3785 {
3786         struct kvm_lapic *apic = svm->vcpu.arch.apic;
3787         u32 offset = svm->vmcb->control.exit_info_1 &
3788                                 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3789
3790         switch (offset) {
3791         case APIC_ID:
3792                 if (avic_handle_apic_id_update(&svm->vcpu))
3793                         return 0;
3794                 break;
3795         case APIC_LDR:
3796                 if (avic_handle_ldr_update(&svm->vcpu))
3797                         return 0;
3798                 break;
3799         case APIC_DFR:
3800                 avic_handle_dfr_update(&svm->vcpu);
3801                 break;
3802         default:
3803                 break;
3804         }
3805
3806         kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
3807
3808         return 1;
3809 }
3810
3811 static bool is_avic_unaccelerated_access_trap(u32 offset)
3812 {
3813         bool ret = false;
3814
3815         switch (offset) {
3816         case APIC_ID:
3817         case APIC_EOI:
3818         case APIC_RRR:
3819         case APIC_LDR:
3820         case APIC_DFR:
3821         case APIC_SPIV:
3822         case APIC_ESR:
3823         case APIC_ICR:
3824         case APIC_LVTT:
3825         case APIC_LVTTHMR:
3826         case APIC_LVTPC:
3827         case APIC_LVT0:
3828         case APIC_LVT1:
3829         case APIC_LVTERR:
3830         case APIC_TMICT:
3831         case APIC_TDCR:
3832                 ret = true;
3833                 break;
3834         default:
3835                 break;
3836         }
3837         return ret;
3838 }
3839
3840 static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
3841 {
3842         int ret = 0;
3843         u32 offset = svm->vmcb->control.exit_info_1 &
3844                      AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3845         u32 vector = svm->vmcb->control.exit_info_2 &
3846                      AVIC_UNACCEL_ACCESS_VECTOR_MASK;
3847         bool write = (svm->vmcb->control.exit_info_1 >> 32) &
3848                      AVIC_UNACCEL_ACCESS_WRITE_MASK;
3849         bool trap = is_avic_unaccelerated_access_trap(offset);
3850
3851         trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
3852                                             trap, write, vector);
3853         if (trap) {
3854                 /* Handling Trap */
3855                 WARN_ONCE(!write, "svm: Handling trap read.\n");
3856                 ret = avic_unaccel_trap_write(svm);
3857         } else {
3858                 /* Handling Fault */
3859                 ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
3860         }
3861
3862         return ret;
3863 }
3864
3865 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3866         [SVM_EXIT_READ_CR0]                     = cr_interception,
3867         [SVM_EXIT_READ_CR3]                     = cr_interception,
3868         [SVM_EXIT_READ_CR4]                     = cr_interception,
3869         [SVM_EXIT_READ_CR8]                     = cr_interception,
3870         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3871         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3872         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3873         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3874         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3875         [SVM_EXIT_READ_DR0]                     = dr_interception,
3876         [SVM_EXIT_READ_DR1]                     = dr_interception,
3877         [SVM_EXIT_READ_DR2]                     = dr_interception,
3878         [SVM_EXIT_READ_DR3]                     = dr_interception,
3879         [SVM_EXIT_READ_DR4]                     = dr_interception,
3880         [SVM_EXIT_READ_DR5]                     = dr_interception,
3881         [SVM_EXIT_READ_DR6]                     = dr_interception,
3882         [SVM_EXIT_READ_DR7]                     = dr_interception,
3883         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3884         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3885         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3886         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3887         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3888         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3889         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3890         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3891         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3892         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3893         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3894         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3895         [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
3896         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3897         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3898         [SVM_EXIT_INTR]                         = intr_interception,
3899         [SVM_EXIT_NMI]                          = nmi_interception,
3900         [SVM_EXIT_SMI]                          = nop_on_interception,
3901         [SVM_EXIT_INIT]                         = nop_on_interception,
3902         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3903         [SVM_EXIT_RDPMC]                        = rdpmc_interception,
3904         [SVM_EXIT_CPUID]                        = cpuid_interception,
3905         [SVM_EXIT_IRET]                         = iret_interception,
3906         [SVM_EXIT_INVD]                         = emulate_on_interception,
3907         [SVM_EXIT_PAUSE]                        = pause_interception,
3908         [SVM_EXIT_HLT]                          = halt_interception,
3909         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3910         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3911         [SVM_EXIT_IOIO]                         = io_interception,
3912         [SVM_EXIT_MSR]                          = msr_interception,
3913         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3914         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3915         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3916         [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
3917         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3918         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3919         [SVM_EXIT_STGI]                         = stgi_interception,
3920         [SVM_EXIT_CLGI]                         = clgi_interception,
3921         [SVM_EXIT_SKINIT]                       = skinit_interception,
3922         [SVM_EXIT_WBINVD]                       = wbinvd_interception,
3923         [SVM_EXIT_MONITOR]                      = monitor_interception,
3924         [SVM_EXIT_MWAIT]                        = mwait_interception,
3925         [SVM_EXIT_XSETBV]                       = xsetbv_interception,
3926         [SVM_EXIT_NPF]                          = pf_interception,
3927         [SVM_EXIT_RSM]                          = emulate_on_interception,
3928         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3929         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3930 };
3931
3932 static void dump_vmcb(struct kvm_vcpu *vcpu)
3933 {
3934         struct vcpu_svm *svm = to_svm(vcpu);
3935         struct vmcb_control_area *control = &svm->vmcb->control;
3936         struct vmcb_save_area *save = &svm->vmcb->save;
3937
3938         pr_err("VMCB Control Area:\n");
3939         pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3940         pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3941         pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3942         pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3943         pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3944         pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3945         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3946         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3947         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3948         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3949         pr_err("%-20s%d\n", "asid:", control->asid);
3950         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3951         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3952         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3953         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3954         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3955         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3956         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3957         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3958         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3959         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3960         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3961         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3962         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3963         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3964         pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3965         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3966         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3967         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3968         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3969         pr_err("VMCB State Save Area:\n");
3970         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3971                "es:",
3972                save->es.selector, save->es.attrib,
3973                save->es.limit, save->es.base);
3974         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3975                "cs:",
3976                save->cs.selector, save->cs.attrib,
3977                save->cs.limit, save->cs.base);
3978         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3979                "ss:",
3980                save->ss.selector, save->ss.attrib,
3981                save->ss.limit, save->ss.base);
3982         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3983                "ds:",
3984                save->ds.selector, save->ds.attrib,
3985                save->ds.limit, save->ds.base);
3986         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3987                "fs:",
3988                save->fs.selector, save->fs.attrib,
3989                save->fs.limit, save->fs.base);
3990         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3991                "gs:",
3992                save->gs.selector, save->gs.attrib,
3993                save->gs.limit, save->gs.base);
3994         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3995                "gdtr:",
3996                save->gdtr.selector, save->gdtr.attrib,
3997                save->gdtr.limit, save->gdtr.base);
3998         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3999                "ldtr:",
4000                save->ldtr.selector, save->ldtr.attrib,
4001                save->ldtr.limit, save->ldtr.base);
4002         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4003                "idtr:",
4004                save->idtr.selector, save->idtr.attrib,
4005                save->idtr.limit, save->idtr.base);
4006         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4007                "tr:",
4008                save->tr.selector, save->tr.attrib,
4009                save->tr.limit, save->tr.base);
4010         pr_err("cpl:            %d                efer:         %016llx\n",
4011                 save->cpl, save->efer);
4012         pr_err("%-15s %016llx %-13s %016llx\n",
4013                "cr0:", save->cr0, "cr2:", save->cr2);
4014         pr_err("%-15s %016llx %-13s %016llx\n",
4015                "cr3:", save->cr3, "cr4:", save->cr4);
4016         pr_err("%-15s %016llx %-13s %016llx\n",
4017                "dr6:", save->dr6, "dr7:", save->dr7);
4018         pr_err("%-15s %016llx %-13s %016llx\n",
4019                "rip:", save->rip, "rflags:", save->rflags);
4020         pr_err("%-15s %016llx %-13s %016llx\n",
4021                "rsp:", save->rsp, "rax:", save->rax);
4022         pr_err("%-15s %016llx %-13s %016llx\n",
4023                "star:", save->star, "lstar:", save->lstar);
4024         pr_err("%-15s %016llx %-13s %016llx\n",
4025                "cstar:", save->cstar, "sfmask:", save->sfmask);
4026         pr_err("%-15s %016llx %-13s %016llx\n",
4027                "kernel_gs_base:", save->kernel_gs_base,
4028                "sysenter_cs:", save->sysenter_cs);
4029         pr_err("%-15s %016llx %-13s %016llx\n",
4030                "sysenter_esp:", save->sysenter_esp,
4031                "sysenter_eip:", save->sysenter_eip);
4032         pr_err("%-15s %016llx %-13s %016llx\n",
4033                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
4034         pr_err("%-15s %016llx %-13s %016llx\n",
4035                "br_from:", save->br_from, "br_to:", save->br_to);
4036         pr_err("%-15s %016llx %-13s %016llx\n",
4037                "excp_from:", save->last_excp_from,
4038                "excp_to:", save->last_excp_to);
4039 }
4040
4041 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
4042 {
4043         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4044
4045         *info1 = control->exit_info_1;
4046         *info2 = control->exit_info_2;
4047 }
4048
4049 static int handle_exit(struct kvm_vcpu *vcpu)
4050 {
4051         struct vcpu_svm *svm = to_svm(vcpu);
4052         struct kvm_run *kvm_run = vcpu->run;
4053         u32 exit_code = svm->vmcb->control.exit_code;
4054
4055         trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4056
4057         if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4058                 vcpu->arch.cr0 = svm->vmcb->save.cr0;
4059         if (npt_enabled)
4060                 vcpu->arch.cr3 = svm->vmcb->save.cr3;
4061
4062         if (unlikely(svm->nested.exit_required)) {
4063                 nested_svm_vmexit(svm);
4064                 svm->nested.exit_required = false;
4065
4066                 return 1;
4067         }
4068
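        /*
         * For a nested guest, first give exits that L0 must always handle
         * a chance (nested_svm_exit_special); anything else that L1 has
         * intercepted is reflected back to it as a nested #VMEXIT.
         */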
4069         if (is_guest_mode(vcpu)) {
4070                 int vmexit;
4071
4072                 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
4073                                         svm->vmcb->control.exit_info_1,
4074                                         svm->vmcb->control.exit_info_2,
4075                                         svm->vmcb->control.exit_int_info,
4076                                         svm->vmcb->control.exit_int_info_err,
4077                                         KVM_ISA_SVM);
4078
4079                 vmexit = nested_svm_exit_special(svm);
4080
4081                 if (vmexit == NESTED_EXIT_CONTINUE)
4082                         vmexit = nested_svm_exit_handled(svm);
4083
4084                 if (vmexit == NESTED_EXIT_DONE)
4085                         return 1;
4086         }
4087
4088         svm_complete_interrupts(svm);
4089
4090         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
4091                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4092                 kvm_run->fail_entry.hardware_entry_failure_reason
4093                         = svm->vmcb->control.exit_code;
4094                 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
4095                 dump_vmcb(vcpu);
4096                 return 0;
4097         }
4098
4099         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
4100             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
4101             exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
4102             exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
4103                 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
4104                        "exit_code 0x%x\n",
4105                        __func__, svm->vmcb->control.exit_int_info,
4106                        exit_code);
4107
4108         if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
4109             || !svm_exit_handlers[exit_code]) {
4110                 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
4111                 kvm_queue_exception(vcpu, UD_VECTOR);
4112                 return 1;
4113         }
4114
4115         return svm_exit_handlers[exit_code](svm);
4116 }
4117
4118 static void reload_tss(struct kvm_vcpu *vcpu)
4119 {
4120         int cpu = raw_smp_processor_id();
4121
4122         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4123         sd->tss_desc->type = 9; /* available 32/64-bit TSS */
4124         load_TR_desc();
4125 }
4126
4127 static void pre_svm_run(struct vcpu_svm *svm)
4128 {
4129         int cpu = raw_smp_processor_id();
4130
4131         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4132
4133         /* FIXME: handle wraparound of asid_generation */
4134         if (svm->asid_generation != sd->asid_generation)
4135                 new_asid(svm, sd);
4136 }
4137
4138 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
4139 {
4140         struct vcpu_svm *svm = to_svm(vcpu);
4141
4142         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
4143         vcpu->arch.hflags |= HF_NMI_MASK;
4144         set_intercept(svm, INTERCEPT_IRET);
4145         ++vcpu->stat.nmi_injections;
4146 }
4147
4148 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
4149 {
4150         struct vmcb_control_area *control;
4151
4152         /* The following fields are ignored when AVIC is enabled */
4153         control = &svm->vmcb->control;
4154         control->int_vector = irq;
4155         control->int_ctl &= ~V_INTR_PRIO_MASK;
4156         control->int_ctl |= V_IRQ_MASK |
4157                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
4158         mark_dirty(svm->vmcb, VMCB_INTR);
4159 }
4160
4161 static void svm_set_irq(struct kvm_vcpu *vcpu)
4162 {
4163         struct vcpu_svm *svm = to_svm(vcpu);
4164
4165         BUG_ON(!(gif_set(svm)));
4166
4167         trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
4168         ++vcpu->stat.irq_injections;
4169
4170         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
4171                 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
4172 }
4173
4174 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
4175 {
4176         return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
4177 }
4178
4179 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
4180 {
4181         struct vcpu_svm *svm = to_svm(vcpu);
4182
4183         if (svm_nested_virtualize_tpr(vcpu) ||
4184             kvm_vcpu_apicv_active(vcpu))
4185                 return;
4186
4187         clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4188
4189         if (irr == -1)
4190                 return;
4191
4192         if (tpr >= irr)
4193                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4194 }
4195
4196 static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
4197 {
4198         return;
4199 }
4200
4201 static bool svm_get_enable_apicv(void)
4202 {
4203         return avic;
4204 }
4205
4206 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
4207 {
4208 }
4209
4210 static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
4211 {
4212 }
4213
4214 /* Note: Currently only used by Hyper-V. */
4215 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4216 {
4217         struct vcpu_svm *svm = to_svm(vcpu);
4218         struct vmcb *vmcb = svm->vmcb;
4219
4220         if (!avic)
4221                 return;
4222
4223         vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
4224         mark_dirty(vmcb, VMCB_INTR);
4225 }
4226
4227 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
4228 {
4229         return;
4230 }
4231
4232 static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4233 {
4234         return;
4235 }
4236
4237 static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4238 {
4239         kvm_lapic_set_irr(vec, vcpu->arch.apic);
4240         smp_mb__after_atomic();
4241
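        /*
         * The barrier orders the IRR update against the is-running check.
         * If the target vCPU is executing guest code, ring the AVIC
         * doorbell so the hardware delivers the interrupt; otherwise wake
         * the vCPU so it notices the new IRR bit on its next entry.
         */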
4242         if (avic_vcpu_is_running(vcpu))
4243                 wrmsrl(SVM_AVIC_DOORBELL,
4244                        kvm_cpu_get_apicid(vcpu->cpu));
4245         else
4246                 kvm_vcpu_wake_up(vcpu);
4247 }
4248
4249 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
4250 {
4251         struct vcpu_svm *svm = to_svm(vcpu);
4252         struct vmcb *vmcb = svm->vmcb;
4253         int ret;
4254         ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
4255               !(svm->vcpu.arch.hflags & HF_NMI_MASK);
4256         ret = ret && gif_set(svm) && nested_svm_nmi(svm);
4257
4258         return ret;
4259 }
4260
4261 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
4262 {
4263         struct vcpu_svm *svm = to_svm(vcpu);
4264
4265         return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
4266 }
4267
4268 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4269 {
4270         struct vcpu_svm *svm = to_svm(vcpu);
4271
4272         if (masked) {
4273                 svm->vcpu.arch.hflags |= HF_NMI_MASK;
4274                 set_intercept(svm, INTERCEPT_IRET);
4275         } else {
4276                 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
4277                 clr_intercept(svm, INTERCEPT_IRET);
4278         }
4279 }
4280
4281 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
4282 {
4283         struct vcpu_svm *svm = to_svm(vcpu);
4284         struct vmcb *vmcb = svm->vmcb;
4285         int ret;
4286
4287         if (!gif_set(svm) ||
4288              (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
4289                 return 0;
4290
4291         ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
4292
4293         if (is_guest_mode(vcpu))
4294                 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
4295
4296         return ret;
4297 }
4298
4299 static void enable_irq_window(struct kvm_vcpu *vcpu)
4300 {
4301         struct vcpu_svm *svm = to_svm(vcpu);
4302
4303         if (kvm_vcpu_apicv_active(vcpu))
4304                 return;
4305
4306         /*
4307          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
4308          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
4309          * get that intercept, this function will be called again and
4310          * we'll get the vintr intercept then.
4311          */
4312         if (gif_set(svm) && nested_svm_intr(svm)) {
4313                 svm_set_vintr(svm);
4314                 svm_inject_irq(svm, 0x0);
4315         }
4316 }
4317
4318 static void enable_nmi_window(struct kvm_vcpu *vcpu)
4319 {
4320         struct vcpu_svm *svm = to_svm(vcpu);
4321
4322         if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
4323             == HF_NMI_MASK)
4324                 return; /* IRET will cause a vm exit */
4325
4326         /*
4327          * Something prevents NMI from being injected. Single-step over the
4328          * possible problem (IRET, exception injection or interrupt shadow).
4329          */
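        /*
         * The trap flag makes the guest take a #DB after one instruction
         * (RF suppresses a spurious instruction breakpoint on it); the
         * resulting #DB intercept lets NMI injection be retried once the
         * blocking condition has passed.
         */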
4330         svm->nmi_singlestep = true;
4331         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
4332 }
4333
4334 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
4335 {
4336         return 0;
4337 }
4338
4339 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
4340 {
4341         struct vcpu_svm *svm = to_svm(vcpu);
4342
4343         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4344                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4345         else
4346                 svm->asid_generation--;
4347 }
4348
4349 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
4350 {
4351 }
4352
4353 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4354 {
4355         struct vcpu_svm *svm = to_svm(vcpu);
4356
4357         if (svm_nested_virtualize_tpr(vcpu))
4358                 return;
4359
4360         if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
4361                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4362                 kvm_set_cr8(vcpu, cr8);
4363         }
4364 }
4365
4366 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
4367 {
4368         struct vcpu_svm *svm = to_svm(vcpu);
4369         u64 cr8;
4370
4371         if (svm_nested_virtualize_tpr(vcpu) ||
4372             kvm_vcpu_apicv_active(vcpu))
4373                 return;
4374
4375         cr8 = kvm_get_cr8(vcpu);
4376         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
4377         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
4378 }
4379
4380 static void svm_complete_interrupts(struct vcpu_svm *svm)
4381 {
4382         u8 vector;
4383         int type;
4384         u32 exitintinfo = svm->vmcb->control.exit_int_info;
4385         unsigned int3_injected = svm->int3_injected;
4386
4387         svm->int3_injected = 0;
4388
4389         /*
4390          * If we've made progress since setting HF_IRET_MASK, we've
4391          * executed an IRET and can allow NMI injection.
4392          */
4393         if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
4394             && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
4395                 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
4396                 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4397         }
4398
4399         svm->vcpu.arch.nmi_injected = false;
4400         kvm_clear_exception_queue(&svm->vcpu);
4401         kvm_clear_interrupt_queue(&svm->vcpu);
4402
4403         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4404                 return;
4405
4406         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4407
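        /*
         * EXITINTINFO describes an event whose delivery was interrupted
         * by the #VMEXIT; requeue it so it is not lost, except for
         * software exceptions, which are re-executed instead.
         */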
4408         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4409         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4410
4411         switch (type) {
4412         case SVM_EXITINTINFO_TYPE_NMI:
4413                 svm->vcpu.arch.nmi_injected = true;
4414                 break;
4415         case SVM_EXITINTINFO_TYPE_EXEPT:
4416                 /*
4417                  * In case of software exceptions, do not reinject the vector,
4418                  * but re-execute the instruction instead. Rewind RIP first
4419                  * if we emulated INT3 before.
4420                  */
4421                 if (kvm_exception_is_soft(vector)) {
4422                         if (vector == BP_VECTOR && int3_injected &&
4423                             kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
4424                                 kvm_rip_write(&svm->vcpu,
4425                                               kvm_rip_read(&svm->vcpu) -
4426                                               int3_injected);
4427                         break;
4428                 }
4429                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4430                         u32 err = svm->vmcb->control.exit_int_info_err;
4431                         kvm_requeue_exception_e(&svm->vcpu, vector, err);
4432
4433                 } else
4434                         kvm_requeue_exception(&svm->vcpu, vector);
4435                 break;
4436         case SVM_EXITINTINFO_TYPE_INTR:
4437                 kvm_queue_interrupt(&svm->vcpu, vector, false);
4438                 break;
4439         default:
4440                 break;
4441         }
4442 }
4443
4444 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4445 {
4446         struct vcpu_svm *svm = to_svm(vcpu);
4447         struct vmcb_control_area *control = &svm->vmcb->control;
4448
4449         control->exit_int_info = control->event_inj;
4450         control->exit_int_info_err = control->event_inj_err;
4451         control->event_inj = 0;
4452         svm_complete_interrupts(svm);
4453 }
4454
4455 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
4456 {
4457         struct vcpu_svm *svm = to_svm(vcpu);
4458
4459         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4460         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4461         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4462
4463         /*
4464          * A vmexit emulation is required before the vcpu can be executed
4465          * again.
4466          */
4467         if (unlikely(svm->nested.exit_required))
4468                 return;
4469
4470         pre_svm_run(svm);
4471
4472         sync_lapic_to_cr8(vcpu);
4473
4474         svm->vmcb->save.cr2 = vcpu->arch.cr2;
4475
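        /*
         * Clearing GIF holds off physical interrupts and NMIs while host
         * IRQs are briefly enabled around VMRUN; pending NMIs fire once
         * stgi() sets GIF again below.
         */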
4476         clgi();
4477
4478         local_irq_enable();
4479
4480         asm volatile (
4481                 "push %%" _ASM_BP "; \n\t"
4482                 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
4483                 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
4484                 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
4485                 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
4486                 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
4487                 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
4488 #ifdef CONFIG_X86_64
4489                 "mov %c[r8](%[svm]),  %%r8  \n\t"
4490                 "mov %c[r9](%[svm]),  %%r9  \n\t"
4491                 "mov %c[r10](%[svm]), %%r10 \n\t"
4492                 "mov %c[r11](%[svm]), %%r11 \n\t"
4493                 "mov %c[r12](%[svm]), %%r12 \n\t"
4494                 "mov %c[r13](%[svm]), %%r13 \n\t"
4495                 "mov %c[r14](%[svm]), %%r14 \n\t"
4496                 "mov %c[r15](%[svm]), %%r15 \n\t"
4497 #endif
4498
4499                 /* Enter guest mode */
4500                 "push %%" _ASM_AX " \n\t"
4501                 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
4502                 __ex(SVM_VMLOAD) "\n\t"
4503                 __ex(SVM_VMRUN) "\n\t"
4504                 __ex(SVM_VMSAVE) "\n\t"
4505                 "pop %%" _ASM_AX " \n\t"
4506
4507                 /* Save guest registers, load host registers */
4508                 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
4509                 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
4510                 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
4511                 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
4512                 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
4513                 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
4514 #ifdef CONFIG_X86_64
4515                 "mov %%r8,  %c[r8](%[svm]) \n\t"
4516                 "mov %%r9,  %c[r9](%[svm]) \n\t"
4517                 "mov %%r10, %c[r10](%[svm]) \n\t"
4518                 "mov %%r11, %c[r11](%[svm]) \n\t"
4519                 "mov %%r12, %c[r12](%[svm]) \n\t"
4520                 "mov %%r13, %c[r13](%[svm]) \n\t"
4521                 "mov %%r14, %c[r14](%[svm]) \n\t"
4522                 "mov %%r15, %c[r15](%[svm]) \n\t"
4523 #endif
4524                 "pop %%" _ASM_BP
4525                 :
4526                 : [svm]"a"(svm),
4527                   [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
4528                   [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
4529                   [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
4530                   [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
4531                   [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
4532                   [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
4533                   [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
4534 #ifdef CONFIG_X86_64
4535                   , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
4536                   [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
4537                   [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
4538                   [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
4539                   [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
4540                   [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
4541                   [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
4542                   [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
4543 #endif
4544                 : "cc", "memory"
4545 #ifdef CONFIG_X86_64
4546                 , "rbx", "rcx", "rdx", "rsi", "rdi"
4547                 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
4548 #else
4549                 , "ebx", "ecx", "edx", "esi", "edi"
4550 #endif
4551                 );
4552
4553 #ifdef CONFIG_X86_64
4554         wrmsrl(MSR_GS_BASE, svm->host.gs_base);
4555 #else
4556         loadsegment(fs, svm->host.fs);
4557 #ifndef CONFIG_X86_32_LAZY_GS
4558         loadsegment(gs, svm->host.gs);
4559 #endif
4560 #endif
4561
4562         reload_tss(vcpu);
4563
4564         local_irq_disable();
4565
4566         vcpu->arch.cr2 = svm->vmcb->save.cr2;
4567         vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4568         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4569         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4570
4571         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4572                 kvm_before_handle_nmi(&svm->vcpu);
4573
4574         stgi();
4575
4576         /* Any pending NMI will happen here */
4577
4578         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4579                 kvm_after_handle_nmi(&svm->vcpu);
4580
4581         sync_cr8_to_lapic(vcpu);
4582
4583         svm->next_rip = 0;
4584
4585         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4586
4587         /* if exit due to PF check for async PF */
4588         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4589                 svm->apf_reason = kvm_read_and_reset_pf_reason();
4590
4591         if (npt_enabled) {
4592                 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
4593                 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
4594         }
4595
4596         /*
4597          * We need to handle the #MC intercept here, before the vcpu
4598          * can be scheduled onto a different physical CPU.
4599          */
4600         if (unlikely(svm->vmcb->control.exit_code ==
4601                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4602                 svm_handle_mce(svm);
4603
4604         mark_all_clean(svm->vmcb);
4605 }
4606
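/*
 * Shadow-paging (non-NPT) path: load the new root into the VMCB's guest
 * CR3 field, mark the CR state dirty so the hardware reloads it, and
 * flush this vcpu's TLB entries.
 */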
4607 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4608 {
4609         struct vcpu_svm *svm = to_svm(vcpu);
4610
4611         svm->vmcb->save.cr3 = root;
4612         mark_dirty(svm->vmcb, VMCB_CR);
4613         svm_flush_tlb(vcpu);
4614 }
4615
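/*
 * NPT path: the root passed in is the nested page table root and goes
 * into control.nested_cr3; the guest-visible CR3 is mirrored into the
 * save area as well (see the comment below) so that saved VMCB state
 * stays consistent.
 */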
4616 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4617 {
4618         struct vcpu_svm *svm = to_svm(vcpu);
4619
4620         svm->vmcb->control.nested_cr3 = root;
4621         mark_dirty(svm->vmcb, VMCB_NPT);
4622
4623         /* Also sync guest cr3 here in case we live migrate */
4624         svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
4625         mark_dirty(svm->vmcb, VMCB_CR);
4626
4627         svm_flush_tlb(vcpu);
4628 }
4629
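/*
 * Check the SVMDIS bit in the VM_CR MSR; firmware can set (and lock) it
 * to prevent EFER.SVME from being enabled, in which case KVM cannot use
 * SVM at all.
 */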
4630 static int is_disabled(void)
4631 {
4632         u64 vm_cr;
4633
4634         rdmsrl(MSR_VM_CR, vm_cr);
4635         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4636                 return 1;
4637
4638         return 0;
4639 }
4640
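/*
 * 0x0f 0x01 0xd9 is the encoding of VMMCALL.  Generic KVM code uses this
 * hook to patch the guest's hypercall instruction with the vendor-native
 * one, so guests running on SVM execute VMMCALL rather than Intel's
 * VMCALL.
 */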
4641 static void
4642 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4643 {
4644         /*
4645          * Patch in the VMMCALL instruction:
4646          */
4647         hypercall[0] = 0x0f;
4648         hypercall[1] = 0x01;
4649         hypercall[2] = 0xd9;
4650 }
4651
4652 static void svm_check_processor_compat(void *rtn)
4653 {
4654         *(int *)rtn = 0;
4655 }
4656
4657 static bool svm_cpu_has_accelerated_tpr(void)
4658 {
4659         return false;
4660 }
4661
4662 static bool svm_has_high_real_mode_segbase(void)
4663 {
4664         return true;
4665 }
4666
4667 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4668 {
4669         return 0;
4670 }
4671
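/*
 * Refresh per-vcpu state that depends on the guest's CPUID: cache whether
 * the guest has been given the NRIPS feature and, when AVIC is active,
 * hide x2APIC from leaf 1, since this AVIC implementation only
 * accelerates the MMIO-based xAPIC interface.
 */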
4672 static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4673 {
4674         struct vcpu_svm *svm = to_svm(vcpu);
4675         struct kvm_cpuid_entry2 *entry;
4676
4677         /* Update nrips enabled cache */
4678         svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
4679
4680         if (!kvm_vcpu_apicv_active(vcpu))
4681                 return;
4682
4683         entry = kvm_find_cpuid_entry(vcpu, 1, 0);
4684         if (entry)
4685                 entry->ecx &= ~bit(X86_FEATURE_X2APIC);
4686 }
4687
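/*
 * Adjust the CPUID bits KVM reports as supported: hide x2APIC when AVIC
 * is enabled, advertise the SVM bit in leaf 0x80000001 when nesting is
 * allowed, and describe the virtual SVM implementation (revision, ASID
 * count and the NRIP/NPT features) in leaf 0x8000000A for use by a
 * nested hypervisor.
 */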
4688 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4689 {
4690         switch (func) {
4691         case 0x1:
4692                 if (avic)
4693                         entry->ecx &= ~bit(X86_FEATURE_X2APIC);
4694                 break;
4695         case 0x80000001:
4696                 if (nested)
4697                         entry->ecx |= (1 << 2); /* Set SVM bit */
4698                 break;
4699         case 0x8000000A:
4700                 entry->eax = 1; /* SVM revision 1 */
4701                 entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
4702                                    ASID emulation to nested SVM */
4703                 entry->ecx = 0; /* Reserved */
4704                 entry->edx = 0; /* By default do not support any
4705                                    additional features */
4706
4707                 /* Support next_rip if host supports it */
4708                 if (boot_cpu_has(X86_FEATURE_NRIPS))
4709                         entry->edx |= SVM_FEATURE_NRIP;
4710
4711                 /* Support NPT for the guest if enabled */
4712                 if (npt_enabled)
4713                         entry->edx |= SVM_FEATURE_NPT;
4714
4715                 break;
4716         }
4717 }
4718
4719 static int svm_get_lpage_level(void)
4720 {
4721         return PT_PDPE_LEVEL;
4722 }
4723
4724 static bool svm_rdtscp_supported(void)
4725 {
4726         return boot_cpu_has(X86_FEATURE_RDTSCP);
4727 }
4728
4729 static bool svm_invpcid_supported(void)
4730 {
4731         return false;
4732 }
4733
4734 static bool svm_mpx_supported(void)
4735 {
4736         return false;
4737 }
4738
4739 static bool svm_xsaves_supported(void)
4740 {
4741         return false;
4742 }
4743
4744 static bool svm_has_wbinvd_exit(void)
4745 {
4746         return true;
4747 }
4748
4749 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4750 {
4751         struct vcpu_svm *svm = to_svm(vcpu);
4752
4753         set_exception_intercept(svm, NM_VECTOR);
4754         update_cr0_intercept(svm);
4755 }
4756
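/*
 * Mapping from the x86 emulator's generic intercept codes to the SVM exit
 * code that a nested (L1) hypervisor would see, plus the emulation stage
 * at which the intercept check has to be performed.  svm_check_intercept()
 * consults this table while emulating instructions on behalf of an L2
 * guest.
 */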
4757 #define PRE_EX(exit)  { .exit_code = (exit), \
4758                         .stage = X86_ICPT_PRE_EXCEPT, }
4759 #define POST_EX(exit) { .exit_code = (exit), \
4760                         .stage = X86_ICPT_POST_EXCEPT, }
4761 #define POST_MEM(exit) { .exit_code = (exit), \
4762                         .stage = X86_ICPT_POST_MEMACCESS, }
4763
4764 static const struct __x86_intercept {
4765         u32 exit_code;
4766         enum x86_intercept_stage stage;
4767 } x86_intercept_map[] = {
4768         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4769         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4770         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4771         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4772         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4773         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4774         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4775         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4776         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4777         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4778         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4779         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4780         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4781         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4782         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4783         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4784         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4785         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4786         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4787         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4788         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4789         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4790         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4791         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4792         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4793         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4794         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4795         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4796         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4797         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4798         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4799         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4800         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4801         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4802         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4803         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4804         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4805         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4806         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4807         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4808         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4809         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4810         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4811         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4812         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4813         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4814 };
4815
4816 #undef PRE_EX
4817 #undef POST_EX
4818 #undef POST_MEM
4819
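/*
 * Called by the instruction emulator while a nested guest is running.
 * Translate the emulator's intercept code into the matching SVM exit code,
 * fill in exit_info_1/exit_info_2 where the exit defines them (selective
 * CR0 writes, MSR direction, IOIO port/size/direction), and ask the nested
 * code whether L1 intercepts the instruction; if so, emulation is aborted
 * with X86EMUL_INTERCEPTED so a nested #VMEXIT can be delivered.
 */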
4820 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4821                                struct x86_instruction_info *info,
4822                                enum x86_intercept_stage stage)
4823 {
4824         struct vcpu_svm *svm = to_svm(vcpu);
4825         int vmexit, ret = X86EMUL_CONTINUE;
4826         struct __x86_intercept icpt_info;
4827         struct vmcb *vmcb = svm->vmcb;
4828
4829         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4830                 goto out;
4831
4832         icpt_info = x86_intercept_map[info->intercept];
4833
4834         if (stage != icpt_info.stage)
4835                 goto out;
4836
4837         switch (icpt_info.exit_code) {
4838         case SVM_EXIT_READ_CR0:
4839                 if (info->intercept == x86_intercept_cr_read)
4840                         icpt_info.exit_code += info->modrm_reg;
4841                 break;
4842         case SVM_EXIT_WRITE_CR0: {
4843                 unsigned long cr0, val;
4844                 u64 intercept;
4845
4846                 if (info->intercept == x86_intercept_cr_write)
4847                         icpt_info.exit_code += info->modrm_reg;
4848
4849                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4850                     info->intercept == x86_intercept_clts)
4851                         break;
4852
4853                 intercept = svm->nested.intercept;
4854
4855                 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4856                         break;
4857
4858                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4859                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4860
4861                 if (info->intercept == x86_intercept_lmsw) {
4862                         cr0 &= 0xfUL;
4863                         val &= 0xfUL;
4864                         /* lmsw can't clear PE - catch this here */
4865                         if (cr0 & X86_CR0_PE)
4866                                 val |= X86_CR0_PE;
4867                 }
4868
4869                 if (cr0 ^ val)
4870                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4871
4872                 break;
4873         }
4874         case SVM_EXIT_READ_DR0:
4875         case SVM_EXIT_WRITE_DR0:
4876                 icpt_info.exit_code += info->modrm_reg;
4877                 break;
4878         case SVM_EXIT_MSR:
4879                 if (info->intercept == x86_intercept_wrmsr)
4880                         vmcb->control.exit_info_1 = 1;
4881                 else
4882                         vmcb->control.exit_info_1 = 0;
4883                 break;
4884         case SVM_EXIT_PAUSE:
4885                 /*
4886                  * The emulator reports this intercept for NOP too, but
4887                  * PAUSE is REP NOP; check for the REP prefix here.
4888                  */
4889                 if (info->rep_prefix != REPE_PREFIX)
4890                         goto out;
                     break;
4891         case SVM_EXIT_IOIO: {
4892                 u64 exit_info;
4893                 u32 bytes;
4894
4895                 if (info->intercept == x86_intercept_in ||
4896                     info->intercept == x86_intercept_ins) {
4897                         exit_info = ((info->src_val & 0xffff) << 16) |
4898                                 SVM_IOIO_TYPE_MASK;
4899                         bytes = info->dst_bytes;
4900                 } else {
4901                         exit_info = (info->dst_val & 0xffff) << 16;
4902                         bytes = info->src_bytes;
4903                 }
4904
4905                 if (info->intercept == x86_intercept_outs ||
4906                     info->intercept == x86_intercept_ins)
4907                         exit_info |= SVM_IOIO_STR_MASK;
4908
4909                 if (info->rep_prefix)
4910                         exit_info |= SVM_IOIO_REP_MASK;
4911
4912                 bytes = min(bytes, 4u);
4913
4914                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4915
4916                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4917
4918                 vmcb->control.exit_info_1 = exit_info;
4919                 vmcb->control.exit_info_2 = info->next_rip;
4920
4921                 break;
4922         }
4923         default:
4924                 break;
4925         }
4926
4927         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4928         if (static_cpu_has(X86_FEATURE_NRIPS))
4929                 vmcb->control.next_rip  = info->next_rip;
4930         vmcb->control.exit_code = icpt_info.exit_code;
4931         vmexit = nested_svm_exit_handled(svm);
4932
4933         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4934                                            : X86EMUL_CONTINUE;
4935
4936 out:
4937         return ret;
4938 }
4939
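/*
 * On SVM the physical interrupt that triggered the INTR #VMEXIT is still
 * pending; re-enabling interrupts here is enough for the host to take it
 * through its normal IDT handler.
 */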
4940 static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4941 {
4942         local_irq_enable();
4943 }
4944
4945 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4946 {
4947 }
4948
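/*
 * After userspace restores the APIC register state (e.g. following
 * migration), re-derive the AVIC backing structures from the new APIC ID,
 * DFR and LDR values.
 */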
4949 static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
4950 {
4951         if (avic_handle_apic_id_update(vcpu) != 0)
4952                 return;
4953         if (avic_handle_dfr_update(vcpu) != 0)
4954                 return;
4955         avic_handle_ldr_update(vcpu);
4956 }
4957
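/*
 * Table of vendor-specific callbacks handed to the generic KVM module;
 * architecture-neutral code in kvm.ko dispatches into SVM through these
 * pointers.
 */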
4958 static struct kvm_x86_ops svm_x86_ops = {
4959         .cpu_has_kvm_support = has_svm,
4960         .disabled_by_bios = is_disabled,
4961         .hardware_setup = svm_hardware_setup,
4962         .hardware_unsetup = svm_hardware_unsetup,
4963         .check_processor_compatibility = svm_check_processor_compat,
4964         .hardware_enable = svm_hardware_enable,
4965         .hardware_disable = svm_hardware_disable,
4966         .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4967         .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
4968
4969         .vcpu_create = svm_create_vcpu,
4970         .vcpu_free = svm_free_vcpu,
4971         .vcpu_reset = svm_vcpu_reset,
4972
4973         .vm_init = avic_vm_init,
4974         .vm_destroy = avic_vm_destroy,
4975
4976         .prepare_guest_switch = svm_prepare_guest_switch,
4977         .vcpu_load = svm_vcpu_load,
4978         .vcpu_put = svm_vcpu_put,
4979         .vcpu_blocking = svm_vcpu_blocking,
4980         .vcpu_unblocking = svm_vcpu_unblocking,
4981
4982         .update_bp_intercept = update_bp_intercept,
4983         .get_msr = svm_get_msr,
4984         .set_msr = svm_set_msr,
4985         .get_segment_base = svm_get_segment_base,
4986         .get_segment = svm_get_segment,
4987         .set_segment = svm_set_segment,
4988         .get_cpl = svm_get_cpl,
4989         .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4990         .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4991         .decache_cr3 = svm_decache_cr3,
4992         .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
4993         .set_cr0 = svm_set_cr0,
4994         .set_cr3 = svm_set_cr3,
4995         .set_cr4 = svm_set_cr4,
4996         .set_efer = svm_set_efer,
4997         .get_idt = svm_get_idt,
4998         .set_idt = svm_set_idt,
4999         .get_gdt = svm_get_gdt,
5000         .set_gdt = svm_set_gdt,
5001         .get_dr6 = svm_get_dr6,
5002         .set_dr6 = svm_set_dr6,
5003         .set_dr7 = svm_set_dr7,
5004         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
5005         .cache_reg = svm_cache_reg,
5006         .get_rflags = svm_get_rflags,
5007         .set_rflags = svm_set_rflags,
5008
5009         .get_pkru = svm_get_pkru,
5010
5011         .fpu_activate = svm_fpu_activate,
5012         .fpu_deactivate = svm_fpu_deactivate,
5013
5014         .tlb_flush = svm_flush_tlb,
5015
5016         .run = svm_vcpu_run,
5017         .handle_exit = handle_exit,
5018         .skip_emulated_instruction = skip_emulated_instruction,
5019         .set_interrupt_shadow = svm_set_interrupt_shadow,
5020         .get_interrupt_shadow = svm_get_interrupt_shadow,
5021         .patch_hypercall = svm_patch_hypercall,
5022         .set_irq = svm_set_irq,
5023         .set_nmi = svm_inject_nmi,
5024         .queue_exception = svm_queue_exception,
5025         .cancel_injection = svm_cancel_injection,
5026         .interrupt_allowed = svm_interrupt_allowed,
5027         .nmi_allowed = svm_nmi_allowed,
5028         .get_nmi_mask = svm_get_nmi_mask,
5029         .set_nmi_mask = svm_set_nmi_mask,
5030         .enable_nmi_window = enable_nmi_window,
5031         .enable_irq_window = enable_irq_window,
5032         .update_cr8_intercept = update_cr8_intercept,
5033         .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
5034         .get_enable_apicv = svm_get_enable_apicv,
5035         .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
5036         .load_eoi_exitmap = svm_load_eoi_exitmap,
5037         .sync_pir_to_irr = svm_sync_pir_to_irr,
5038         .hwapic_irr_update = svm_hwapic_irr_update,
5039         .hwapic_isr_update = svm_hwapic_isr_update,
5040         .apicv_post_state_restore = avic_post_state_restore,
5041
5042         .set_tss_addr = svm_set_tss_addr,
5043         .get_tdp_level = get_npt_level,
5044         .get_mt_mask = svm_get_mt_mask,
5045
5046         .get_exit_info = svm_get_exit_info,
5047
5048         .get_lpage_level = svm_get_lpage_level,
5049
5050         .cpuid_update = svm_cpuid_update,
5051
5052         .rdtscp_supported = svm_rdtscp_supported,
5053         .invpcid_supported = svm_invpcid_supported,
5054         .mpx_supported = svm_mpx_supported,
5055         .xsaves_supported = svm_xsaves_supported,
5056
5057         .set_supported_cpuid = svm_set_supported_cpuid,
5058
5059         .has_wbinvd_exit = svm_has_wbinvd_exit,
5060
5061         .read_tsc_offset = svm_read_tsc_offset,
5062         .write_tsc_offset = svm_write_tsc_offset,
5063         .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
5064         .read_l1_tsc = svm_read_l1_tsc,
5065
5066         .set_tdp_cr3 = set_tdp_cr3,
5067
5068         .check_intercept = svm_check_intercept,
5069         .handle_external_intr = svm_handle_external_intr,
5070
5071         .sched_in = svm_sched_in,
5072
5073         .pmu_ops = &amd_pmu_ops,
5074         .deliver_posted_interrupt = svm_deliver_avic_intr,
5075 };
5076
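/*
 * Module init/exit: register the SVM callbacks with the KVM core, telling
 * it how large (and how aligned) each struct vcpu_svm needs to be so that
 * vcpu allocation is handled generically.
 */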
5077 static int __init svm_init(void)
5078 {
5079         return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
5080                         __alignof__(struct vcpu_svm), THIS_MODULE);
5081 }
5082
5083 static void __exit svm_exit(void)
5084 {
5085         kvm_exit();
5086 }
5087
5088 module_init(svm_init)
5089 module_exit(svm_exit)