KVM: add "new" argument to kvm_arch_commit_memory_region

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed31c31b2485b1e06476f93ab7934b216ed1cacc..ba7b0cc52fedad6d2bc2fa594a432dfaa88d90b1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -572,8 +572,7 @@ out:
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
-       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
-                                   X86_CR0_CD | X86_CR0_NW;
+       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 
        cr0 |= X86_CR0_ET;
 
@@ -702,8 +701,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
-       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
-                                  X86_CR4_PAE | X86_CR4_SMEP;
+       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+                                  X86_CR4_SMEP | X86_CR4_SMAP;
+
        if (cr4 & CR4_RESERVED_BITS)
                return 1;
 
@@ -744,9 +744,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
-       if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
-               update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
-
        if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
                kvm_update_cpuid(vcpu);
 
@@ -1669,12 +1666,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                &guest_hv_clock, sizeof(guest_hv_clock))))
                return 0;
 
-       /*
-        * The interface expects us to write an even number signaling that the
-        * update is finished. Since the guest won't see the intermediate
-        * state, we just increase by 2 at the end.
+       /* This VCPU is paused, but it's legal for a guest to read another
+        * VCPU's kvmclock, so we really have to follow the specification where
+        * it says that version is odd if data is being modified, and even after
+        * it is consistent.
+        *
+        * Version field updates must be kept separate.  This is because
+        * kvm_write_guest_cached might use a "rep movs" instruction, and
+        * writes within a string instruction are weakly ordered.  So there
+        * are three writes overall.
+        *
+        * As a small optimization, only write the version field in the first
+        * and third write.  The vcpu->pv_time cache is still valid, because the
+        * version field is the first in the struct.
         */
-       vcpu->hv_clock.version = guest_hv_clock.version + 2;
+       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+       vcpu->hv_clock.version = guest_hv_clock.version + 1;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+
+       smp_wmb();
 
        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
        pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1708,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
                                &vcpu->hv_clock,
                                sizeof(vcpu->hv_clock));
+
+       smp_wmb();
+
+       vcpu->hv_clock.version++;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
        return 0;
 }
 
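
The update above follows the pvclock convention spelled out in the new
comment: version is odd while the data is being modified and even once it is
consistent, with a write barrier between each of the three writes.  A minimal
self-contained sketch of the same idiom, using C11 fences in place of
smp_wmb() and purely illustrative names (clock_page, publish, consume):

#include <stdatomic.h>
#include <stdint.h>

struct clock_page {
        _Atomic uint32_t version;       /* odd while an update is in flight */
        _Atomic uint64_t tsc_timestamp; /* stand-in for the pvclock payload */
};

/* Writer: the three ordered writes performed by the hunk above. */
static void publish(struct clock_page *p, uint64_t tsc)
{
        uint32_t v = atomic_load_explicit(&p->version, memory_order_relaxed);

        atomic_store_explicit(&p->version, v + 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* first smp_wmb() */
        atomic_store_explicit(&p->tsc_timestamp, tsc, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* second smp_wmb() */
        atomic_store_explicit(&p->version, v + 2, memory_order_relaxed);
}

/* Reader: retry until an even, unchanged version brackets the data read. */
static uint64_t consume(struct clock_page *p)
{
        uint32_t v;
        uint64_t tsc;

        do {
                v = atomic_load_explicit(&p->version, memory_order_acquire);
                tsc = atomic_load_explicit(&p->tsc_timestamp,
                                           memory_order_relaxed);
                atomic_thread_fence(memory_order_acquire);
        } while ((v & 1) ||
                 v != atomic_load_explicit(&p->version, memory_order_relaxed));

        return tsc;
}

int main(void)
{
        struct clock_page page = { 0 };

        publish(&page, 123456789ULL);
        return consume(&page) != 123456789ULL;  /* exits 0 on success */
}
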
@@ -1831,6 +1851,63 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
 
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+       struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state;
+       unsigned char mtrr_enabled = mtrr_state->enabled; /* bit 0: fixed-range enable, bit 1: MTRR enable */
+       gfn_t start, end, mask;
+       int index;
+       bool is_fixed = true;
+
+       if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+             !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+               return;
+
+       if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType)
+               return;
+
+       switch (msr) {
+       case MSR_MTRRfix64K_00000:
+               start = 0x0;
+               end = 0x80000;
+               break;
+       case MSR_MTRRfix16K_80000:
+               start = 0x80000;
+               end = 0xa0000;
+               break;
+       case MSR_MTRRfix16K_A0000:
+               start = 0xa0000;
+               end = 0xc0000;
+               break;
+       case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+               index = msr - MSR_MTRRfix4K_C0000;
+               start = 0xc0000 + index * (32 << 10);
+               end = start + (32 << 10);
+               break;
+       case MSR_MTRRdefType:
+               is_fixed = false;
+               start = 0x0;
+               end = ~0ULL;
+               break;
+       default:
+               /* variable range MTRRs. */
+               is_fixed = false;
+               index = (msr - 0x200) / 2;
+               start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) +
+                      (mtrr_state->var_ranges[index].base_lo & PAGE_MASK);
+               mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) +
+                      (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK);
+               mask |= ~0ULL << cpuid_maxphyaddr(vcpu);
+
+               end = ((start & mask) | ~mask) + 1;
+       }
+
+       if (is_fixed && !(mtrr_enabled & 0x1))
+               return;
+
+       kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
@@ -1864,7 +1941,7 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                *pt = data;
        }
 
-       kvm_mmu_reset_context(vcpu);
+       update_mtrr(vcpu, msr);
        return 0;
 }
 
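
The variable-range branch of update_mtrr() converts an MTRR PHYSBASE/PHYSMASK
pair into a [start, end) physical range via end = ((start & mask) | ~mask) + 1.
A worked example with made-up register values (only the formula comes from the
patch; maxphyaddr = 36 is just a plausible CPUID value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int maxphyaddr = 36;            /* illustrative cpuid_maxphyaddr() result */
        uint64_t base = 0x080000000ULL; /* PHYSBASE: range starts at 2GB */
        uint64_t mask = 0xff0000000ULL; /* PHYSMASK: 256MB-aligned, 256MB span */

        mask |= ~0ULL << maxphyaddr;    /* extend the mask above maxphyaddr */

        uint64_t start = base & mask;
        uint64_t end = ((start & mask) | ~mask) + 1;

        /* prints: range [0x80000000, 0x90000000), i.e. 256MB at 2GB */
        printf("range [%#llx, %#llx)\n",
               (unsigned long long)start, (unsigned long long)end);
        return 0;
}
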
@@ -2777,6 +2854,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_TIME:
        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
        case KVM_CAP_TSC_DEADLINE_TIMER:
+       case KVM_CAP_ENABLE_CAP_VM:
+       case KVM_CAP_DISABLE_QUIRKS:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -3824,6 +3903,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
        return 0;
 }
 
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+                                  struct kvm_enable_cap *cap)
+{
+       int r;
+
+       if (cap->flags)
+               return -EINVAL;
+
+       switch (cap->cap) {
+       case KVM_CAP_DISABLE_QUIRKS:
+               kvm->arch.disabled_quirks = cap->args[0];
+               r = 0;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -4076,7 +4175,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_ENABLE_CAP: {
+               struct kvm_enable_cap cap;
 
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       goto out;
+               r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+               break;
+       }
        default:
                r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
        }
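
From userspace the new path is reached through the generic KVM_ENABLE_CAP
ioctl, now accepted on the VM file descriptor.  A minimal sketch, assuming
vm_fd came from KVM_CREATE_VM; disable_quirks() and quirk_mask are
illustrative names, while the struct and ioctl come from <linux/kvm.h>:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int disable_quirks(int vm_fd, __u64 quirk_mask)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_DISABLE_QUIRKS;
        cap.args[0] = quirk_mask;       /* stored in kvm->arch.disabled_quirks */

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap); /* 0 on success, -1/errno on error */
}
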
@@ -5931,6 +6038,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
        lapic_irq.shorthand = 0;
        lapic_irq.dest_mode = 0;
        lapic_irq.dest_id = apicid;
+       lapic_irq.msi_redir_hint = false;
 
        lapic_irq.delivery_mode = APIC_DM_REMRD;
        kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6174,6 +6282,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
                return;
 
        page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+       if (is_error_page(page))
+               return;
        kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
 
        /*
@@ -6324,7 +6434,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (req_immediate_exit)
                smp_send_reschedule(vcpu->cpu);
 
-       kvm_guest_enter();
+       __kvm_guest_enter();
 
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
@@ -6980,7 +7090,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
        return 0;
 }
 
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
 {
        int err;
 
@@ -6988,7 +7098,9 @@ int fx_init(struct kvm_vcpu *vcpu)
        if (err)
                return err;
 
-       fpu_finit(&vcpu->arch.guest_fpu);
+       if (!init_event)
+               fpu_finit(&vcpu->arch.guest_fpu);
+
        if (cpu_has_xsaves)
                vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
                        host_xcr0 | XSTATE_COMPACTION_ENABLED;
@@ -7030,14 +7142,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
        kvm_put_guest_xcr0(vcpu);
 
-       if (!vcpu->guest_fpu_loaded)
+       if (!vcpu->guest_fpu_loaded) {
+               vcpu->fpu_counter = 0;
                return;
+       }
 
        vcpu->guest_fpu_loaded = 0;
        fpu_save_init(&vcpu->arch.guest_fpu);
        __kernel_fpu_end();
        ++vcpu->stat.fpu_reload;
-       kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+       /*
+        * If using eager FPU mode, or if the guest is a frequent user
+        * of the FPU, just leave the FPU active for next time.
+        * fpu_counter is a u8 and eventually wraps to 0, so a guest that
+        * uses the FPU in bursts will revert to loading it on demand.
+        */
+       if (!vcpu->arch.eager_fpu) {
+               if (++vcpu->fpu_counter < 5)
+                       kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+       }
        trace_kvm_fpu(0);
 }
 
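
The counter's effect in lazy mode: the first few puts after an FPU use still
request deactivation, and only a guest that keeps using the FPU earns an
always-resident FPU.  A hypothetical stand-alone model (the threshold of 5 and
the u8 wrap-around match the patch; everything else is scaffolding):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool deactivate_after_put(uint8_t *fpu_counter, bool eager_fpu)
{
        if (eager_fpu)
                return false;           /* eager mode: FPU always stays loaded */
        return ++(*fpu_counter) < 5;    /* lazy mode: deactivate the first few times */
}

int main(void)
{
        uint8_t counter = 0;

        /* Puts 1-4 request deactivation; from the 5th consecutive use on,
         * the FPU stays loaded until the u8 counter wraps back to zero. */
        for (int i = 1; i <= 7; i++)
                printf("put %d -> deactivate=%d\n", i,
                       deactivate_after_put(&counter, false));
        return 0;
}
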
@@ -7053,11 +7176,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                                                unsigned int id)
 {
+       struct kvm_vcpu *vcpu;
+
        if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
                printk_once(KERN_WARNING
                "kvm: SMP vm created on host with unstable TSC; "
                "guest TSC will not be reliable\n");
-       return kvm_x86_ops->vcpu_create(kvm, id);
+
+       vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+       /*
+        * Activate fpu unconditionally in case the guest needs eager FPU.  It will be
+        * deactivated soon if it doesn't.
+        */
+       kvm_x86_ops->fpu_activate(vcpu);
+       return vcpu;
 }
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -7068,7 +7201,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        r = vcpu_load(vcpu);
        if (r)
                return r;
-       kvm_vcpu_reset(vcpu);
+       kvm_vcpu_reset(vcpu, false);
        kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
 
@@ -7106,7 +7239,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
@@ -7133,13 +7266,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
        kvm_async_pf_hash_reset(vcpu);
        vcpu->arch.apf.halted = false;
 
-       kvm_pmu_reset(vcpu);
+       if (!init_event)
+               kvm_pmu_reset(vcpu);
 
        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
-       kvm_x86_ops->vcpu_reset(vcpu);
+       kvm_x86_ops->vcpu_reset(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7328,7 +7462,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                goto fail_free_mce_banks;
        }
 
-       r = fx_init(vcpu);
+       r = fx_init(vcpu, false);
        if (r)
                goto fail_free_wbinvd_dirty_mask;
 
@@ -7340,6 +7474,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+       vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
        kvm_async_pf_hash_reset(vcpu);
        kvm_pmu_init(vcpu);
 
@@ -7553,7 +7689,7 @@ out_free:
        return -ENOMEM;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
        /*
         * memslots->generation has been incremented.
@@ -7564,7 +7700,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm)
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                enum kvm_mr_change change)
 {
        /*
@@ -7642,14 +7778,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem,
+                               const struct kvm_userspace_memory_region *mem,
                                const struct kvm_memory_slot *old,
+                               const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
-       struct kvm_memory_slot *new;
        int nr_mmu_pages = 0;
 
-       if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+       if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
                int ret;
 
                ret = vm_munmap(old->userspace_addr,
@@ -7666,9 +7802,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        if (nr_mmu_pages)
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 
-       /* It's OK to get 'new' slot here as it has already been installed */
-       new = id_to_memslot(kvm->memslots, mem->slot);
-
        /*
         * Dirty logging tracks sptes in 4k granularity, meaning that large
         * sptes have to be split.  If live migration is successful, the guest
@@ -7693,9 +7826,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         * been zapped, so no dirty logging work is needed for the old slot. For
         * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
         * new and it's also covered when dealing with the new slot.
+        *
+        * FIXME: const-ify all uses of struct kvm_memory_slot.
         */
        if (change != KVM_MR_DELETE)
-               kvm_mmu_slot_apply_flags(kvm, new);
+               kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)