};
/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
+#define KVM_MEM_READONLY (1UL << 1)
This ioctl allows the user to create or modify a guest physical memory
slot. When changing an existing slot, it may be moved in the guest
be identical. This allows large pages in the guest to be backed by large
pages in the host.
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
-instructs kvm to keep track of writes to memory within the slot. See
-the KVM_GET_DIRTY_LOG ioctl.
+The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs
+kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG
+ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the
+KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only
+allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO
+exits.
-When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
-region are automatically reflected into the guest. For example, an mmap()
-that affects the region will be made visible immediately. Another example
-is madvise(MADV_DROP).
+When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
+the memory region are automatically reflected into the guest. For example, an
+mmap() that affects the region will be made visible immediately. Another
+example is madvise(MADV_DROP).
It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
and kvm_irqfd.gsi.
+With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify
+mechanism allowing emulation of level-triggered, irqfd-based
+interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an
+additional eventfd in the kvm_irqfd.resamplefd field. When operating
+in resample mode, posting of an interrupt through kvm_irq.fd asserts
+the specified gsi in the irqchip. When the irqchip is resampled, such
+as from an EOI, the gsi is de-asserted and the user is notifed via
+kvm_irqfd.resamplefd. It is the user's responsibility to re-queue
+the interrupt if the device making use of it still requires service.
+Note that closing the resamplefd is not sufficient to disable the
+irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
+and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
+
4.76 KVM_PPC_ALLOCATE_HTAB
Capability: KVM_CAP_PPC_ALLOC_HTAB
--- /dev/null
+Linux KVM Hypercall:
+===================
+X86:
+ KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall
+ instruction. The hypervisor can replace it with instructions that are
+ guaranteed to be supported.
+
+ Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ The hypercall number should be placed in rax and the return value will be
+ placed in rax. No other registers will be clobbered unless explicitly stated
+ by the particular hypercall.
+
+S390:
+ R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall
+ number. The return value is written to R2.
+
+ S390 uses diagnose instruction as hypercall (0x500) along with hypercall
+ number in R1.
+
+ PowerPC:
+ It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers.
+ Return value is placed in R3.
+
+ KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions'
+ property inside the device tree's /hypervisor node.
+ For more information refer to Documentation/virtual/kvm/ppc-pv.txt
+
+KVM Hypercalls Documentation
+===========================
+The template for each hypercall is:
+1. Hypercall name.
+2. Architecture(s)
+3. Status (deprecated, obsolete, active)
+4. Purpose
+
+1. KVM_HC_VAPIC_POLL_IRQ
+------------------------
+Architecture: x86
+Status: active
+Purpose: Trigger guest exit so that the host can check for pending
+interrupts on reentry.
+
+2. KVM_HC_MMU_OP
+------------------------
+Architecture: x86
+Status: deprecated.
+Purpose: Support MMU operations such as writing to PTE,
+flushing TLB, release PT.
+
+3. KVM_HC_FEATURES
+------------------------
+Architecture: PPC
+Status: active
+Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid
+used to enumerate which hypercalls are available. On PPC, either device tree
+based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration
+mechanism (which is this hypercall) can be used.
+
+4. KVM_HC_PPC_MAP_MAGIC_PAGE
+------------------------
+Architecture: PPC
+Status: active
+Purpose: To enable communication between the hypervisor and guest there is a
+shared page that contains parts of supervisor visible register state.
+The guest can map this shared page to access its supervisor register through
+memory using this hypercall.
time information and check that they are both equal and even.
An odd version indicates an in-progress update.
- sec: number of seconds for wallclock.
+ sec: number of seconds for wallclock at time of boot.
- nsec: number of nanoseconds for wallclock.
+ nsec: number of nanoseconds for wallclock at time of boot.
+
+ In order to get the current wallclock time, the system_time from
+ MSR_KVM_SYSTEM_TIME_NEW needs to be added.
Note that although MSRs are per-CPU entities, the effect of this
particular MSR is global.
time at the time this structure was last updated. Unit is
nanoseconds.
- tsc_to_system_mul: a function of the tsc frequency. One has
- to multiply any tsc-related quantity by this value to get
- a value in nanoseconds, besides dividing by 2^tsc_shift
+ tsc_to_system_mul: multiplier to be used when converting
+ tsc-related quantity to nanoseconds
- tsc_shift: cycle to nanosecond divider, as a power of two, to
- allow for shift rights. One has to shift right any tsc-related
- quantity by this value to get a value in nanoseconds, besides
- multiplying by tsc_to_system_mul.
+ tsc_shift: shift to be used when converting tsc-related
+ quantity to nanoseconds. This shift will ensure that
+ multiplication with tsc_to_system_mul does not overflow.
+ A positive value denotes a left shift, a negative value
+ a right shift.
- With this information, guests can derive per-CPU time by
- doing:
+ The conversion from tsc to nanoseconds involves an additional
+ right shift by 32 bits. With this information, guests can
+ derive per-CPU time by doing:
time = (current_tsc - tsc_timestamp)
- time = (time * tsc_to_system_mul) >> tsc_shift
+ if (tsc_shift >= 0)
+ time <<= tsc_shift;
+ else
+ time >>= -tsc_shift;
+ time = (time * tsc_to_system_mul) >> 32
time = time + system_time
flags: bits in this field indicate extended capabilities
That way we can inject an arbitrary amount of code as replacement for a single
instruction. This allows us to check for pending interrupts when setting EE=1
for example.
+
+Hypercall ABIs in KVM on PowerPC
+=================================
+1) KVM hypercalls (ePAPR)
+
+These are ePAPR compliant hypercall implementation (mentioned above). Even
+generic hypercalls are implemented here, like the ePAPR idle hcall. These are
+available on all targets.
+
+2) PAPR hypercalls
+
+PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU).
+These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of
+them are handled in the kernel, some are handled in user space. This is only
+available on book3s_64.
+
+3) OSI hypercalls
+
+Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long
+before KVM). This is supported to maintain compatibility. All these hypercalls get
+forwarded to user space. This is only useful on book3s_32, but can be used with
+book3s_64 as well.
return 0;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
goto out;
}
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip chip;
return;
}
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
kvm_flush_remote_tlbs(kvm);
}
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_arch_flush_shadow_all();
+}
+
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm;
extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+ unsigned long start, unsigned long end);
extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
#define KVMPPC_GOT_PAGE 0x80
struct kvm_arch_memory_slot {
+ unsigned long *rmap;
};
struct kvm_arch {
if (is_error_page(new_page)) {
printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
(unsigned long long)gfn);
- kvm_release_page_clean(new_page);
return;
}
hpaddr = page_to_phys(new_page);
goto out_unlock;
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
- rmap = &memslot->rmap[gfn - memslot->base_gfn];
+ rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
lock_rmap(rmap);
/* Check if we might have been invalidated; let the guest retry if so */
goto out_put;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ unsigned long gfn))
{
int ret;
int retval = 0;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn, gfn_end;
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn, gfn+1, ..., gfn_end-1}.
+ */
+ gfn = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for (; gfn < gfn_end; ++gfn) {
+ gfn_t gfn_offset = gfn - memslot->base_gfn;
- ret = handler(kvm, &memslot->rmap[gfn_offset],
- memslot->base_gfn + gfn_offset);
+ ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
retval |= ret;
}
}
return retval;
}
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
return 0;
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ if (kvm->arch.using_mmu_notifiers)
+ kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+ return 0;
+}
+
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
unsigned long *rmapp, *map;
preempt_disable();
- rmapp = memslot->rmap;
+ rmapp = memslot->arch.rmap;
map = memslot->dirty_bitmap;
for (i = 0; i < memslot->npages; ++i) {
if (kvm_test_clear_dirty(kvm, rmapp))
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
return;
- rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
+ rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
lock_rmap(rmap);
head = *rmap & KVMPPC_RMAP_INDEX;
if (!slot_is_aligned(memslot, psize))
return H_PARAMETER;
slot_fn = gfn - memslot->base_gfn;
- rmap = &memslot->rmap[slot_fn];
+ rmap = &memslot->arch.rmap[slot_fn];
if (!kvm->arch.using_mmu_notifiers) {
physp = kvm->arch.slot_phys[memslot->id];
pa &= PAGE_MASK;
} else {
/* Translate to host virtual address */
- hva = gfn_to_hva_memslot(memslot, gfn);
+ hva = __gfn_to_hva_memslot(memslot, gfn);
/* Look up the Linux PTE for the backing page */
pte_size = psize;
int i;
hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
- if (is_error_page(hpage)) {
- kvm_release_page_clean(hpage);
+ if (is_error_page(hpage))
return;
- }
hpage_offset = pte->raddr & ~PAGE_MASK;
hpage_offset &= ~0xFFFULL;
if (likely(!pfnmap)) {
unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
- pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_pfn(pfn)) {
printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
(long)gfn);
- kvm_release_pfn_clean(pfn);
return;
}
void kvm_arch_free_memslot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
+ if (!dont || free->arch.rmap != dont->arch.rmap) {
+ vfree(free->arch.rmap);
+ free->arch.rmap = NULL;
+ }
}
int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
+ slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+ if (!slot->arch.rmap)
+ return -ENOMEM;
+
return 0;
}
kvmppc_core_commit_memory_region(kvm, mem);
}
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
{
}
extern void show_code(struct pt_regs *regs);
extern void print_fn_code(unsigned char *code, unsigned long len);
+extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
return NULL;
}
+/**
+ * insn_to_mnemonic - decode an s390 instruction
+ * @instruction: instruction to decode
+ * @buf: buffer to fill with mnemonic
+ *
+ * Decode the instruction at @instruction and store the corresponding
+ * mnemonic into @buf.
+ * @buf is left unchanged if the instruction could not be decoded.
+ * Returns:
+ * %0 on success, %-ENOENT if the instruction was not found.
+ */
+int insn_to_mnemonic(unsigned char *instruction, char buf[8])
+{
+ struct insn *insn;
+
+ insn = find_insn(instruction);
+ if (!insn)
+ return -ENOENT;
+ if (insn->name[0] == '\0')
+ snprintf(buf, sizeof(buf), "%s",
+ long_insn_name[(int) insn->name[1]]);
+ else
+ snprintf(buf, sizeof(buf), "%.5s", insn->name);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(insn_to_mnemonic);
+
static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
{
struct insn *insn;
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include "kvm-s390.h"
+#include "trace.h"
+#include "trace-s390.h"
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
vcpu->run->exit_reason = KVM_EXIT_S390_RESET;
VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx",
vcpu->run->s390_reset_flags);
+ trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags);
return -EREMOTE;
}
{
int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
+ trace_kvm_s390_handle_diag(vcpu, code);
switch (code) {
case 0x10:
return diag_release_pages(vcpu);
#include "kvm-s390.h"
#include "gaccess.h"
+#include "trace.h"
+#include "trace-s390.h"
static int handle_lctlg(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
disp2);
+ trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
do {
rc = get_guest_u64(vcpu, useraddr,
VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
disp2);
+ trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
reg = reg1;
do {
vcpu->stat.exit_stop_request++;
spin_lock_bh(&vcpu->arch.local_int.lock);
+ trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
+
if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
rc = SIE_INTERCEPT_RERUNVCPU;
int rc;
vcpu->stat.exit_validity++;
+ trace_kvm_s390_intercept_validity(vcpu, viwhy);
if (viwhy == 0x37) {
vmaddr = gmap_fault(vcpu->arch.sie_block->prefix,
vcpu->arch.gmap);
intercept_handler_t handler;
vcpu->stat.exit_instruction++;
+ trace_kvm_s390_intercept_instruction(vcpu,
+ vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
if (handler)
return handler(vcpu);
static int handle_prog(struct kvm_vcpu *vcpu)
{
vcpu->stat.exit_program_interruption++;
+ trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
}
#include <asm/uaccess.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "trace-s390.h"
static int psw_extint_disabled(struct kvm_vcpu *vcpu)
{
case KVM_S390_INT_EMERGENCY:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
vcpu->stat.deliver_emergency_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->emerg.code, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
if (rc == -EFAULT)
exception = 1;
case KVM_S390_INT_EXTERNAL_CALL:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
vcpu->stat.deliver_external_call++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->extcall.code, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
if (rc == -EFAULT)
exception = 1;
VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
inti->ext.ext_params);
vcpu->stat.deliver_service_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->ext.ext_params, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
if (rc == -EFAULT)
exception = 1;
VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
inti->ext.ext_params, inti->ext.ext_params2);
vcpu->stat.deliver_virtio_interrupt++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->ext.ext_params,
+ inti->ext.ext_params2);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
if (rc == -EFAULT)
exception = 1;
case KVM_S390_SIGP_STOP:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
vcpu->stat.deliver_stop_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ 0, 0);
__set_intercept_indicator(vcpu, inti);
break;
VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
inti->prefix.address);
vcpu->stat.deliver_prefix_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->prefix.address, 0);
kvm_s390_set_prefix(vcpu, inti->prefix.address);
break;
case KVM_S390_RESTART:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
vcpu->stat.deliver_restart_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ 0, 0);
rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
if (rc == -EFAULT)
inti->pgm.code,
table[vcpu->arch.sie_block->ipa >> 14]);
vcpu->stat.deliver_program_int++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->pgm.code, 0);
rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
if (rc == -EFAULT)
exception = 1;
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_bh(&vcpu->arch.local_int.lock);
spin_unlock(&vcpu->arch.local_int.float_int->lock);
- vcpu_put(vcpu);
schedule();
- vcpu_load(vcpu);
spin_lock(&vcpu->arch.local_int.float_int->lock);
spin_lock_bh(&vcpu->arch.local_int.lock);
}
inti->pgm.code = code;
VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
+ trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1);
spin_lock_bh(&li->lock);
list_add(&inti->list, &li->list);
atomic_set(&li->active, 1);
kfree(inti);
return -EINVAL;
}
+ trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
+ 2);
mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
kfree(inti);
return -EINVAL;
}
+ trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm,
+ s390int->parm64, 2);
mutex_lock(&vcpu->kvm->lock);
li = &vcpu->arch.local_int;
#include "kvm-s390.h"
#include "gaccess.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#include "trace-s390.h"
+
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
struct kvm_stats_debugfs_item debugfs_entries[] = {
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+ trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
if (!kvm_is_ucontrol(vcpu->kvm)) {
clear_bit(63 - vcpu->vcpu_id,
(unsigned long *) &vcpu->kvm->arch.sca->mcn);
goto out_free_sie_block;
VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
vcpu->arch.sie_block);
+ trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
return vcpu;
out_free_sie_block:
local_irq_enable();
VCPU_EVENT(vcpu, 6, "entering sie flags %x",
atomic_read(&vcpu->arch.sie_block->cpuflags));
+ trace_kvm_s390_sie_enter(vcpu,
+ atomic_read(&vcpu->arch.sie_block->cpuflags));
rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
if (rc) {
if (kvm_is_ucontrol(vcpu->kvm)) {
rc = SIE_INTERCEPT_UCONTROL;
} else {
VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+ trace_kvm_s390_sie_fault(vcpu);
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
rc = 0;
}
}
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
+ trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
local_irq_disable();
kvm_guest_exit();
local_irq_enable();
return;
}
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
{
}
#include <asm/sysinfo.h>
#include "gaccess.h"
#include "kvm-s390.h"
+#include "trace.h"
static int handle_set_prefix(struct kvm_vcpu *vcpu)
{
kvm_s390_set_prefix(vcpu, address);
VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 1, address);
out:
return 0;
}
}
VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 0, address);
out:
return 0;
}
}
VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
+ trace_kvm_s390_handle_stap(vcpu, useraddr);
out:
return 0;
}
&facility_list, sizeof(facility_list));
if (rc == -EFAULT)
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- else
+ else {
VCPU_EVENT(vcpu, 5, "store facility list value %x",
facility_list);
+ trace_kvm_s390_handle_stfl(vcpu, facility_list);
+ }
return 0;
}
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
goto out_mem;
}
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
free_page(mem);
vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
vcpu->run->s.regs.gprs[0] = 0;
#include <asm/sigp.h>
#include "gaccess.h"
#include "kvm-s390.h"
+#include "trace.h"
static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
u64 *reg)
else
parameter = vcpu->run->s.regs.gprs[r1 + 1];
+ trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter);
switch (order_code) {
case SIGP_SENSE:
vcpu->stat.instruction_sigp_sense++;
--- /dev/null
+#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMS390_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm-s390
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace-s390
+
+/*
+ * Trace point for the creation of the kvm instance.
+ */
+TRACE_EVENT(kvm_s390_create_vm,
+ TP_PROTO(unsigned long type),
+ TP_ARGS(type),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, type)
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ ),
+
+ TP_printk("create vm%s",
+ __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "")
+ );
+
+/*
+ * Trace points for creation and destruction of vpcus.
+ */
+TRACE_EVENT(kvm_s390_create_vcpu,
+ TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu,
+ struct kvm_s390_sie_block *sie_block),
+ TP_ARGS(id, vcpu, sie_block),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ __field(struct kvm_vcpu *, vcpu)
+ __field(struct kvm_s390_sie_block *, sie_block)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->vcpu = vcpu;
+ __entry->sie_block = sie_block;
+ ),
+
+ TP_printk("create cpu %d at %p, sie block at %p", __entry->id,
+ __entry->vcpu, __entry->sie_block)
+ );
+
+TRACE_EVENT(kvm_s390_destroy_vcpu,
+ TP_PROTO(unsigned int id),
+ TP_ARGS(id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ ),
+
+ TP_printk("destroy cpu %d", __entry->id)
+ );
+
+/*
+ * Trace points for injection of interrupts, either per machine or
+ * per vcpu.
+ */
+
+#define kvm_s390_int_type \
+ {KVM_S390_SIGP_STOP, "sigp stop"}, \
+ {KVM_S390_PROGRAM_INT, "program interrupt"}, \
+ {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \
+ {KVM_S390_RESTART, "sigp restart"}, \
+ {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \
+ {KVM_S390_INT_SERVICE, "sclp interrupt"}, \
+ {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \
+ {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"}
+
+TRACE_EVENT(kvm_s390_inject_vm,
+ TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who),
+ TP_ARGS(type, parm, parm64, who),
+
+ TP_STRUCT__entry(
+ __field(__u32, inttype)
+ __field(__u32, parm)
+ __field(__u64, parm64)
+ __field(int, who)
+ ),
+
+ TP_fast_assign(
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->parm = parm;
+ __entry->parm64 = parm64;
+ __entry->who = who;
+ ),
+
+ TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx",
+ (__entry->who == 1) ? " (from kernel)" :
+ (__entry->who == 2) ? " (from user)" : "",
+ __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->parm, __entry->parm64)
+ );
+
+TRACE_EVENT(kvm_s390_inject_vcpu,
+ TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \
+ int who),
+ TP_ARGS(id, type, parm, parm64, who),
+
+ TP_STRUCT__entry(
+ __field(int, id)
+ __field(__u32, inttype)
+ __field(__u32, parm)
+ __field(__u64, parm64)
+ __field(int, who)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->parm = parm;
+ __entry->parm64 = parm64;
+ __entry->who = who;
+ ),
+
+ TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx",
+ (__entry->who == 1) ? " (from kernel)" :
+ (__entry->who == 2) ? " (from user)" : "",
+ __entry->id, __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->parm, __entry->parm64)
+ );
+
+/*
+ * Trace point for the actual delivery of interrupts.
+ */
+TRACE_EVENT(kvm_s390_deliver_interrupt,
+ TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1),
+ TP_ARGS(id, type, data0, data1),
+
+ TP_STRUCT__entry(
+ __field(int, id)
+ __field(__u32, inttype)
+ __field(__u32, data0)
+ __field(__u64, data1)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->data0 = data0;
+ __entry->data1 = data1;
+ ),
+
+ TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
+ "data:%08x %016llx",
+ __entry->id, __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->data0, __entry->data1)
+ );
+
+/*
+ * Trace point for resets that may be requested from userspace.
+ */
+TRACE_EVENT(kvm_s390_request_resets,
+ TP_PROTO(__u64 resets),
+ TP_ARGS(resets),
+
+ TP_STRUCT__entry(
+ __field(__u64, resets)
+ ),
+
+ TP_fast_assign(
+ __entry->resets = resets;
+ ),
+
+ TP_printk("requesting userspace resets %llx",
+ __entry->resets)
+ );
+
+/*
+ * Trace point for a vcpu's stop requests.
+ */
+TRACE_EVENT(kvm_s390_stop_request,
+ TP_PROTO(unsigned int action_bits),
+ TP_ARGS(action_bits),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, action_bits)
+ ),
+
+ TP_fast_assign(
+ __entry->action_bits = action_bits;
+ ),
+
+ TP_printk("stop request, action_bits = %08x",
+ __entry->action_bits)
+ );
+
+
+#endif /* _TRACE_KVMS390_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- /dev/null
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/sigp.h>
+#include <asm/debug.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Helpers for vcpu-specific tracepoints containing the same information
+ * as s390dbf VCPU_EVENTs.
+ */
+#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu
+#define VCPU_ARGS_COMMON vcpu
+#define VCPU_FIELD_COMMON __field(int, id) \
+ __field(unsigned long, pswmask) \
+ __field(unsigned long, pswaddr)
+#define VCPU_ASSIGN_COMMON do { \
+ __entry->id = vcpu->vcpu_id; \
+ __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \
+ __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \
+ } while (0);
+#define VCPU_TP_PRINTK(p_str, p_args...) \
+ TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \
+ __entry->pswmask, __entry->pswaddr, p_args)
+
+/*
+ * Tracepoints for SIE entry and exit.
+ */
+TRACE_EVENT(kvm_s390_sie_enter,
+ TP_PROTO(VCPU_PROTO_COMMON, int cpuflags),
+ TP_ARGS(VCPU_ARGS_COMMON, cpuflags),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, cpuflags)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->cpuflags = cpuflags;
+ ),
+
+ VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags)
+ );
+
+TRACE_EVENT(kvm_s390_sie_fault,
+ TP_PROTO(VCPU_PROTO_COMMON),
+ TP_ARGS(VCPU_ARGS_COMMON),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ ),
+
+ VCPU_TP_PRINTK("%s", "fault in sie instruction")
+ );
+
+#define sie_intercept_code \
+ {0x04, "Instruction"}, \
+ {0x08, "Program interruption"}, \
+ {0x0C, "Instruction and program interuption"}, \
+ {0x10, "External request"}, \
+ {0x14, "External interruption"}, \
+ {0x18, "I/O request"}, \
+ {0x1C, "Wait state"}, \
+ {0x20, "Validity"}, \
+ {0x28, "Stop request"}
+
+TRACE_EVENT(kvm_s390_sie_exit,
+ TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
+ TP_ARGS(VCPU_ARGS_COMMON, icptcode),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u8, icptcode)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->icptcode = icptcode;
+ ),
+
+ VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode,
+ __print_symbolic(__entry->icptcode,
+ sie_intercept_code))
+ );
+
+/*
+ * Trace point for intercepted instructions.
+ */
+TRACE_EVENT(kvm_s390_intercept_instruction,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ __field(char, insn[8])
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
+ __entry->instruction,
+ insn_to_mnemonic((unsigned char *)
+ &__entry->instruction,
+ __entry->insn) ?
+ "unknown" : __entry->insn)
+ );
+
+/*
+ * Trace point for intercepted program interruptions.
+ */
+TRACE_EVENT(kvm_s390_intercept_prog,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+ TP_ARGS(VCPU_ARGS_COMMON, code),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, code)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ ),
+
+ VCPU_TP_PRINTK("intercepted program interruption %04x",
+ __entry->code)
+ );
+
+/*
+ * Trace point for validity intercepts.
+ */
+TRACE_EVENT(kvm_s390_intercept_validity,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy),
+ TP_ARGS(VCPU_ARGS_COMMON, viwhy),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, viwhy)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->viwhy = viwhy;
+ ),
+
+ VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy)
+ );
+
+/*
+ * Trace points for instructions that are of special interest.
+ */
+
+#define sigp_order_codes \
+ {SIGP_SENSE, "sense"}, \
+ {SIGP_EXTERNAL_CALL, "external call"}, \
+ {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \
+ {SIGP_STOP, "stop"}, \
+ {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \
+ {SIGP_SET_ARCHITECTURE, "set architecture"}, \
+ {SIGP_SET_PREFIX, "set prefix"}, \
+ {SIGP_SENSE_RUNNING, "sense running"}, \
+ {SIGP_RESTART, "restart"}
+
+TRACE_EVENT(kvm_s390_handle_sigp,
+ TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
+ __u32 parameter),
+ TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u8, order_code)
+ __field(__u16, cpu_addr)
+ __field(__u32, parameter)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->order_code = order_code;
+ __entry->cpu_addr = cpu_addr;
+ __entry->parameter = parameter;
+ ),
+
+ VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \
+ "parameter %08x", __entry->order_code,
+ __print_symbolic(__entry->order_code,
+ sigp_order_codes),
+ __entry->cpu_addr, __entry->parameter)
+ );
+
+#define diagnose_codes \
+ {0x10, "release pages"}, \
+ {0x44, "time slice end"}, \
+ {0x308, "ipl functions"}, \
+ {0x500, "kvm hypercall"}, \
+ {0x501, "kvm breakpoint"}
+
+TRACE_EVENT(kvm_s390_handle_diag,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+ TP_ARGS(VCPU_ARGS_COMMON, code),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, code)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ ),
+
+ VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code,
+ __print_symbolic(__entry->code, diagnose_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_lctl,
+ TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, g)
+ __field(int, reg1)
+ __field(int, reg3)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->g = g;
+ __entry->reg1 = reg1;
+ __entry->reg3 = reg3;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx",
+ __entry->g ? "lctlg" : "lctl",
+ __entry->reg1, __entry->reg3, __entry->addr)
+ );
+
+TRACE_EVENT(kvm_s390_handle_prefix,
+ TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address),
+ TP_ARGS(VCPU_ARGS_COMMON, set, address),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, set)
+ __field(u32, address)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->set = set;
+ __entry->address = address;
+ ),
+
+ VCPU_TP_PRINTK("%s prefix to %08x",
+ __entry->set ? "setting" : "storing",
+ __entry->address)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stap,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 address),
+ TP_ARGS(VCPU_ARGS_COMMON, address),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, address)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->address = address;
+ ),
+
+ VCPU_TP_PRINTK("storing cpu address to %016llx",
+ __entry->address)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stfl,
+ TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list),
+ TP_ARGS(VCPU_ARGS_COMMON, facility_list),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(unsigned int, facility_list)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->facility_list = facility_list;
+ ),
+
+ VCPU_TP_PRINTK("store facility list value %08x",
+ __entry->facility_list)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stsi,
+ TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, fc)
+ __field(int, sel1)
+ __field(int, sel2)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->fc = fc;
+ __entry->sel1 = sel1;
+ __entry->sel2 = sel2;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx",
+ __entry->fc, __entry->sel1, __entry->sel2,
+ __entry->addr)
+ );
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
source "arch/x86/xen/Kconfig"
-config KVM_CLOCK
- bool "KVM paravirtualized clock"
- select PARAVIRT
- select PARAVIRT_CLOCK
- ---help---
- Turning on this option will allow you to run a paravirtualized clock
- when running over the KVM hypervisor. Instead of relying on a PIT
- (or probably other) emulation by the underlying device model, the host
- provides the guest with timing infrastructure such as time of day, and
- system time
-
config KVM_GUEST
- bool "KVM Guest support"
+ bool "KVM Guest support (including kvmclock)"
+ select PARAVIRT
select PARAVIRT
+ select PARAVIRT_CLOCK
+ default y if PARAVIRT_GUEST
---help---
This option enables various optimizations for running under the KVM
- hypervisor.
+ hypervisor. It includes a paravirtualized clock, so that instead
+ of relying on a PIT (or probably other) emulation by the
+ underlying device model, the host provides the guest with
+ timing infrastructure such as time of day, and system time
source "arch/x86/lguest/Kconfig"
#define __KVM_HAVE_DEBUGREGS
#define __KVM_HAVE_XSAVE
#define __KVM_HAVE_XCRS
+#define __KVM_HAVE_READONLY_MEM
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
struct x86_emulate_ops {
+ /*
+ * read_gpr: read a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ */
+ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
+ /*
+ * write_gpr: write a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ * @val: value to write.
+ */
+ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
* Used for descriptor reading.
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
unsigned int bytes;
+ unsigned int count;
union {
unsigned long orig_val;
u64 orig_val64;
char valptr[sizeof(unsigned long) + 2];
sse128_t vec_val;
u64 mm_val;
+ void *data;
};
};
unsigned long end;
};
+/* Execution mode, passed to the emulator. */
+enum x86emul_mode {
+ X86EMUL_MODE_REAL, /* Real mode. */
+ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
+ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
+ X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
+ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
+};
+
struct x86_emulate_ctxt {
- struct x86_emulate_ops *ops;
+ const struct x86_emulate_ops *ops;
/* Register state before/after emulation. */
unsigned long eflags;
unsigned long eip; /* eip before instruction emulation */
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
- int mode;
+ enum x86emul_mode mode;
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
bool rip_relative;
unsigned long _eip;
struct operand memop;
+ u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */
+ u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */
/* Fields above regs are cleared together. */
- unsigned long regs[NR_VCPU_REGS];
+ unsigned long _regs[NR_VCPU_REGS];
struct operand *memopp;
struct fetch_cache fetch;
struct read_cache io_read;
#define REPE_PREFIX 0xf3
#define REPNE_PREFIX 0xf2
-/* Execution mode, passed to the emulator. */
-#define X86EMUL_MODE_REAL 0 /* Real mode. */
-#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
-#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
-#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
-#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
-
-/* any protected mode */
-#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
- X86EMUL_MODE_PROT64)
-
/* CPUID vendors */
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code);
int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
union kvm_mmu_page_role base_role;
bool direct_map;
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
+
u64 *pae_root;
u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+ /*
+ * Bitmap: bit set = last pte in walk
+ * index[0:1]: level (zero-based)
+ * index[2]: pte.ps
+ */
+ u8 last_pte_bitmap;
+
bool nx;
u64 pdptrs[4]; /* pae */
struct x86_emulate_ctxt emulate_ctxt;
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
+ int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+ /* set guest stopped flag in pvclock flags field */
+ bool pvclock_set_guest_stopped_request;
struct {
u64 msr_val;
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+ unsigned long guest_debug_dr7;
u64 mcg_cap;
u64 mcg_status;
};
struct kvm_lpage_info {
- unsigned long rmap_pde;
int write_count;
};
struct kvm_arch_memory_slot {
+ unsigned long *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
+struct kvm_apic_map {
+ struct rcu_head rcu;
+ u8 ldr_bits;
+ /* fields bellow are used to decode ldr values in different modes */
+ u32 cid_shift, cid_mask, lid_mask;
+ struct kvm_lapic *phys_map[256];
+ /* first index is cluster id second is cpu id in a cluster */
+ struct kvm_lapic *logical_map[16][16];
+};
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
int vapics_in_nmi_mode;
+ struct mutex apic_map_lock;
+ struct kvm_apic_map *apic_map;
unsigned int tss_addr;
struct page *apic_access_page;
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg);
+ void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
extern void kvmclock_init(void);
extern int kvm_register_clock(char *txt);
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
#else
static inline bool kvm_check_and_clear_guest_paused(void)
{
return false;
}
-#endif /* CONFIG_KVMCLOCK */
+#endif /* CONFIG_KVM_GUEST */
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
*/
#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
* instructions are guaranteed to be supported.
*
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
-obj-$(CONFIG_KVM_GUEST) += kvm.o
-obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
+ kvm_disable_steal_time();
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
-#ifdef CONFIG_KVM_CLOCK
WARN_ON(kvm_register_clock("primary cpu clock"));
-#endif
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
}
initmem_init();
memblock_find_dma_reserve();
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
kvmclock_init();
#endif
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ depends on HIGH_RES_TIMERS
# for device assignment:
depends on PCI
# for TASKSTATS/TASK_DELAY_ACCT:
select TASK_DELAY_ACCT
select PERF_EVENTS
select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o cpuid.o pmu.o
+ i8254.o cpuid.o pmu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
}
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* Mask ebx against host capbability word 9 */
+ /* Mask ebx against host capability word 9 */
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
break;
}
case KVM_CPUID_SIGNATURE: {
- char signature[12] = "KVMKVMKVM\0\0";
- u32 *sigptr = (u32 *)signature;
+ static const char signature[12] = "KVMKVMKVM\0\0";
+ const u32 *sigptr = (const u32 *)signature;
entry->eax = KVM_CPUID_FEATURES;
entry->ebx = sigptr[0];
entry->ecx = sigptr[1];
u32 func;
u32 idx;
bool has_leaf_count;
- bool (*qualifier)(struct kvm_cpuid_param *param);
+ bool (*qualifier)(const struct kvm_cpuid_param *param);
};
-static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
{
return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
}
struct kvm_cpuid_entry2 *cpuid_entries;
int limit, nent = 0, r = -E2BIG, i;
u32 func;
- static struct kvm_cpuid_param param[] = {
+ static const struct kvm_cpuid_param param[] = {
{ .func = 0, .has_leaf_count = true },
{ .func = 0x80000000, .has_leaf_count = true },
{ .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
r = 0;
for (i = 0; i < ARRAY_SIZE(param); i++) {
- struct kvm_cpuid_param *ent = ¶m[i];
+ const struct kvm_cpuid_param *ent = ¶m[i];
if (ent->qualifier && !ent->qualifier(ent))
continue;
u64 intercept : 8;
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
- struct opcode *group;
- struct group_dual *gdual;
- struct gprefix *gprefix;
+ const struct opcode *group;
+ const struct group_dual *gdual;
+ const struct gprefix *gprefix;
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
#define EFLG_RESERVED_ONE_MASK 2
+static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned reg;
+
+ for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
+ ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs_dirty = 0;
+ ctxt->regs_valid = 0;
+}
+
/*
* Instruction emulation:
* Most instructions are emulated directly via a fragment of inline assembly
#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
do { \
unsigned long _tmp; \
- ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \
- ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \
+ ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
+ ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
\
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "5", "1") \
static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
{
- masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
+ masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
}
static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
la = seg_base(ctxt, addr.seg) + addr.ea;
switch (ctxt->mode) {
- case X86EMUL_MODE_REAL:
- break;
case X86EMUL_MODE_PROT64:
if (((signed long)la << 16) >> 16 != la)
return emulate_gp(ctxt, 0);
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
} else {
- /* exapand-down segment */
+ /* expand-down segment */
if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
goto bad;
}
cpl = ctxt->ops->cpl(ctxt);
- rpl = sel & 3;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ rpl = 0;
+ else
+ rpl = sel & 3;
cpl = max(cpl, rpl);
if (!(desc.type & 8)) {
/* data segment */
return X86EMUL_CONTINUE;
bad:
if (addr.seg == VCPU_SREG_SS)
- return emulate_ss(ctxt, addr.seg);
+ return emulate_ss(ctxt, sel);
else
- return emulate_gp(ctxt, addr.seg);
+ return emulate_gp(ctxt, sel);
}
static int linearize(struct x86_emulate_ctxt *ctxt,
* pointer into the block that addresses the relevant register.
* @highbyte_regs specifies whether to decode AH,CH,DH,BH.
*/
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
int highbyte_regs)
{
void *p;
- p = ®s[modrm_reg];
if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)®s[modrm_reg & 3] + 1;
+ p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+ else
+ p = reg_rmw(ctxt, modrm_reg);
return p;
}
{
ctxt->ops->get_fpu(ctxt);
switch (reg) {
- case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
- case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
- case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
- case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
- case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
- case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
- case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
- case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
#ifdef CONFIG_X86_64
- case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
- case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
- case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
- case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
- case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
- case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
- case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
- case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
#endif
default: BUG();
}
{
ctxt->ops->get_fpu(ctxt);
switch (reg) {
- case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
- case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
- case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
- case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
- case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
- case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
- case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
- case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
#ifdef CONFIG_X86_64
- case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
- case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
- case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
- case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
- case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
- case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
- case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
- case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
#endif
default: BUG();
}
op->type = OP_REG;
if (ctxt->d & ByteOp) {
- op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
+ op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
op->bytes = 1;
} else {
- op->addr.reg = decode_register(reg, ctxt->regs, 0);
+ op->addr.reg = decode_register(ctxt, reg, 0);
op->bytes = ctxt->op_bytes;
}
fetch_register_operand(op);
if (ctxt->modrm_mod == 3) {
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.reg = decode_register(ctxt->modrm_rm,
- ctxt->regs, ctxt->d & ByteOp);
+ op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp);
if (ctxt->d & Sse) {
op->type = OP_XMM;
op->bytes = 16;
op->type = OP_MEM;
if (ctxt->ad_bytes == 2) {
- unsigned bx = ctxt->regs[VCPU_REGS_RBX];
- unsigned bp = ctxt->regs[VCPU_REGS_RBP];
- unsigned si = ctxt->regs[VCPU_REGS_RSI];
- unsigned di = ctxt->regs[VCPU_REGS_RDI];
+ unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+ unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+ unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+ unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
/* 16-bit ModR/M decode. */
switch (ctxt->modrm_mod) {
if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
modrm_ea += insn_fetch(s32, ctxt);
else {
- modrm_ea += ctxt->regs[base_reg];
+ modrm_ea += reg_read(ctxt, base_reg);
adjust_modrm_seg(ctxt, base_reg);
}
if (index_reg != 4)
- modrm_ea += ctxt->regs[index_reg] << scale;
+ modrm_ea += reg_read(ctxt, index_reg) << scale;
} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
ctxt->rip_relative = 1;
} else {
base_reg = ctxt->modrm_rm;
- modrm_ea += ctxt->regs[base_reg];
+ modrm_ea += reg_read(ctxt, base_reg);
adjust_modrm_seg(ctxt, base_reg);
}
switch (ctxt->modrm_mod) {
int rc;
struct read_cache *mc = &ctxt->mem_read;
- while (size) {
- int n = min(size, 8u);
- size -= n;
- if (mc->pos < mc->end)
- goto read_cached;
+ if (mc->pos < mc->end)
+ goto read_cached;
- rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
- &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- mc->end += n;
+ WARN_ON((mc->end + size) >= sizeof(mc->data));
- read_cached:
- memcpy(dest, mc->data + mc->pos, n);
- mc->pos += n;
- dest += n;
- addr += n;
- }
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
return X86EMUL_CONTINUE;
}
if (rc->pos == rc->end) { /* refill pio read ahead */
unsigned int in_page, n;
unsigned int count = ctxt->rep_prefix ?
- address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
+ address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
in_page = (ctxt->eflags & EFLG_DF) ?
- offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
- PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
+ offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+ PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
count);
if (n == 0)
rc->end = n * size;
}
- memcpy(dest, rc->data + rc->pos, size);
- rc->pos += size;
+ if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+ ctxt->dst.data = rc->data + rc->pos;
+ ctxt->dst.type = OP_MEM_STR;
+ ctxt->dst.count = (rc->end - rc->pos) / size;
+ rc->pos = rc->end;
+ } else {
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ }
return 1;
}
static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_ptr *dt)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
if (selector & 1 << 2) {
struct desc_struct desc;
bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
ulong desc_addr;
int ret;
+ u16 dummy;
memset(&seg_desc, 0, sizeof seg_desc);
if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
|| ctxt->mode == X86EMUL_MODE_REAL) {
/* set real mode segment descriptor */
+ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
set_desc_base(&seg_desc, selector << 4);
- set_desc_limit(&seg_desc, 0xffff);
- seg_desc.type = 3;
- seg_desc.p = 1;
- seg_desc.s = 1;
- if (ctxt->mode == X86EMUL_MODE_VM86)
- seg_desc.dpl = 3;
goto load;
}
err_code = selector & 0xfffc;
err_vec = GP_VECTOR;
- /* can't load system descriptor into segment selecor */
+ /* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s)
goto exception;
if (rc != X86EMUL_CONTINUE)
return rc;
break;
+ case OP_MEM_STR:
+ rc = segmented_write(ctxt,
+ ctxt->dst.addr.mem,
+ ctxt->dst.data,
+ ctxt->dst.bytes * ctxt->dst.count);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
case OP_XMM:
write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
break;
struct segmented_address addr;
rsp_increment(ctxt, -bytes);
- addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
addr.seg = VCPU_SREG_SS;
return segmented_write(ctxt, addr, data, bytes);
int rc;
struct segmented_address addr;
- addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
addr.seg = VCPU_SREG_SS;
rc = segmented_read(ctxt, addr, dest, len);
if (rc != X86EMUL_CONTINUE)
int rc;
unsigned frame_size = ctxt->src.val;
unsigned nesting_level = ctxt->src2.val & 31;
+ ulong rbp;
if (nesting_level)
return X86EMUL_UNHANDLEABLE;
- rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
+ rbp = reg_read(ctxt, VCPU_REGS_RBP);
+ rc = push(ctxt, &rbp, stack_size(ctxt));
if (rc != X86EMUL_CONTINUE)
return rc;
- assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
stack_mask(ctxt));
- assign_masked(&ctxt->regs[VCPU_REGS_RSP],
- ctxt->regs[VCPU_REGS_RSP] - frame_size,
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+ reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
stack_mask(ctxt));
return X86EMUL_CONTINUE;
}
static int em_leave(struct x86_emulate_ctxt *ctxt)
{
- assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
stack_mask(ctxt));
- return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
+ return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
}
static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
static int em_pusha(struct x86_emulate_ctxt *ctxt)
{
- unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
+ unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
- (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
+ (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
--reg;
}
- rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
+ rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
break;
--reg;
return rc;
}
-int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
int rc;
struct desc_ptr dt;
gva_t cs_addr;
return rc;
}
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ rc = __emulate_int_real(ctxt, irq);
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+ return rc;
+}
+
static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
{
switch(ctxt->mode) {
case X86EMUL_MODE_REAL:
- return emulate_int_real(ctxt, irq);
+ return __emulate_int_real(ctxt, irq);
case X86EMUL_MODE_VM86:
case X86EMUL_MODE_PROT16:
case X86EMUL_MODE_PROT32:
{
u64 old = ctxt->dst.orig_val64;
- if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
- ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+ if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
} else {
- ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
- (u32) ctxt->regs[VCPU_REGS_RBX];
+ ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+ (u32) reg_read(ctxt, VCPU_REGS_RBX);
ctxt->eflags |= EFLG_ZF;
}
{
/* Save real source value, then compare EAX against destination. */
ctxt->src.orig_val = ctxt->src.val;
- ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+ ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
emulate_2op_SrcV(ctxt, "cmp");
if (ctxt->eflags & EFLG_ZF) {
} else {
/* Failure: write the value we saw to EAX. */
ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
}
return X86EMUL_CONTINUE;
}
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
{
- u16 selector;
-
- memset(cs, 0, sizeof(struct desc_struct));
- ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct desc_struct));
-
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
cs->dpl = 0; /* will be adjusted later */
cs->p = 1;
cs->d = 1;
+ cs->avl = 0;
set_desc_base(ss, 0); /* flat segment */
set_desc_limit(ss, 0xfffff); /* 4GB limit */
ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
}
static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
u32 eax, ebx, ecx, edx;
/*
static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
u64 msr_data;
u16 cs_sel, ss_sel;
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
- ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
ops->get_msr(ctxt,
ctxt->mode == X86EMUL_MODE_PROT64 ?
static int em_sysenter(struct x86_emulate_ctxt *ctxt)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
u64 msr_data;
u16 cs_sel, ss_sel;
if (msr_data == 0x0)
return emulate_gp(ctxt, 0);
break;
+ default:
+ break;
}
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
ctxt->_eip = msr_data;
ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
- ctxt->regs[VCPU_REGS_RSP] = msr_data;
+ *reg_write(ctxt, VCPU_REGS_RSP) = msr_data;
return X86EMUL_CONTINUE;
}
static int em_sysexit(struct x86_emulate_ctxt *ctxt)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
u64 msr_data;
int usermode;
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
- ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
+ ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
+ *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
return X86EMUL_CONTINUE;
}
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
u16 port, u16 len)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct tr_seg;
u32 base3;
int r;
{
tss->ip = ctxt->_eip;
tss->flag = ctxt->eflags;
- tss->ax = ctxt->regs[VCPU_REGS_RAX];
- tss->cx = ctxt->regs[VCPU_REGS_RCX];
- tss->dx = ctxt->regs[VCPU_REGS_RDX];
- tss->bx = ctxt->regs[VCPU_REGS_RBX];
- tss->sp = ctxt->regs[VCPU_REGS_RSP];
- tss->bp = ctxt->regs[VCPU_REGS_RBP];
- tss->si = ctxt->regs[VCPU_REGS_RSI];
- tss->di = ctxt->regs[VCPU_REGS_RDI];
+ tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->di = reg_read(ctxt, VCPU_REGS_RDI);
tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
ctxt->_eip = tss->ip;
ctxt->eflags = tss->flag | 2;
- ctxt->regs[VCPU_REGS_RAX] = tss->ax;
- ctxt->regs[VCPU_REGS_RCX] = tss->cx;
- ctxt->regs[VCPU_REGS_RDX] = tss->dx;
- ctxt->regs[VCPU_REGS_RBX] = tss->bx;
- ctxt->regs[VCPU_REGS_RSP] = tss->sp;
- ctxt->regs[VCPU_REGS_RBP] = tss->bp;
- ctxt->regs[VCPU_REGS_RSI] = tss->si;
- ctxt->regs[VCPU_REGS_RDI] = tss->di;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
/*
* SDM says that segment selectors are loaded before segment
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
u16 tss_selector, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct tss_segment_16 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
tss->eip = ctxt->_eip;
tss->eflags = ctxt->eflags;
- tss->eax = ctxt->regs[VCPU_REGS_RAX];
- tss->ecx = ctxt->regs[VCPU_REGS_RCX];
- tss->edx = ctxt->regs[VCPU_REGS_RDX];
- tss->ebx = ctxt->regs[VCPU_REGS_RBX];
- tss->esp = ctxt->regs[VCPU_REGS_RSP];
- tss->ebp = ctxt->regs[VCPU_REGS_RBP];
- tss->esi = ctxt->regs[VCPU_REGS_RSI];
- tss->edi = ctxt->regs[VCPU_REGS_RDI];
+ tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
ctxt->eflags = tss->eflags | 2;
/* General purpose registers */
- ctxt->regs[VCPU_REGS_RAX] = tss->eax;
- ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
- ctxt->regs[VCPU_REGS_RDX] = tss->edx;
- ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
- ctxt->regs[VCPU_REGS_RSP] = tss->esp;
- ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
- ctxt->regs[VCPU_REGS_RSI] = tss->esi;
- ctxt->regs[VCPU_REGS_RDI] = tss->edi;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
/*
* SDM says that segment selectors are loaded before segment
u16 tss_selector, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct tss_segment_32 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct curr_tss_desc, next_tss_desc;
int ret;
u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check agains DPL of the TSS
+ * 3. jmp/call to TSS: Check against DPL of the TSS
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
+ note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
{
int rc;
+ invalidate_registers(ctxt);
ctxt->_eip = ctxt->eip;
ctxt->dst.type = OP_NONE;
rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
- if (rc == X86EMUL_CONTINUE)
+ if (rc == X86EMUL_CONTINUE) {
ctxt->eip = ctxt->_eip;
+ writeback_registers(ctxt);
+ }
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
- int reg, struct operand *op)
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+ struct operand *op)
{
- int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+ int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
- register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
- op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
- op->addr.mem.seg = seg;
+ register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
}
static int em_das(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.type = OP_REG;
ctxt->dst.bytes = ctxt->src.bytes;
- ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
return X86EMUL_CONTINUE;
u64 tsc = 0;
ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
- ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
- ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
return X86EMUL_CONTINUE;
}
{
u64 pmc;
- if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
+ if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
return emulate_gp(ctxt, 0);
- ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
- ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
return X86EMUL_CONTINUE;
}
{
u64 msr_data;
- msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
- | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
- if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
+ msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+ | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+ if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
{
u64 msr_data;
- if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
+ if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
return emulate_gp(ctxt, 0);
- ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
- ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
return X86EMUL_CONTINUE;
}
static int em_loop(struct x86_emulate_ctxt *ctxt)
{
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
- if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+ register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
+ if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
(ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
jmp_rel(ctxt, ctxt->src.val);
static int em_jcxz(struct x86_emulate_ctxt *ctxt)
{
- if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
jmp_rel(ctxt, ctxt->src.val);
return X86EMUL_CONTINUE;
{
u32 eax, ebx, ecx, edx;
- eax = ctxt->regs[VCPU_REGS_RAX];
- ecx = ctxt->regs[VCPU_REGS_RCX];
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
- ctxt->regs[VCPU_REGS_RAX] = eax;
- ctxt->regs[VCPU_REGS_RBX] = ebx;
- ctxt->regs[VCPU_REGS_RCX] = ecx;
- ctxt->regs[VCPU_REGS_RDX] = edx;
+ *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+ *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = edx;
return X86EMUL_CONTINUE;
}
static int em_lahf(struct x86_emulate_ctxt *ctxt)
{
- ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
- ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
return X86EMUL_CONTINUE;
}
static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
{
- u64 rax = ctxt->regs[VCPU_REGS_RAX];
+ u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
/* Valid physical address? */
if (rax & 0xffff000000000000ULL)
static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
{
u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
- u64 rcx = ctxt->regs[VCPU_REGS_RCX];
+ u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
(rcx > 3))
I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
-static struct opcode group7_rm1[] = {
+static const struct opcode group7_rm1[] = {
DI(SrcNone | Priv, monitor),
DI(SrcNone | Priv, mwait),
N, N, N, N, N, N,
};
-static struct opcode group7_rm3[] = {
+static const struct opcode group7_rm3[] = {
DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
DIP(SrcNone | Prot | Priv, invlpga, check_svme),
};
-static struct opcode group7_rm7[] = {
+static const struct opcode group7_rm7[] = {
N,
DIP(SrcNone, rdtscp, check_rdtsc),
N, N, N, N, N, N,
};
-static struct opcode group1[] = {
+static const struct opcode group1[] = {
I(Lock, em_add),
I(Lock | PageTable, em_or),
I(Lock, em_adc),
I(0, em_cmp),
};
-static struct opcode group1A[] = {
+static const struct opcode group1A[] = {
I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
};
-static struct opcode group3[] = {
+static const struct opcode group3[] = {
I(DstMem | SrcImm, em_test),
I(DstMem | SrcImm, em_test),
I(DstMem | SrcNone | Lock, em_not),
I(SrcMem, em_idiv_ex),
};
-static struct opcode group4[] = {
+static const struct opcode group4[] = {
I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
N, N, N, N, N, N,
};
-static struct opcode group5[] = {
+static const struct opcode group5[] = {
I(DstMem | SrcNone | Lock, em_grp45),
I(DstMem | SrcNone | Lock, em_grp45),
I(SrcMem | Stack, em_grp45),
I(SrcMem | Stack, em_grp45), N,
};
-static struct opcode group6[] = {
+static const struct opcode group6[] = {
DI(Prot, sldt),
DI(Prot, str),
II(Prot | Priv | SrcMem16, em_lldt, lldt),
N, N, N, N,
};
-static struct group_dual group7 = { {
+static const struct group_dual group7 = { {
II(Mov | DstMem | Priv, em_sgdt, sgdt),
II(Mov | DstMem | Priv, em_sidt, sidt),
II(SrcMem | Priv, em_lgdt, lgdt),
EXT(0, group7_rm7),
} };
-static struct opcode group8[] = {
+static const struct opcode group8[] = {
N, N, N, N,
I(DstMem | SrcImmByte, em_bt),
I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
-static struct group_dual group9 = { {
+static const struct group_dual group9 = { {
N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
}, {
N, N, N, N, N, N, N, N,
} };
-static struct opcode group11[] = {
+static const struct opcode group11[] = {
I(DstMem | SrcImm | Mov | PageTable, em_mov),
X7(D(Undefined)),
};
-static struct gprefix pfx_0f_6f_0f_7f = {
+static const struct gprefix pfx_0f_6f_0f_7f = {
I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
};
-static struct gprefix pfx_vmovntpx = {
+static const struct gprefix pfx_vmovntpx = {
I(0, em_mov), N, N, N,
};
-static struct opcode opcode_table[256] = {
+static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
I6ALU(Lock, em_add),
I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
I(SrcImmByte | Mov | Stack, em_push),
I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
X16(D(SrcImmByte)),
D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
};
-static struct opcode twobyte_table[256] = {
+static const struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
G(0, group6), GD(0, &group7), N, N,
N, I(ImplicitOps | VendorSpecific, em_syscall),
case OpAcc:
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
fetch_register_operand(op);
op->orig_val = op->val;
break;
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+ register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
op->addr.mem.seg = VCPU_SREG_ES;
op->val = 0;
+ op->count = 1;
break;
case OpDX:
op->type = OP_REG;
op->bytes = 2;
- op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
fetch_register_operand(op);
break;
case OpCL:
op->bytes = 1;
- op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
+ op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
break;
case OpImmByte:
rc = decode_imm(ctxt, op, 1, true);
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+ register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
op->addr.mem.seg = seg_override(ctxt);
op->val = 0;
+ op->count = 1;
break;
case OpImmFAddr:
op->type = OP_IMM;
read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
}
+
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
- struct x86_emulate_ops *ops = ctxt->ops;
+ const struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
}
/* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
rc = emulate_ud(ctxt);
goto done;
}
if (ctxt->rep_prefix && (ctxt->d & String)) {
/* All REP prefixes have the same first termination condition */
- if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
ctxt->eip = ctxt->_eip;
goto done;
}
ctxt->dst.val = ctxt->src.addr.mem.ea;
break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
- if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
+ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
break;
rc = em_xchg(ctxt);
break;
rc = em_grp2(ctxt);
break;
case 0xd2 ... 0xd3: /* Grp2 */
- ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
+ ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
rc = em_grp2(ctxt);
break;
case 0xe9: /* jmp rel */
ctxt->dst.type = saved_dst_type;
if ((ctxt->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override(ctxt),
- VCPU_REGS_RSI, &ctxt->src);
+ string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
if ((ctxt->d & DstMask) == DstDI)
- string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
- &ctxt->dst);
+ string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
if (ctxt->rep_prefix && (ctxt->d & String)) {
+ unsigned int count;
struct read_cache *r = &ctxt->io_read;
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+ if ((ctxt->d & SrcMask) == SrcSI)
+ count = ctxt->src.count;
+ else
+ count = ctxt->dst.count;
+ register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
+ -count);
if (!string_insn_completed(ctxt)) {
/*
* Re-enter guest when pio read ahead buffer is empty
* or, if it is not used, after each 1024 iteration.
*/
- if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
+ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
(r->end == 0 || r->end != r->pos)) {
/*
* Reset read cache. Usually happens before
* we have to do it here.
*/
ctxt->mem_read.end = 0;
+ writeback_registers(ctxt);
return EMULATION_RESTART;
}
goto done; /* skip rip writeback */
if (rc == X86EMUL_INTERCEPTED)
return EMULATION_INTERCEPTED;
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
cannot_emulate:
return EMULATION_FAILED;
}
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ writeback_registers(ctxt);
+}
ktime_t remaining;
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- if (!ps->pit_timer.period)
+ if (!ps->period)
return 0;
/*
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->period);
return elapsed;
}
int value;
spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pit_timer.pending);
+ value = atomic_dec_return(&ps->pending);
if (value < 0)
/* spurious acks can be generated if, for example, the
* PIC is being reset. Handle it gracefully here
*/
- atomic_inc(&ps->pit_timer.pending);
+ atomic_inc(&ps->pending);
else if (value > 0)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
- timer = &pit->pit_state.pit_timer.timer;
+ timer = &pit->pit_state.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_pit *pit)
{
- hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ hrtimer_cancel(&pit->pit_state.timer);
flush_kthread_work(&pit->expired);
}
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
-{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
-}
-
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
-
static void pit_do_work(struct kthread_work *work)
{
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
+ if (ps->reinject || !atomic_read(&ps->pending)) {
+ atomic_inc(&ps->pending);
queue_kthread_work(&pt->worker, &pt->expired);
}
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
{
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&ps->timer);
flush_kthread_work(&ps->pit->expired);
- pt->period = interval;
+ ps->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = pit_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
+ ps->timer.function = pit_timer_fn;
+ ps->kvm = ps->pit->kvm;
- atomic_set(&pt->pending, 0);
+ atomic_set(&ps->pending, 0);
ps->irq_ack = 1;
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
}
mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
}
pit_state = &pit->pit_state;
pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
+ pit_state->reinject = true;
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
kvm_unregister_irq_ack_notifier(kvm,
&kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ timer = &kvm->arch.vpit->pit_state.timer;
hrtimer_cancel(timer);
flush_kthread_work(&kvm->arch.vpit->expired);
kthread_stop(kvm->arch.vpit->worker_task);
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
u32 flags;
- struct kvm_timer pit_timer;
bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm *kvm;
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
{
- int ret = -1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
pic_lock(s);
- if (irq >= 0 && irq < PIC_NUM_PINS) {
- int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
- irq_source_id, level);
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
- pic_update_irq(s);
- trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
- s->pics[irq >> 3].imr, ret == 0);
- }
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
pic_unlock(s);
return ret;
{
int irq, i;
struct kvm_vcpu *vcpu;
- u8 irr = s->irr, isr = s->imr;
+ u8 edge_irr = s->irr & ~s->elcr;
bool found = false;
s->last_irr = 0;
- s->irr = 0;
+ s->irr &= s->elcr;
s->imr = 0;
- s->isr = 0;
s->priority_add = 0;
- s->irq_base = 0;
- s->read_reg_select = 0;
- s->poll = 0;
s->special_mask = 0;
- s->init_state = 0;
- s->auto_eoi = 0;
- s->rotate_on_auto_eoi = 0;
- s->special_fully_nested_mode = 0;
- s->init4 = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
if (kvm_apic_accept_pic_intr(vcpu)) {
return;
for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
- if (irr & (1 << irq) || isr & (1 << irq))
+ if (edge_irr & (1 << irq))
pic_clear_isr(s, irq);
}
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
- u8 edge_irr = s->irr & ~s->elcr;
- int i;
- bool found = false;
- struct kvm_vcpu *vcpu;
-
s->init4 = val & 1;
- s->last_irr = 0;
- s->irr &= s->elcr;
- s->imr = 0;
- s->priority_add = 0;
- s->special_mask = 0;
- s->read_reg_select = 0;
- if (!s->init4) {
- s->special_fully_nested_mode = 0;
- s->auto_eoi = 0;
- }
- s->init_state = 1;
if (val & 0x02)
pr_pic_unimpl("single mode not supported");
if (val & 0x08)
pr_pic_unimpl(
- "level sensitive irq not supported");
-
- kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
- if (kvm_apic_accept_pic_intr(vcpu)) {
- found = true;
- break;
- }
-
-
- if (found)
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
- if (edge_irr & (1 << irq))
- pic_clear_isr(s, irq);
+ "level sensitive irq not supported");
+ kvm_pic_reset(s);
} else if (val & 0x08) {
if (val & 0x04)
s->poll = 1;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq);
- unsigned long irq_states[16];
+ unsigned long irq_states[PIC_NUM_PINS];
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+++ /dev/null
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- u32 timer_mode_mask;
- u64 tscdeadline;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
#include <asm/current.h>
#include <asm/apicdef.h>
#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#include "trace.h"
#define APIC_DEST_NOSHORT 0x0
#define APIC_DEST_MASK 0x800
#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
static unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
- return *((u32 *) (apic->regs + reg_off));
-}
-
static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
*((u32 *) (apic->regs + reg_off)) = val;
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
- return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
-}
+struct static_key_deferred apic_hw_disabled __read_mostly;
+struct static_key_deferred apic_sw_disabled __read_mostly;
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
+ if (val & APIC_SPIV_APIC_ENABLED)
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+ else
+ static_key_slow_inc(&apic_sw_disabled.key);
+ }
+ apic_set_reg(apic, APIC_SPIV, val);
}
static inline int apic_enabled(struct kvm_lapic *apic)
{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
}
#define LVT_MASK \
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+ u16 cid;
+ ldr >>= 32 - map->ldr_bits;
+ cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+ BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+ return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+ ldr >>= (32 - map->ldr_bits);
+ return ldr & map->lid_mask;
+}
+
+static void recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+
+ mutex_lock(&kvm->arch.apic_map_lock);
+
+ if (!new)
+ goto out;
+
+ new->ldr_bits = 8;
+ /* flat mode is default */
+ new->cid_shift = 8;
+ new->cid_mask = 0;
+ new->lid_mask = 0xff;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u16 cid, lid;
+ u32 ldr;
+
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ /*
+ * All APICs have to be configured in the same mode by an OS.
+ * We take advatage of this while building logical id loockup
+ * table. After reset APICs are in xapic/flat mode, so if we
+ * find apic with different setting we assume this is the mode
+ * OS wants all apics to be in; build lookup table accordingly.
+ */
+ if (apic_x2apic_mode(apic)) {
+ new->ldr_bits = 32;
+ new->cid_shift = 16;
+ new->cid_mask = new->lid_mask = 0xffff;
+ } else if (kvm_apic_sw_enabled(apic) &&
+ !new->cid_mask /* flat mode */ &&
+ kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
+ new->cid_shift = 4;
+ new->cid_mask = 0xf;
+ new->lid_mask = 0xf;
+ }
+
+ new->phys_map[kvm_apic_id(apic)] = apic;
+
+ ldr = kvm_apic_get_reg(apic, APIC_LDR);
+ cid = apic_cluster_id(new, ldr);
+ lid = apic_logical_id(new, ldr);
+
+ if (lid)
+ new->logical_map[cid][ffs(lid) - 1] = apic;
+ }
+out:
+ old = rcu_dereference_protected(kvm->arch.apic_map,
+ lockdep_is_held(&kvm->arch.apic_map_lock));
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ kfree_rcu(old, rcu);
+}
+
+static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+{
+ apic_set_reg(apic, APIC_ID, id << 24);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ apic_set_reg(apic, APIC_LDR, id);
+ recalculate_apic_map(apic->vcpu->kvm);
}
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+ return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
}
static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+ return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
}
static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
}
static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) ==
APIC_LVT_TIMER_TSCDEADLINE);
}
struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
apic_set_reg(apic, APIC_LVR, v);
}
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
- return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
+static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
LVT_MASK | APIC_MODE_MASK, /* LVTPC */
static int find_highest_vector(void *bitmap)
{
- u32 *word = bitmap;
- int word_offset = MAX_APIC_VECTOR >> 5;
+ int vec;
+ u32 *reg;
- while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
- continue;
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
+ vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ if (*reg)
+ return fls(*reg) - 1 + vec;
+ }
- if (likely(!word_offset && !word[0]))
- return -1;
- else
- return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+ return -1;
}
static u8 count_vectors(void *bitmap)
{
- u32 *word = bitmap;
- int word_offset;
+ int vec;
+ u32 *reg;
u8 count = 0;
- for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
- count += hweight32(word[word_offset << 2]);
+
+ for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ count += hweight32(*reg);
+ }
+
return count;
}
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
/* This may race with setting of irr in __apic_accept_irq() and
* will cause vmexit immediately and the value will be recalculated
* on the next vmentry.
*/
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return 0;
- highest_irr = apic_find_highest_irr(apic);
+ highest_irr = apic_find_highest_irr(vcpu->arch.apic);
return highest_irr;
}
u32 tpr, isrv, ppr, old_ppr;
int isr;
- old_ppr = apic_get_reg(apic, APIC_PROCPRI);
- tpr = apic_get_reg(apic, APIC_TASKPRI);
+ old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
u32 logical_id;
if (apic_x2apic_mode(apic)) {
- logical_id = apic_get_reg(apic, APIC_LDR);
+ logical_id = kvm_apic_get_reg(apic, APIC_LDR);
return logical_id & mda;
}
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+ logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
- switch (apic_get_reg(apic, APIC_DFR)) {
+ switch (kvm_apic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
if (logical_id & mda)
result = 1;
break;
default:
apic_debug("Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+ apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
break;
}
return result;
}
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap = 1;
+ struct kvm_lapic **dst;
+ int i;
+ bool ret = false;
+
+ *r = -1;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ *r = kvm_apic_set_irq(src->vcpu, irq);
+ return true;
+ }
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (!map)
+ goto out;
+
+ if (irq->dest_mode == 0) { /* physical mode */
+ if (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->dest_id == 0xff)
+ goto out;
+ dst = &map->phys_map[irq->dest_id & 0xff];
+ } else {
+ u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+ dst = map->logical_map[apic_cluster_id(map, mda)];
+
+ bitmap = apic_logical_id(map, mda);
+
+ if (irq->delivery_mode == APIC_DM_LOWEST) {
+ int l = -1;
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (l < 0)
+ l = i;
+ else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+ l = i;
+ }
+
+ bitmap = (l >= 0) ? 1 << l : 0;
+ }
+ }
+
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (*r < 0)
+ *r = 0;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
+ }
+
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded.
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
int trigger_mode;
if (apic_test_vector(vector, apic->regs + APIC_TMR))
static void apic_send_ipi(struct kvm_lapic *apic)
{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+ u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
+ u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
struct kvm_lapic_irq irq;
irq.vector = icr_low & APIC_VECTOR_MASK;
ASSERT(apic != NULL);
/* if initial count is 0, current count should also be 0 */
- if (apic_get_reg(apic, APIC_TMICT) == 0)
+ if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
val = apic_get_tmcct(apic);
break;
-
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_apic_get_reg(apic, offset);
+ break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
/* fall thru */
default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
+ val = kvm_apic_get_reg(apic, offset);
break;
}
{
unsigned char alignment = offset & 0xf;
u32 result;
- /* this bitmask has a bit cleared for each reserver register */
+ /* this bitmask has a bit cleared for each reserved register */
static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
- return apic_hw_enabled(apic) &&
+ return kvm_apic_hw_enabled(apic) &&
addr >= apic->base_address &&
addr < apic->base_address + LAPIC_MMIO_LENGTH;
}
{
u32 tmp1, tmp2, tdcr;
- tdcr = apic_get_reg(apic, APIC_TDCR);
+ tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
apic->divide_count = 0x1 << (tmp2 & 0x7);
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- /* lapic timer in oneshot or peroidic mode */
+ /* lapic timer in oneshot or periodic mode */
now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+ apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
if (!apic->lapic_timer.period)
"timer initial count 0x%x, period %lldns, "
"expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
+ kvm_apic_get_reg(apic, APIC_TMICT),
apic->lapic_timer.period,
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
{
- int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+ int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
if (apic_lvt_nmi_mode(lvt0_val)) {
if (!nmi_wd_enabled) {
switch (reg) {
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_ID, val);
+ kvm_apic_set_id(apic, val >> 24);
else
ret = 1;
break;
case APIC_LDR:
if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
else
ret = 1;
break;
case APIC_DFR:
- if (!apic_x2apic_mode(apic))
+ if (!apic_x2apic_mode(apic)) {
apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- else
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
ret = 1;
break;
case APIC_SPIV: {
u32 mask = 0x3ff;
- if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
mask |= APIC_SPIV_DIRECTED_EOI;
- apic_set_reg(apic, APIC_SPIV, val & mask);
+ apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
u32 lvt_val;
for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
+ lvt_val = kvm_apic_get_reg(apic,
APIC_LVTT + 0x10 * i);
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
case APIC_LVT1:
case APIC_LVTERR:
/* TODO: Check vector */
- if (!apic_sw_enabled(apic))
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
break;
case APIC_LVTT:
- if ((apic_get_reg(apic, APIC_LVTT) &
+ if ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) !=
(val & apic->lapic_timer.timer_mode_mask))
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!apic_sw_enabled(apic))
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
apic_set_reg(apic, APIC_LVTT, val);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (apic)
+ if (kvm_vcpu_has_lapic(vcpu))
apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
+ static_key_slow_dec_deferred(&apic_sw_disabled);
- if (vcpu->arch.apic->regs)
- free_page((unsigned long)vcpu->arch.apic->regs);
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
- kfree(vcpu->arch.apic);
+ kfree(apic);
}
/*
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return 0;
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
return 0;
return apic->lapic_timer.tscdeadline;
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return;
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
return;
hrtimer_cancel(&apic->lapic_timer.timer);
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
+
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+ | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+ tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
}
return;
}
+ /* update jump label if enable bit changes */
+ if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE)
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+ else
+ static_key_slow_inc(&apic_hw_disabled.key);
+ recalculate_apic_map(vcpu->kvm);
+ }
+
if (!kvm_vcpu_is_bsp(apic->vcpu))
value &= ~MSR_IA32_APICBASE_BSP;
if (apic_x2apic_mode(apic)) {
u32 id = kvm_apic_id(apic);
u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
- apic_set_reg(apic, APIC_LDR, ldr);
+ kvm_apic_set_ldr(apic, ldr);
}
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+ kvm_apic_set_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
+ apic_set_spiv(apic, 0xff);
apic_set_reg(apic, APIC_TASKPRI, 0);
- apic_set_reg(apic, APIC_LDR, 0);
+ kvm_apic_set_ldr(apic, 0);
apic_set_reg(apic, APIC_ESR, 0);
apic_set_reg(apic, APIC_ICR, 0);
apic_set_reg(apic, APIC_ICR2, 0);
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
if (kvm_vcpu_is_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu,
+ vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
vcpu->arch.apic_base, apic->base_address);
}
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
-}
-
/*
*----------------------------------------------------------------------
* timer interface
*----------------------------------------------------------------------
*/
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *lapic = vcpu->arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->lapic_timer.pending);
+ if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
+ apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
return 0;
}
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
- u32 reg = apic_get_reg(apic, lvt_type);
+ u32 reg = kvm_apic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
- if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
-};
-
static const struct kvm_io_device_ops apic_mmio_ops = {
.read = apic_mmio_read,
.write = apic_mmio_write,
};
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially losing timer events in the !reinject
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
+ */
+ if (!atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ }
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (lapic_is_periodic(apic)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu = vcpu;
+ apic->lapic_timer.timer.function = apic_timer_fn;
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+ /*
+ * APIC is created enabled. This will prevent kvm_lapic_set_base from
+ * thinking that APIC satet has changed.
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ kvm_lapic_set_base(vcpu,
+ APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
+ static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_lapic_reset(vcpu);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!apic || !apic_enabled(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
return -1;
apic_update_ppr(apic);
highest_irr = apic_find_highest_irr(apic);
if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
return -1;
return highest_irr;
}
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
- u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
+ u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (!apic_hw_enabled(vcpu->arch.apic))
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
if (kvm_apic_local_deliver(apic, APIC_LVTT))
atomic_dec(&apic->lapic_timer.pending);
}
return vector;
}
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- apic->base_address = vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+ /* call kvm_apic_set_id() to put apic into apic_map */
+ kvm_apic_set_id(apic, kvm_apic_id(apic));
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
- timer = &apic->lapic_timer.timer;
+ timer = &vcpu->arch.apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
max_irr = 0;
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return 1;
/* if this is ICR write vector before command */
struct kvm_lapic *apic = vcpu->arch.apic;
u32 low, high = 0;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return 1;
if (apic_reg_read(apic, reg, 4, &low))
return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
addr);
}
+
+void kvm_lapic_init(void)
+{
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&apic_hw_disabled, HZ);
+ jump_label_rate_limit(&apic_sw_disabled, HZ);
+}
#define __KVM_X86_LAPIC_H
#include "iodev.h"
-#include "kvm_timer.h"
#include <linux/kvm_host.h>
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r);
+
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+void kvm_lapic_init(void);
+
+static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return *((u32 *) (apic->regs + reg_off));
+}
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+{
+ if (static_key_false(&kvm_no_apic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_deferred apic_hw_disabled;
+
+static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return MSR_IA32_APICBASE_ENABLE;
+}
+
+extern struct static_key_deferred apic_sw_disabled;
+
+static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_sw_disabled.key))
+ return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ return APIC_SPIV_APIC_ENABLED;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
#endif
return 0;
pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM does not hold the refcount of the page used by
+ * kvm mmu, before reclaiming the page, we should
+ * unmap it from mmu first.
+ */
+ WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
- struct kvm_lpage_info *linfo;
-
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
+ unsigned long idx;
- linfo = lpage_info_slot(gfn, slot, level);
- return &linfo->rmap_pde;
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}
/*
unsigned long *rmapp;
while (mask) {
- rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+ rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
/* clear the first set bit */
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
return 0;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
{
int j;
- int ret;
- int retval = 0;
+ int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- gfn_t gfn = memslot->base_gfn + gfn_offset;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+ for (j = PT_PAGE_TABLE_LEVEL;
+ j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+ unsigned long idx, idx_end;
+ unsigned long *rmapp;
- for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- struct kvm_lpage_info *linfo;
+ /*
+ * {idx(page_j) | page_j intersects with
+ * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+ */
+ idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+ idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
- linfo = lpage_info_slot(gfn, memslot,
- PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm, &linfo->rmap_pde, data);
- }
- trace_kvm_age_page(hva, memslot, ret);
- retval |= ret;
+ rmapp = __gfn_to_rmap(gfn_start, j, memslot);
+
+ for (; idx <= idx_end; ++idx)
+ ret |= handler(kvm, rmapp++, memslot, data);
}
}
- return retval;
+ return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask)
- return kvm_unmap_rmapp(kvm, rmapp, data);
+ if (!shadow_accessed_mask) {
+ young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+ goto out;
+ }
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
(unsigned long *)sptep);
}
}
-
+out:
+ /* @data has hva passed to kvm_age_hva(). */
+ trace_kvm_age_page(data, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
rmap_recycle(vcpu, sptep, gfn);
}
}
- kvm_release_pfn_clean(pfn);
+
+ if (!is_error_pfn(pfn))
+ kvm_release_pfn_clean(pfn);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
bool no_dirty_log)
{
struct kvm_memory_slot *slot;
- unsigned long hva;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot) {
- get_page(fault_page);
- return page_to_pfn(fault_page);
- }
+ if (!slot)
+ return KVM_PFN_ERR_FAULT;
- hva = gfn_to_hva_memslot(slot, gfn);
-
- return hva_to_pfn_atomic(vcpu->kvm, hva);
+ return gfn_to_pfn_memslot_atomic(slot, gfn);
}
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
- if (!sp) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
mmu_spte_set(iterator.sptep,
__pa(sp->spt)
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
- kvm_release_pfn_clean(pfn);
- if (is_hwpoison_pfn(pfn)) {
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte otherwise read access on readonly gfn also can
+ * caused mmio page fault and treat it as mmio access.
+ * Return 1 to tell kvm to emulate it.
+ */
+ if (pfn == KVM_PFN_ERR_RO_FAULT)
+ return 1;
+
+ if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
}
if (!async)
return false; /* *pfn has correct page already */
- put_page(pfn_to_page(*pfn));
-
if (!prefault && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
+static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
+{
+ unsigned mask;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
+ mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
int *nr_present)
{
return false;
}
+static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned access;
+
+ access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ access &= ~(gpte >> PT64_NX_SHIFT);
+
+ return access;
+}
+
+static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+{
+ unsigned index;
+
+ index = level - 1;
+ index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
+ return mmu->last_pte_bitmap & (1 << index);
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
}
}
+static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ unsigned bit, byte, pfec;
+ u8 map;
+ bool fault, x, w, u, wf, uf, ff, smep;
+
+ smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+ pfec = byte << 1;
+ map = 0;
+ wf = pfec & PFERR_WRITE_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ ff = pfec & PFERR_FETCH_MASK;
+ for (bit = 0; bit < 8; ++bit) {
+ x = bit & ACC_EXEC_MASK;
+ w = bit & ACC_WRITE_MASK;
+ u = bit & ACC_USER_MASK;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ x |= !mmu->nx;
+ /* Allow supervisor writes if !cr0.wp */
+ w |= !is_write_protection(vcpu) && !uf;
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ x &= !(smep && u && !uf);
+
+ fault = (ff && !x) || (uf && !u) || (wf && !w);
+ map |= fault << bit;
+ }
+ mmu->permissions[byte] = map;
+ }
+}
+
+static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ u8 map;
+ unsigned level, root_level = mmu->root_level;
+ const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
+
+ if (root_level == PT32E_ROOT_LEVEL)
+ --root_level;
+ /* PT_PAGE_TABLE_LEVEL always terminates */
+ map = 1 | (1 << ps_set_index);
+ for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
+ if (level <= PT_PDPE_LEVEL
+ && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
+ map |= 1 << (ps_set_index | (level - 1));
+ }
+ mmu->last_pte_bitmap = map;
+}
+
static int paging64_init_context_common(struct kvm_vcpu *vcpu,
struct kvm_mmu *context,
int level)
context->root_level = level;
reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context);
+ update_last_pte_bitmap(vcpu, context);
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
context->root_level = PT32_ROOT_LEVEL;
reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context);
+ update_last_pte_bitmap(vcpu, context);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
}
+ update_permission_bitmask(vcpu, context);
+ update_last_pte_bitmap(vcpu, context);
+
return 0;
}
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
}
+ update_permission_bitmask(vcpu, g_context);
+ update_last_pte_bitmap(vcpu, g_context);
+
return 0;
}
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
-static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
- bool write_fault, bool user_fault,
- unsigned long pte)
+/*
+ * Will a fault with a given page-fault error code (pfec) cause a permission
+ * fault with the given access (in ACC_* format)?
+ */
+static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
+ unsigned pfec)
{
- if (unlikely(write_fault && !is_writable_pte(pte)
- && (user_fault || is_write_protection(vcpu))))
- return false;
-
- if (unlikely(user_fault && !(pte & PT_USER_MASK)))
- return false;
-
- return true;
+ return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}
+
#endif
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (is_error_pfn(pfn))
return;
- }
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_memory_slot *slot;
unsigned long *rmapp;
u64 *sptep;
struct rmap_iterator iter;
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
- slot = gfn_to_memslot(kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+ rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
*/
struct guest_walker {
int level;
+ unsigned max_level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
return (ret != orig_pte);
}
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
- bool last)
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ struct guest_walker *walker,
+ int write_fault)
{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- if (last && !is_dirty_gpte(gpte))
- access &= ~ACC_WRITE_MASK;
-
-#if PTTYPE == 64
- if (vcpu->arch.mmu.nx)
- access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
- return access;
-}
-
-static bool FNAME(is_last_gpte)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- pt_element_t gpte)
-{
- if (walker->level == PT_PAGE_TABLE_LEVEL)
- return true;
-
- if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
- (PTTYPE == 64 || is_pse(vcpu)))
- return true;
+ unsigned level, index;
+ pt_element_t pte, orig_pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ int ret;
+
+ for (level = walker->max_level; level >= walker->level; --level) {
+ pte = orig_pte = walker->ptes[level - 1];
+ table_gfn = walker->table_gfn[level - 1];
+ ptep_user = walker->ptep_user[level - 1];
+ index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ if (!(pte & PT_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_ACCESSED_MASK;
+ }
+ if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_DIRTY_MASK;
+ }
+ if (pte == orig_pte)
+ continue;
- if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
- (mmu->root_level == PT64_ROOT_LEVEL))
- return true;
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+ if (ret)
+ return ret;
- return false;
+ mark_page_dirty(vcpu->kvm, table_gfn);
+ walker->ptes[level] = pte;
+ }
+ return 0;
}
/*
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
{
+ int ret;
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, uninitialized_var(pte_access);
+ unsigned index, pt_access, pte_access, accessed_dirty, shift;
gpa_t pte_gpa;
- bool eperm, last_gpte;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;
+ gpa_t real_gpa;
+ gfn_t gfn;
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
- eperm = false;
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);
--walker->level;
}
#endif
+ walker->max_level = walker->level;
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
(mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
- pt_access = ACC_ALL;
+ accessed_dirty = PT_ACCESSED_MASK;
+ pt_access = pte_access = ACC_ALL;
+ ++walker->level;
- for (;;) {
+ do {
gfn_t real_gfn;
unsigned long host_addr;
+ pt_access &= pte_access;
+ --walker->level;
+
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
goto error;
+ walker->ptep_user[walker->level - 1] = ptep_user;
trace_kvm_mmu_paging_element(pte, walker->level);
goto error;
}
- if (!check_write_user_access(vcpu, write_fault, user_fault,
- pte))
- eperm = true;
-
-#if PTTYPE == 64
- if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
- eperm = true;
-#endif
-
- last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
- if (last_gpte) {
- pte_access = pt_access &
- FNAME(gpte_access)(vcpu, pte, true);
- /* check if the kernel is fetching from user page */
- if (unlikely(pte_access & PT_USER_MASK) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- if (fetch_fault && !user_fault)
- eperm = true;
- }
-
- if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
- int ret;
- trace_kvm_mmu_set_accessed_bit(table_gfn, index,
- sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
- pte, pte|PT_ACCESSED_MASK);
- if (unlikely(ret < 0))
- goto error;
- else if (ret)
- goto retry_walk;
-
- mark_page_dirty(vcpu->kvm, table_gfn);
- pte |= PT_ACCESSED_MASK;
- }
+ accessed_dirty &= pte;
+ pte_access = pt_access & gpte_access(vcpu, pte);
walker->ptes[walker->level - 1] = pte;
+ } while (!is_last_gpte(mmu, walker->level, pte));
- if (last_gpte) {
- int lvl = walker->level;
- gpa_t real_gpa;
- gfn_t gfn;
- u32 ac;
-
- gfn = gpte_to_gfn_lvl(pte, lvl);
- gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
-
- if (PTTYPE == 32 &&
- walker->level == PT_DIRECTORY_LEVEL &&
- is_cpuid_PSE36())
- gfn += pse36_gfn_delta(pte);
-
- ac = write_fault | fetch_fault | user_fault;
+ if (unlikely(permission_fault(mmu, pte_access, access))) {
+ errcode |= PFERR_PRESENT_MASK;
+ goto error;
+ }
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
- ac);
- if (real_gpa == UNMAPPED_GVA)
- return 0;
+ gfn = gpte_to_gfn_lvl(pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
- walker->gfn = real_gpa >> PAGE_SHIFT;
+ if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
- break;
- }
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
- pt_access &= FNAME(gpte_access)(vcpu, pte, false);
- --walker->level;
- }
+ walker->gfn = real_gpa >> PAGE_SHIFT;
- if (unlikely(eperm)) {
- errcode |= PFERR_PRESENT_MASK;
- goto error;
- }
+ if (!write_fault)
+ protect_clean_gpte(&pte_access, pte);
- if (write_fault && unlikely(!is_dirty_gpte(pte))) {
- int ret;
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
+ * place right.
+ *
+ * On a read fault, do nothing.
+ */
+ shift = write_fault >> ilog2(PFERR_WRITE_MASK);
+ shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
+ accessed_dirty &= pte >> shift;
- trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
- pte, pte|PT_DIRTY_MASK);
+ if (unlikely(!accessed_dirty)) {
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
if (unlikely(ret < 0))
goto error;
else if (ret)
goto retry_walk;
-
- mark_page_dirty(vcpu->kvm, table_gfn);
- pte |= PT_DIRTY_MASK;
- walker->ptes[walker->level - 1] = pte;
}
walker->pt_access = pt_access;
return;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
+ pte_access = sp->role.access & gpte_access(vcpu, gpte);
+ protect_clean_gpte(&pte_access, gpte);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (mmu_invalid_pfn(pfn))
return;
- }
/*
* we call mmu_set_spte() with host_writable = true because that
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
- true);
+ pte_access = sp->role.access & gpte_access(vcpu, gpte);
+ protect_clean_gpte(&pte_access, gpte);
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (mmu_invalid_pfn(pfn))
break;
- }
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
NULL, PT_PAGE_TABLE_LEVEL, gfn,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+ pte_access &= gpte_access(vcpu, gpte);
+ protect_clean_gpte(&pte_access, gpte);
if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
continue;
/*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
*
#define MSR_INVALID 0xffffffffU
-static struct svm_direct_access_msrs {
+static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
int r;
};
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
- save->dr7 = 0x400;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_db_intercept(struct kvm_vcpu *vcpu)
+static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
vcpu->guest_debug = 0;
}
-static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
- else
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
-
- mark_dirty(svm->vmcb, VMCB_DR);
-
- update_db_intercept(vcpu);
-}
-
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
if (sd->next_asid > sd->max_asid) {
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
svm->vmcb->save.rflags &=
~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(&svm->vcpu);
+ update_db_bp_intercept(&svm->vcpu);
}
if (svm->vcpu.guest_debug &
if (svm->nested.intercept & 1ULL) {
/*
* The #vmexit can't be emulated here directly because this
- * code path runs with irqs and preemtion disabled. A
+ * code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for
* the #vmexit here.
*/
return kmap(page);
error:
- kvm_release_page_clean(page);
kvm_inject_gp(&svm->vcpu, 0);
return NULL;
{
/*
* This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is omptimized in that it only merges the parts where
+ * nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits
*/
int i;
return 1;
}
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
[SVM_EXIT_READ_CR4] = cr_interception,
*/
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(vcpu);
+ update_db_bp_intercept(vcpu);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
svm_complete_interrupts(svm);
}
-#ifdef CONFIG_X86_64
-#define R "r"
-#else
-#define R "e"
-#endif
-
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
local_irq_enable();
asm volatile (
- "push %%"R"bp; \n\t"
- "mov %c[rbx](%[svm]), %%"R"bx \n\t"
- "mov %c[rcx](%[svm]), %%"R"cx \n\t"
- "mov %c[rdx](%[svm]), %%"R"dx \n\t"
- "mov %c[rsi](%[svm]), %%"R"si \n\t"
- "mov %c[rdi](%[svm]), %%"R"di \n\t"
- "mov %c[rbp](%[svm]), %%"R"bp \n\t"
+ "push %%" _ASM_BP "; \n\t"
+ "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
+ "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
+ "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%[svm]), %%r8 \n\t"
"mov %c[r9](%[svm]), %%r9 \n\t"
#endif
/* Enter guest mode */
- "push %%"R"ax \n\t"
- "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
+ "push %%" _ASM_AX " \n\t"
+ "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
__ex(SVM_VMLOAD) "\n\t"
__ex(SVM_VMRUN) "\n\t"
__ex(SVM_VMSAVE) "\n\t"
- "pop %%"R"ax \n\t"
+ "pop %%" _ASM_AX " \n\t"
/* Save guest registers, load host registers */
- "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
- "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
- "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
- "mov %%"R"si, %c[rsi](%[svm]) \n\t"
- "mov %%"R"di, %c[rdi](%[svm]) \n\t"
- "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
+ "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
+ "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%[svm]) \n\t"
"mov %%r9, %c[r9](%[svm]) \n\t"
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
#endif
- "pop %%"R"bp"
+ "pop %%" _ASM_BP
:
: [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
: "cc", "memory"
- , R"bx", R"cx", R"dx", R"si", R"di"
#ifdef CONFIG_X86_64
+ , "rbx", "rcx", "rdx", "rsi", "rdi"
, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+ , "ebx", "ecx", "edx", "esi", "edi"
#endif
);
mark_all_clean(svm->vmcb);
}
-#undef R
-
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
#define POST_MEM(exit) { .exit_code = (exit), \
.stage = X86_ICPT_POST_MEMACCESS, }
-static struct __x86_intercept {
+static const struct __x86_intercept {
u32 exit_code;
enum x86_intercept_stage stage;
} x86_intercept_map[] = {
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .set_guest_debug = svm_guest_debug,
+ .update_db_bp_intercept = update_db_bp_intercept,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
+++ /dev/null
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <linux/atomic.h>
-#include "kvm_timer.h"
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_vcpu *vcpu = ktimer->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
-
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
+extern const ulong vmx_return;
+
#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1
struct {
int vm86_active;
ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
struct kvm_save_segment {
u16 selector;
unsigned long base;
u32 limit;
u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
- struct {
- u32 bitmask; /* 4 bits per segment (1 bit per field) */
- struct kvm_save_segment seg[8];
+ } seg[8];
} segment_cache;
int vpid;
bool emulation_required;
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
[number##_HIGH] = VMCS12_OFFSET(name)+4
-static unsigned short vmcs_field_to_offset_table[] = {
+static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
return NULL;
- }
+
return page;
}
.ar_bytes = GUEST_##seg##_AR_BYTES, \
}
-static struct kvm_vmx_segment_field {
+static const struct kvm_vmx_segment_field {
unsigned selector;
unsigned base;
unsigned limit;
guest_efer = vmx->vcpu.arch.efer;
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
- CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
}
}
-static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
- else
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-
- update_exception_bitmap(vcpu);
-}
-
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
free_kvm_area();
}
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment tmp = *save;
- if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
- vmcs_write16(sf->selector, save->selector);
- vmcs_writel(sf->base, save->base);
- vmcs_write32(sf->limit, save->limit);
- vmcs_write32(sf->ar_bytes, save->ar);
- } else {
- u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
- << AR_DPL_SHIFT;
- vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+ if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
+ tmp.base = vmcs_readl(sf->base);
+ tmp.selector = vmcs_read16(sf->selector);
+ tmp.s = 1;
}
+ vmx_set_segment(vcpu, &tmp, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
vmx_segment_cache_clear(vmx);
- vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
- vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
+ vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
if (emulate_invalid_guest_state)
return;
- fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
vmx_segment_cache_clear(vmx);
return kvm->arch.tss_addr;
}
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- save->selector = vmcs_read16(sf->selector);
- save->base = vmcs_readl(sf->base);
- save->limit = vmcs_read32(sf->limit);
- save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
if (enable_unrestricted_guest)
return;
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
+
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
vmx_segment_cache_clear(vmx);
- vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
- vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
- vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
- vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_save_segment *save;
u32 ar;
if (vmx->rmode.vm86_active
&& (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
|| seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
- || seg == VCPU_SREG_GS)
- && !emulate_invalid_guest_state) {
- switch (seg) {
- case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
- case VCPU_SREG_ES: save = &vmx->rmode.es; break;
- case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
- case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
- case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
- default: BUG();
- }
- var->selector = save->selector;
- var->base = save->base;
- var->limit = save->limit;
- ar = save->ar;
+ || seg == VCPU_SREG_GS)) {
+ *var = vmx->rmode.segs[seg];
if (seg == VCPU_SREG_TR
|| var->selector == vmx_read_guest_seg_selector(vmx, seg))
- goto use_saved_rmode_seg;
+ return;
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ return;
}
var->base = vmx_read_guest_seg_base(vmx, seg);
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
-use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
var->type = ar & 15;
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
vmx_segment_cache_clear(vmx);
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
vmcs_write16(sf->selector, var->selector);
- vmx->rmode.tr.selector = var->selector;
- vmx->rmode.tr.base = var->base;
- vmx->rmode.tr.limit = var->limit;
- vmx->rmode.tr.ar = vmx_segment_access_rights(var);
+ vmx->rmode.segs[VCPU_SREG_TR] = *var;
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
if (vmx->rmode.vm86_active && var->s) {
+ vmx->rmode.segs[seg] = *var;
/*
* Hack real-mode segments into vm86 compatibility.
*/
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
+ * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
vmcs_readl(GUEST_CS_BASE) >> 4);
break;
case VCPU_SREG_ES:
- fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
- break;
case VCPU_SREG_DS:
- fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
- break;
case VCPU_SREG_GS:
- fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
- break;
case VCPU_SREG_FS:
- fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
break;
case VCPU_SREG_SS:
vmcs_write16(GUEST_SS_SELECTOR,
if (var.base != (var.selector << 4))
return false;
- if (var.limit != 0xffff)
+ if (var.limit < 0xffff)
return false;
- if (ar != 0xf3)
+ if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
return false;
return true;
static void seg_setup(int seg)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
unsigned int ar;
vmcs_write16(sf->selector, 0);
native_store_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
- asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
- vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+ vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
kvm_rip_write(vcpu, 0);
kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
- vmcs_writel(GUEST_DR7, 0x400);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
hypercall[2] = 0xc1;
}
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
*/
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
msrs[i].host);
}
-#ifdef CONFIG_X86_64
-#define R "r"
-#define Q "q"
-#else
-#define R "e"
-#define Q "l"
-#endif
-
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long debugctlmsr;
if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
vmx_set_interrupt_shadow(vcpu, 0);
atomic_switch_perf_msrs(vmx);
+ debugctlmsr = get_debugctlmsr();
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
- "push %%"R"dx; push %%"R"bp;"
- "push %%"R"cx \n\t" /* placeholder for guest rcx */
- "push %%"R"cx \n\t"
- "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+ "push %%" _ASM_DX "; push %%" _ASM_BP ";"
+ "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
+ "push %%" _ASM_CX " \n\t"
+ "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
- "mov %%"R"sp, %c[host_rsp](%0) \n\t"
+ "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%cr2, %%"R"dx \n\t"
- "cmp %%"R"ax, %%"R"dx \n\t"
+ "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
+ "mov %%cr2, %%" _ASM_DX " \n\t"
+ "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
"je 2f \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
+ "mov %%" _ASM_AX", %%cr2 \n\t"
"2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
- "mov %c[rax](%0), %%"R"ax \n\t"
- "mov %c[rbx](%0), %%"R"bx \n\t"
- "mov %c[rdx](%0), %%"R"dx \n\t"
- "mov %c[rsi](%0), %%"R"si \n\t"
- "mov %c[rdi](%0), %%"R"di \n\t"
- "mov %c[rbp](%0), %%"R"bp \n\t"
+ "mov %c[rax](%0), %%" _ASM_AX " \n\t"
+ "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
+ "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
#endif
- "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+ "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
/* Enter guest mode */
- "jne .Llaunched \n\t"
+ "jne 1f \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
- "jmp .Lkvm_vmx_return \n\t"
- ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
- ".Lkvm_vmx_return: "
+ "jmp 2f \n\t"
+ "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ "2: "
/* Save guest registers, load host registers, keep flags */
- "mov %0, %c[wordsize](%%"R"sp) \n\t"
+ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
"pop %0 \n\t"
- "mov %%"R"ax, %c[rax](%0) \n\t"
- "mov %%"R"bx, %c[rbx](%0) \n\t"
- "pop"Q" %c[rcx](%0) \n\t"
- "mov %%"R"dx, %c[rdx](%0) \n\t"
- "mov %%"R"si, %c[rsi](%0) \n\t"
- "mov %%"R"di, %c[rdi](%0) \n\t"
- "mov %%"R"bp, %c[rbp](%0) \n\t"
+ "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
+ "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
+ __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
- "mov %%cr2, %%"R"ax \n\t"
- "mov %%"R"ax, %c[cr2](%0) \n\t"
+ "mov %%cr2, %%" _ASM_AX " \n\t"
+ "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
- "pop %%"R"bp; pop %%"R"dx \n\t"
+ "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
+ ".pushsection .rodata \n\t"
+ ".global vmx_return \n\t"
+ "vmx_return: " _ASM_PTR " 2b \n\t"
+ ".popsection"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
- , R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
+ , "rax", "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+ , "eax", "ebx", "edi", "esi"
#endif
);
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (debugctlmsr)
+ update_debugctlmsr(debugctlmsr);
+
#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
vmx_complete_interrupts(vmx);
}
-#undef R
-#undef Q
-
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .set_guest_debug = set_guest_debug,
+ .update_db_bp_intercept = update_exception_bitmap,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->arch.apic_base;
- else
- return vcpu->arch.apic_base;
+ return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
/* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->arch.apic_base = data;
+ kvm_lapic_set_base(vcpu, data);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+ unsigned long dr7;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ dr7 = vcpu->arch.guest_debug_dr7;
+ else
+ dr7 = vcpu->arch.dr7;
+ kvm_x86_ops->set_dr7(vcpu, dr7);
+ vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+}
+
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
- }
+ kvm_update_dr7(vcpu);
break;
}
static unsigned num_msrs_to_save;
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs[] = {
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
- * exact software computaion in compute_guest_tsc()
+ * exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
u64 tsc_timestamp;
+ u8 pvclock_flags;
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
+
+ pvclock_flags = 0;
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ vcpu->hv_clock.flags = pvclock_flags;
/*
* The interface expects us to write an even number signaling that the
{
gpa_t gpa = data & ~0x3f;
- /* Bits 2:5 are resrved, Should be zero */
+ /* Bits 2:5 are reserved, Should be zero */
if (data & 0x3c)
return 1;
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- if (is_error_page(vcpu->arch.time_page)) {
- kvm_release_page_clean(vcpu->arch.time_page);
+ if (is_error_page(vcpu->arch.time_page))
vcpu->arch.time_page = NULL;
- }
+
break;
}
case MSR_KVM_ASYNC_PF_EN:
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
+ * AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
+ kvm_apic_post_state_restore(vcpu, s);
update_cr8_intercept(vcpu);
return 0;
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
- if (irq->irq < 0 || irq->irq >= 256)
+ if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
if (!vcpu->arch.time_page)
return -EINVAL;
- src->flags |= PVCLOCK_GUEST_STOPPED;
- mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
if (!kvm->arch.vpit)
return -ENXIO;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
return r;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
create_pit_unlock:
mutex_unlock(&kvm->slots_lock);
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
- if (vcpu_match_mmio_gva(vcpu, gva) &&
- check_write_user_access(vcpu, write, access,
- vcpu->arch.access)) {
+ if (vcpu_match_mmio_gva(vcpu, gva)
+ && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
- if (write)
- access |= PFERR_WRITE_MASK;
-
*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
return X86EMUL_CONTINUE;
}
-static struct read_write_emulator_ops read_emultor = {
+static const struct read_write_emulator_ops read_emultor = {
.read_write_prepare = read_prepare,
.read_write_emulate = read_emulate,
.read_write_mmio = vcpu_mmio_read,
.read_write_exit_mmio = read_exit_mmio,
};
-static struct read_write_emulator_ops write_emultor = {
+static const struct read_write_emulator_ops write_emultor = {
.read_write_emulate = write_emulate,
.read_write_mmio = write_mmio,
.read_write_exit_mmio = write_exit_mmio,
unsigned int bytes,
struct x86_exception *exception,
struct kvm_vcpu *vcpu,
- struct read_write_emulator_ops *ops)
+ const struct read_write_emulator_ops *ops)
{
gpa_t gpa;
int handled, ret;
int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
void *val, unsigned int bytes,
struct x86_exception *exception,
- struct read_write_emulator_ops *ops)
+ const struct read_write_emulator_ops *ops)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
goto emul_write;
- }
kaddr = kmap_atomic(page);
kaddr += offset_in_page(gpa);
kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
}
-static struct x86_emulate_ops emulate_ops = {
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+ return kvm_register_read(emul_to_vcpu(ctxt), reg);
+}
+
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+ kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+}
+
+static const struct x86_emulate_ops emulate_ops = {
+ .read_gpr = emulator_read_gpr,
+ .write_gpr = emulator_write_gpr,
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.get_cpuid = emulator_get_cpuid,
};
-static void cache_all_regs(struct kvm_vcpu *vcpu)
-{
- kvm_register_read(vcpu, VCPU_REGS_RAX);
- kvm_register_read(vcpu, VCPU_REGS_RSP);
- kvm_register_read(vcpu, VCPU_REGS_RIP);
- vcpu->arch.regs_dirty = ~0;
-}
-
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
- const unsigned long *regs)
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
memset(&ctxt->twobyte, 0,
- (void *)&ctxt->regs - (void *)&ctxt->twobyte);
- memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+ (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
ctxt->fetch.start = 0;
ctxt->fetch.end = 0;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- /*
- * TODO: fix emulate.c to use guest_read/write_register
- * instead of direct ->regs accesses, can save hundred cycles
- * on Intel for instructions that don't read/change RSP, for
- * for example.
- */
- cache_all_regs(vcpu);
-
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
X86EMUL_MODE_PROT16;
ctxt->guest_mode = is_guest_mode(vcpu);
- init_decode_cache(ctxt, vcpu->arch.regs);
+ init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
return EMULATE_FAIL;
ctxt->eip = ctxt->_eip;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
+ pfn_t pfn;
if (tdp_enabled)
return false;
/*
* if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-entetr the
+ * and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
if (gpa == UNMAPPED_GVA)
return true; /* let cpu generate fault */
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * readonly host memory, otherwise it will goto a infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+ if (!is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
return true;
+ }
return false;
}
return true;
}
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
changes registers values during IO operation */
if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
- memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
+ emulator_invalidate_register_cache(ctxt);
}
restart:
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
- else
+ else {
writeback = false;
+ vcpu->arch.complete_userspace_io = complete_emulated_pio;
+ }
r = EMULATE_DO_MMIO;
} else if (vcpu->mmio_needed) {
if (!vcpu->mmio_is_write)
writeback = false;
r = EMULATE_DO_MMIO;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
else
toggle_interruptibility(vcpu, ctxt->interruptibility);
kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
} else
if (cpu_has_xsave)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_lapic_init();
return 0;
out:
return r;
}
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+ int r;
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+ BUG_ON(!vcpu->arch.pio.count);
+
+ return complete_emulated_io(vcpu);
+}
+
/*
* Implements the following, as a state machine:
*
* copy data
* exit
*/
-static int complete_mmio(struct kvm_vcpu *vcpu)
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
struct kvm_mmio_fragment *frag;
- int r;
- if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
- return 1;
+ BUG_ON(!vcpu->mmio_needed);
- if (vcpu->mmio_needed) {
- /* Complete previous fragment */
- frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
- if (!vcpu->mmio_is_write)
- memcpy(frag->data, run->mmio.data, frag->len);
- if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
- vcpu->mmio_needed = 0;
- if (vcpu->mmio_is_write)
- return 1;
- vcpu->mmio_read_completed = 1;
- goto done;
- }
- /* Initiate next fragment */
- ++frag;
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = frag->gpa;
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, frag->len);
+ if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
if (vcpu->mmio_is_write)
- memcpy(run->mmio.data, frag->data, frag->len);
- run->mmio.len = frag->len;
- run->mmio.is_write = vcpu->mmio_is_write;
- return 0;
-
- }
-done:
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE)
- return 0;
- return 1;
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ return complete_emulated_io(vcpu);
+ }
+ /* Initiate next fragment */
+ ++frag;
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
+ if (vcpu->mmio_is_write)
+ memcpy(run->mmio.data, frag->data, frag->len);
+ run->mmio.len = frag->len;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ return 0;
}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
}
}
- r = complete_mmio(vcpu);
- if (r <= 0)
- goto out;
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else
+ WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
r = __vcpu_run(vcpu);
/*
* We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Usrapace shouldn't do
+ * back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+ emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
}
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
if (ret)
return EMULATE_FAIL;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
(const unsigned long *)sregs->interrupt_bitmap, max_bits);
if (pending_vec < max_bits) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
for (i = 0; i < KVM_NR_DB_REGS; ++i)
vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
- vcpu->arch.switch_db_regs =
- (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+ vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
} else {
for (i = 0; i < KVM_NR_DB_REGS; i++)
vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
+ kvm_update_dr7(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->set_guest_debug(vcpu, dbg);
+ kvm_x86_ops->update_db_bp_intercept(vcpu);
r = 0;
int r;
vcpu->arch.mtrr_state.have_fixed = 1;
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
r = kvm_arch_vcpu_reset(vcpu);
if (r == 0)
r = kvm_mmu_setup(vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ int r;
vcpu->arch.apf.msr_val = 0;
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ BUG_ON(r);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
- vcpu->arch.switch_db_regs = 0;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0;
* as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup).
*
- * Platforms with unnreliable TSCs don't have to deal with this, they
+ * Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization.
return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
}
+struct static_key kvm_no_apic_vcpu __read_mostly;
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
- }
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL);
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
+ if (!irqchip_in_kernel(vcpu->kvm))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+ set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ &kvm->arch.irq_sources_bitmap);
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
return 0;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
+ int r;
+ r = vcpu_load(vcpu);
+ BUG_ON(r);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
}
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
}
void kvm_arch_free_memslot(struct kvm_memory_slot *free,
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
- kvm_kvfree(free->arch.lpage_info[i]);
- free->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+ kvm_kvfree(free->arch.rmap[i]);
+ free->arch.rmap[i] = NULL;
+ }
+ if (i == 0)
+ continue;
+
+ if (!dont || free->arch.lpage_info[i - 1] !=
+ dont->arch.lpage_info[i - 1]) {
+ kvm_kvfree(free->arch.lpage_info[i - 1]);
+ free->arch.lpage_info[i - 1] = NULL;
}
}
}
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
unsigned long ugfn;
int lpages;
- int level = i + 2;
+ int level = i + 1;
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
- slot->arch.lpage_info[i] =
- kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
- if (!slot->arch.lpage_info[i])
+ slot->arch.rmap[i] =
+ kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+ if (!slot->arch.rmap[i])
+ goto out_free;
+ if (i == 0)
+ continue;
+
+ slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+ sizeof(*slot->arch.lpage_info[i - 1]));
+ if (!slot->arch.lpage_info[i - 1])
goto out_free;
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][0].write_count = 1;
+ slot->arch.lpage_info[i - 1][0].write_count = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+ slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
unsigned long j;
for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i][j].write_count = 1;
+ slot->arch.lpage_info[i - 1][j].write_count = 1;
}
}
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- kvm_kvfree(slot->arch.lpage_info[i]);
- slot->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvm_kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ if (i == 0)
+ continue;
+
+ kvm_kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
}
return -ENOMEM;
}
map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
+ *x86 needs to handle !user_alloc case.
*/
if (!user_alloc) {
- if (npages && !old.rmap) {
+ if (npages && !old.npages) {
unsigned long userspace_addr;
userspace_addr = vm_mmap(NULL, 0,
int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
- if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ if (!user_alloc && !old.user_alloc && old.npages && !npages) {
int ret;
ret = vm_munmap(old.userspace_addr,
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
+ /*
+ * If memory slot is created, or moved, we need to clear all
+ * mmio sptes.
+ */
+ if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+ kvm_mmu_zap_all(kvm);
+ kvm_reload_remote_mmus(kvm);
+ }
}
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
kvm_reload_remote_mmus(kvm);
}
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_arch_flush_shadow_all(kvm);
+}
+
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
extern u64 host_xcr0;
+extern struct static_key kvm_no_apic_vcpu;
#endif
__u64 userspace_addr; /* start of the userspace allocated memory */
};
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES 1UL
-#define KVM_MEMSLOT_INVALID (1UL << 1)
+/*
+ * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
+ * other bits are reserved for kvm internal use which are defined in
+ * include/linux/kvm_host.h.
+ */
+#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
+#define KVM_MEM_READONLY (1UL << 1)
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
#define KVM_CAP_PPC_GET_SMMU_INFO 78
#define KVM_CAP_S390_COW 79
#define KVM_CAP_PPC_ALLOC_HTAB 80
+#ifdef __KVM_HAVE_READONLY_MEM
+#define KVM_CAP_READONLY_MEM 81
+#endif
+#define KVM_CAP_IRQFD_RESAMPLE 82
#ifdef KVM_CAP_IRQ_ROUTING
#endif
#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+/*
+ * Available with KVM_CAP_IRQFD_RESAMPLE
+ *
+ * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
+ * the irqfd to operate in resampling mode for level triggered interrupt
+ * emlation. See Documentation/virtual/kvm/api.txt.
+ */
+#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
struct kvm_irqfd {
__u32 fd;
__u32 gsi;
__u32 flags;
- __u8 pad[20];
+ __u32 resamplefd;
+ __u8 pad[16];
};
struct kvm_clock_data {
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/ratelimit.h>
+#include <linux/err.h>
#include <asm/signal.h>
#include <linux/kvm.h>
#define KVM_MMIO_SIZE 8
#endif
+/*
+ * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
+ * in kvm, other bits are visible for userspace which are defined in
+ * include/linux/kvm_h.
+ */
+#define KVM_MEMSLOT_INVALID (1UL << 16)
+
/*
* If we support unaligned MMIO, at most one fragment will be split into two:
*/
#define KVM_MAX_MMIO_FRAGMENTS \
(KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
+/*
+ * For the normal pfn, the highest 12 bits should be zero,
+ * so we can mask these bits to indicate the error.
+ */
+#define KVM_PFN_ERR_MASK (0xfffULL << 52)
+
+#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
+#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
+#define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2)
+#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3)
+
+static inline bool is_error_pfn(pfn_t pfn)
+{
+ return !!(pfn & KVM_PFN_ERR_MASK);
+}
+
+static inline bool is_noslot_pfn(pfn_t pfn)
+{
+ return pfn == KVM_PFN_ERR_BAD;
+}
+
+static inline bool is_invalid_pfn(pfn_t pfn)
+{
+ return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
+}
+
+#define KVM_HVA_ERR_BAD (PAGE_OFFSET)
+#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+ return addr >= PAGE_OFFSET;
+}
+
+#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT))
+
+static inline bool is_error_page(struct page *page)
+{
+ return IS_ERR(page);
+}
+
/*
* vcpu->requests bit members
*/
#define KVM_REQ_PMU 16
#define KVM_REQ_PMI 17
-#define KVM_USERSPACE_IRQ_SOURCE_ID 0
+#define KVM_USERSPACE_IRQ_SOURCE_ID 0
+#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
struct kvm;
struct kvm_vcpu;
} async_pf;
#endif
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ /*
+ * Cpu relax intercept or pause loop exit optimization
+ * in_spin_loop: set when a vcpu does a pause loop exit
+ * or cpu relax intercepted.
+ * dy_eligible: indicates whether vcpu is eligible for directed yield.
+ */
+ struct {
+ bool in_spin_loop;
+ bool dy_eligible;
+ } spin_loop;
+#endif
struct kvm_vcpu_arch arch;
};
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
- unsigned long *rmap;
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
struct {
spinlock_t lock;
struct list_head items;
+ struct list_head resampler_list;
+ struct mutex resampler_lock;
} irqfds;
struct list_head ioeventfds;
#endif
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-void vcpu_load(struct kvm_vcpu *vcpu);
+int __must_check vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
return slot;
}
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-
-extern struct page *bad_page;
-extern struct page *fault_page;
-
-extern pfn_t bad_pfn;
-extern pfn_t fault_pfn;
-
-int is_error_page(struct page *page);
-int is_error_pfn(pfn_t pfn);
-int is_hwpoison_pfn(pfn_t pfn);
-int is_fault_pfn(pfn_t pfn);
-int is_noslot_pfn(pfn_t pfn);
-int is_invalid_pfn(pfn_t pfn);
-int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
int user_alloc);
bool kvm_largepages_enabled(void);
void kvm_disable_largepages(void);
-void kvm_arch_flush_shadow(struct kvm *kvm);
+/* flush all memory translations */
+void kvm_arch_flush_shadow_all(struct kvm *kvm);
+/* flush memory translations pointing to 'slot' */
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_release_pfn_dirty(pfn_t);
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+
+void kvm_release_pfn_dirty(pfn_t pfn);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
void kvm_set_pfn_accessed(pfn_t pfn);
struct
kvm_userspace_memory_region *mem,
int user_alloc);
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_mmio_pfn(pfn_t pfn);
struct kvm_irq_ack_notifier {
struct hlist_node link;
return search_memslots(slots, gfn);
}
+static inline unsigned long
+__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_memslot(kvm, gfn)->id;
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
-static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
- gfn_t gfn)
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
{
- return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+ gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+ return slot->base_gfn + gfn_offset;
}
static inline gpa_t gfn_to_gpa(gfn_t gfn)
}
}
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+ vcpu->spin_loop.in_spin_loop = val;
+}
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+ vcpu->spin_loop.dy_eligible = val;
+}
+
+#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ return true;
+}
+
+#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
+EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
config HAVE_KVM_MSI
bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+ bool
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
list_del(&work->queue);
vcpu->async_pf.queued--;
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
if (!work)
return -ENOMEM;
- work->page = bad_page;
- get_page(bad_page);
+ work->page = KVM_ERR_PTR_BAD_PAGE;
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
* --------------------------------------------------------------------
*/
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts. The interrupt is asserted on eventfd
+ * trigger. On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd. All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user. We can also therefore share the same irq source ID.
+ */
+struct _irqfd_resampler {
+ struct kvm *kvm;
+ /*
+ * List of resampling struct _irqfd objects sharing this gsi.
+ * RCU list modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head list;
+ struct kvm_irq_ack_notifier notifier;
+ /*
+ * Entry in list of kvm->irqfd.resampler_list. Use for sharing
+ * resamplers among irqfds on the same gsi.
+ * Accessed and modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head link;
+};
+
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
+ /* The resampler used by this irqfd (resampler-only) */
+ struct _irqfd_resampler *resampler;
+ /* Eventfd notified on resample (resampler-only) */
+ struct eventfd_ctx *resamplefd;
+ /* Entry in list of irqfds for a resampler (resampler-only) */
+ struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ if (!irqfd->resampler) {
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ } else
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ irqfd->gsi, 1);
+}
+
+/*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI. We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ */
+static void
+irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+{
+ struct _irqfd_resampler *resampler;
+ struct _irqfd *irqfd;
+
+ resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+ kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+ eventfd_signal(irqfd->resamplefd, 1);
+
+ rcu_read_unlock();
+}
+
+static void
+irqfd_resampler_shutdown(struct _irqfd *irqfd)
+{
+ struct _irqfd_resampler *resampler = irqfd->resampler;
+ struct kvm *kvm = resampler->kvm;
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_del_rcu(&irqfd->resampler_link);
+ synchronize_rcu();
+
+ if (list_empty(&resampler->list)) {
+ list_del(&resampler->link);
+ kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+ kfree(resampler);
+ }
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
*/
flush_work(&irqfd->inject);
+ if (irqfd->resampler) {
+ irqfd_resampler_shutdown(irqfd);
+ eventfd_ctx_put(irqfd->resamplefd);
+ }
+
/*
* It is now safe to release the object's resources
*/
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
- struct eventfd_ctx *eventfd = NULL;
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;
irqfd->eventfd = eventfd;
+ if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+ struct _irqfd_resampler *resampler;
+
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->resamplefd = resamplefd;
+ INIT_LIST_HEAD(&irqfd->resampler_link);
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_for_each_entry(resampler,
+ &kvm->irqfds.resampler_list, list) {
+ if (resampler->notifier.gsi == irqfd->gsi) {
+ irqfd->resampler = resampler;
+ break;
+ }
+ }
+
+ if (!irqfd->resampler) {
+ resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+ if (!resampler) {
+ ret = -ENOMEM;
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ goto fail;
+ }
+
+ resampler->kvm = kvm;
+ INIT_LIST_HEAD(&resampler->list);
+ resampler->notifier.gsi = irqfd->gsi;
+ resampler->notifier.irq_acked = irqfd_resampler_ack;
+ INIT_LIST_HEAD(&resampler->link);
+
+ list_add(&resampler->link, &kvm->irqfds.resampler_list);
+ kvm_register_irq_ack_notifier(kvm,
+ &resampler->notifier);
+ irqfd->resampler = resampler;
+ }
+
+ list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+ synchronize_rcu();
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ }
+
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
return 0;
fail:
+ if (irqfd->resampler)
+ irqfd_resampler_shutdown(irqfd);
+
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
{
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+ mutex_init(&kvm->irqfds.resampler_lock);
INIT_LIST_HEAD(&kvm->ioeventfds);
}
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
- if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+ if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
u32 old_irr;
u32 mask = 1 << irq;
union kvm_ioapic_redirect_entry entry;
- int ret = 1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
old_irr = ioapic->irr;
- if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
- int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
- irq_source_id, level);
- entry = ioapic->redirtbl[irq];
- irq_level ^= entry.fields.polarity;
- if (!irq_level)
- ioapic->irr &= ~mask;
- else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq);
- else
- ret = 0; /* report coalesced interrupt */
- }
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ entry = ioapic->redirtbl[irq];
+ irq_level ^= entry.fields.polarity;
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ } else {
+ int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ioapic->irr |= mask;
+ if ((edge && old_irr != ioapic->irr) ||
+ (!edge && !entry.fields.remote_irr))
+ ret = ioapic_service(ioapic, irq);
+ else
+ ret = 0; /* report coalesced interrupt */
}
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
return pfn;
while (gfn < end_gfn)
- gfn_to_pfn_memslot(kvm, slot, gfn++);
+ gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
- pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+ pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) {
gfn += 1;
continue;
/* Get physical address */
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+ if (!phys) {
+ gfn++;
+ continue;
+ }
+
pfn = phys >> PAGE_SHIFT;
/* Unmap address from IO address space */
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq))
+ kvm_is_dm_lowest_prio(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+ return r;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
}
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
set_bit(irq_source_id, bitmap);
unlock:
mutex_unlock(&kvm->irq_lock);
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
{
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
static bool largepages_enabled = true;
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-struct page *fault_page;
-pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
int reserved;
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
-void vcpu_load(struct kvm_vcpu *vcpu)
+int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
- mutex_lock(&vcpu->mutex);
+ if (mutex_lock_killable(&vcpu->mutex))
+ return -EINTR;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
+ return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
}
vcpu->run = page_address(page);
+ kvm_vcpu_set_in_spin_loop(vcpu, false);
+ kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- for (; start < end; start += PAGE_SIZE)
- need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
int idx;
idx = srcu_read_lock(&kvm->srcu);
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
- if (!dont || free->rmap != dont->rmap)
- vfree(free->rmap);
-
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
kvm_arch_free_memslot(free, dont);
free->npages = 0;
- free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_free_physmem(kvm);
slots->generation++;
}
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+ u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+ valid_flags |= KVM_MEM_READONLY;
+#endif
+
+ if (mem->flags & ~valid_flags)
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* Allocate some memory and give it an address in the guest physical address
* space.
struct kvm_memory_slot old, new;
struct kvm_memslots *slots, *old_memslots;
+ r = check_memory_region_flags(mem);
+ if (r)
+ goto out;
+
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
if (npages && !old.npages) {
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
-#ifndef CONFIG_S390
- new.rmap = vzalloc(npages * sizeof(*new.rmap));
- if (!new.rmap)
- goto out_free;
-#endif /* not defined CONFIG_S390 */
+
if (kvm_arch_create_memslot(&new, npages))
goto out_free;
}
/* destroy any largepage mappings for dirty tracking */
}
- if (!npages) {
+ if (!npages || base_gfn != old.base_gfn) {
struct kvm_memory_slot *slot;
r = -ENOMEM;
old_memslots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
- /* From this point no new shadow pages pointing to a deleted
- * memslot will be created.
+ /* From this point no new shadow pages pointing to a deleted,
+ * or moved, memslot will be created.
*
* validation of sp->gfn happens in:
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
* - kvm_is_visible_gfn (mmu_check_roots)
*/
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_memslot(kvm, slot);
kfree(old_memslots);
}
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
- new.rmap = NULL;
new.dirty_bitmap = NULL;
memset(&new.arch, 0, sizeof(new.arch));
}
kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
- /*
- * If the new memory slot is created, we need to clear all
- * mmio sptes.
- */
- if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
- kvm_arch_flush_shadow(kvm);
-
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);
-int is_error_page(struct page *page)
-{
- return page == bad_page || page == hwpoison_page || page == fault_page;
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-int is_error_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
- return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
-int is_noslot_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn;
-}
-EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
-int is_invalid_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
-static inline unsigned long bad_hva(void)
-{
- return PAGE_OFFSET;
-}
-
-int kvm_is_error_hva(unsigned long addr)
-{
- return addr == bad_hva();
-}
-EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
return size;
}
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
- gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+ return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return bad_hva();
+ return KVM_HVA_ERR_BAD;
+
+ if (memslot_is_readonly(slot) && write)
+ return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
- return gfn_to_hva_memslot(slot, gfn);
+ return __gfn_to_hva_memslot(slot, gfn);
}
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages)
+{
+ return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+}
+
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return gfn_to_hva_many(slot, gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t get_fault_pfn(void)
+/*
+ * The hva returned by this function is only allowed to be read.
+ * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
+ */
+static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+}
+
+static int kvm_read_hva(void *data, void __user *hva, int len)
{
- get_page(fault_page);
- return fault_pfn;
+ return __copy_from_user(data, hva, len);
+}
+
+static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+{
+ return __copy_from_user_inatomic(data, hva, len);
}
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
return rc == -EHWPOISON;
}
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+/*
+ * The atomic path to get the writable pfn which will be stored in @pfn,
+ * true indicates success, otherwise false is returned.
+ */
+static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable, pfn_t *pfn)
{
struct page *page[1];
- int npages = 0;
- pfn_t pfn;
+ int npages;
- /* we can do it either atomically or asynchronously, not both */
- BUG_ON(atomic && async);
+ if (!(async || atomic))
+ return false;
- BUG_ON(!write_fault && !writable);
+ /*
+ * Fast pin a writable pfn only if it is a write fault request
+ * or the caller allows to map a writable pfn for a read fault
+ * request.
+ */
+ if (!(write_fault || writable))
+ return false;
- if (writable)
- *writable = true;
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (npages == 1) {
+ *pfn = page_to_pfn(page[0]);
- if (atomic || async)
- npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (writable)
+ *writable = true;
+ return true;
+ }
- if (unlikely(npages != 1) && !atomic) {
- might_sleep();
+ return false;
+}
- if (writable)
- *writable = write_fault;
+/*
+ * The slow path to get the pfn of the specified host virtual address,
+ * 1 indicates success, -errno is returned if error is detected.
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+ bool *writable, pfn_t *pfn)
+{
+ struct page *page[1];
+ int npages = 0;
- if (async) {
- down_read(¤t->mm->mmap_sem);
- npages = get_user_page_nowait(current, current->mm,
- addr, write_fault, page);
- up_read(¤t->mm->mmap_sem);
- } else
- npages = get_user_pages_fast(addr, 1, write_fault,
- page);
-
- /* map read fault as writable if possible */
- if (unlikely(!write_fault) && npages == 1) {
- struct page *wpage[1];
-
- npages = __get_user_pages_fast(addr, 1, 1, wpage);
- if (npages == 1) {
- *writable = true;
- put_page(page[0]);
- page[0] = wpage[0];
- }
- npages = 1;
+ might_sleep();
+
+ if (writable)
+ *writable = write_fault;
+
+ if (async) {
+ down_read(¤t->mm->mmap_sem);
+ npages = get_user_page_nowait(current, current->mm,
+ addr, write_fault, page);
+ up_read(¤t->mm->mmap_sem);
+ } else
+ npages = get_user_pages_fast(addr, 1, write_fault,
+ page);
+ if (npages != 1)
+ return npages;
+
+ /* map read fault as writable if possible */
+ if (unlikely(!write_fault) && writable) {
+ struct page *wpage[1];
+
+ npages = __get_user_pages_fast(addr, 1, 1, wpage);
+ if (npages == 1) {
+ *writable = true;
+ put_page(page[0]);
+ page[0] = wpage[0];
}
+
+ npages = 1;
}
+ *pfn = page_to_pfn(page[0]);
+ return npages;
+}
- if (unlikely(npages != 1)) {
- struct vm_area_struct *vma;
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+ if (unlikely(!(vma->vm_flags & VM_READ)))
+ return false;
- if (atomic)
- return get_fault_pfn();
+ if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+ return false;
- down_read(¤t->mm->mmap_sem);
- if (npages == -EHWPOISON ||
- (!async && check_user_page_hwpoison(addr))) {
- up_read(¤t->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
- }
+ return true;
+}
- vma = find_vma_intersection(current->mm, addr, addr+1);
-
- if (vma == NULL)
- pfn = get_fault_pfn();
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_mmio_pfn(pfn));
- } else {
- if (async && (vma->vm_flags & VM_WRITE))
- *async = true;
- pfn = get_fault_pfn();
- }
- up_read(¤t->mm->mmap_sem);
- } else
- pfn = page_to_pfn(page[0]);
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function can sleep
+ * @async: whether this function need to wait IO complete if the
+ * host page is not in the memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it allows to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ * whether the mapping is writable.
+ */
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable)
+{
+ struct vm_area_struct *vma;
+ pfn_t pfn = 0;
+ int npages;
+
+ /* we can do it either atomically or asynchronously, not both */
+ BUG_ON(atomic && async);
+ if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+ return pfn;
+
+ if (atomic)
+ return KVM_PFN_ERR_FAULT;
+
+ npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ if (npages == 1)
+ return pfn;
+
+ down_read(¤t->mm->mmap_sem);
+ if (npages == -EHWPOISON ||
+ (!async && check_user_page_hwpoison(addr))) {
+ pfn = KVM_PFN_ERR_HWPOISON;
+ goto exit;
+ }
+
+ vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+ if (vma == NULL)
+ pfn = KVM_PFN_ERR_FAULT;
+ else if ((vma->vm_flags & VM_PFNMAP)) {
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
+ } else {
+ if (async && vma_is_valid(vma, write_fault))
+ *async = true;
+ pfn = KVM_PFN_ERR_FAULT;
+ }
+exit:
+ up_read(¤t->mm->mmap_sem);
return pfn;
}
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+ unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+ if (addr == KVM_HVA_ERR_RO_BAD)
+ return KVM_PFN_ERR_RO_FAULT;
+
+ if (kvm_is_error_hva(addr))
+ return KVM_PFN_ERR_BAD;
+
+ /* Do not map writable pfn in the readonly memslot. */
+ if (writable && memslot_is_readonly(slot)) {
+ *writable = false;
+ writable = NULL;
+ }
+
+ return hva_to_pfn(addr, atomic, async, write_fault,
+ writable);
}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
- unsigned long addr;
+ struct kvm_memory_slot *slot;
if (async)
*async = false;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
+ slot = gfn_to_memslot(kvm, gfn);
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+}
+
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
- unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages)
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+ if (is_error_pfn(pfn))
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ if (kvm_is_mmio_pfn(pfn)) {
+ WARN_ON(1);
+ return KVM_ERR_PTR_BAD_PAGE;
+ }
+
+ return pfn_to_page(pfn);
+}
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- if (!kvm_is_mmio_pfn(pfn))
- return pfn_to_page(pfn);
-
- WARN_ON(kvm_is_mmio_pfn(pfn));
- get_page(bad_page);
- return bad_page;
+ return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
+ WARN_ON(is_error_pfn(pfn));
+
if (!kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
void kvm_release_page_dirty(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
int r;
unsigned long addr;
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- r = __copy_from_user(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
pagefault_disable();
- r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
pagefault_enable();
if (r)
return -EFAULT;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ * (preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beiginning and cleared at the end of interception/PLE handler.
+ *
+ * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ * chance last time (mostly it has become eligible now since we have probably
+ * yielded to lockholder in last iteration. This is done by toggling
+ * @dy_eligible each time a VCPU checked for eligibility.)
+ *
+ * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ * to preempted lock-holder could result in wrong VCPU selection and CPU
+ * burning. Giving priority for a potential lock-holder increases lock
+ * progress.
+ *
+ * Since algorithm is based on heuristics, accessing another VCPU data without
+ * locking does not harm. It may result in trying to yield to same VCPU, fail
+ * and continue with next VCPU and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ bool eligible;
+
+ eligible = !vcpu->spin_loop.in_spin_loop ||
+ (vcpu->spin_loop.in_spin_loop &&
+ vcpu->spin_loop.dy_eligible);
+
+ if (vcpu->spin_loop.in_spin_loop)
+ kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+ return eligible;
+}
+#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
int pass;
int i;
+ kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
continue;
if (waitqueue_active(&vcpu->wq))
continue;
+ if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+ continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
}
}
}
+ kvm_vcpu_set_in_spin_loop(me, false);
+
+ /* Ensure vcpu is not eligible during next spinloop */
+ kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
#endif
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
r = kvm_send_userspace_msi(kvm, &msi);
break;
}
+#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+
+ r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ if (copy_to_user(argp, &irq_event, sizeof irq_event))
+ goto out;
+ }
+
+ r = 0;
+ break;
+ }
#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
.resume = kvm_resume,
};
-struct page *bad_page;
-pfn_t bad_pfn;
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
if (r)
goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_pfn = page_to_pfn(bad_page);
-
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
- __free_page(bad_page);
-out:
kvm_arch_exit();
out_fail:
return r;
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(fault_page);
- __free_page(hwpoison_page);
- __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);