3 #include "kvm/interrupt.h"
8 #include <asm/bootparam.h>
10 #include <sys/ioctl.h>
24 * Compatibility code. Remove this when we move to tools/kvm.
/*
 * Fallback for headers that do not provide KVM_EXIT_INTERNAL_ERROR: define
 * it locally as 17. NOTE(review): the matching #endif is not part of this
 * listing.
 */
26 #ifndef KVM_EXIT_INTERNAL_ERROR
27 # define KVM_EXIT_INTERNAL_ERROR 17
/*
 * Expands to a designated initializer that stores the stringified name of
 * an exit-reason constant at the array index equal to that constant.
 */
30 #define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
/*
 * Exit-reason names, indexed by the numeric exit reason code.
 * NOTE(review): the closing "};" of this table is not part of this listing.
 */
32 const char *kvm_exit_reasons[] = {
33 DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
34 DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
35 DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
36 DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
37 DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
38 DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
39 DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
40 DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
41 DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
42 DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
43 DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
44 DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
45 DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
46 DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
47 DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
48 DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
49 DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
50 DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
/*
 * Helper macro for entries of the required-extension table below.
 * NOTE(review): the continuation lines of this macro and the declaration
 * of the table itself are not part of this listing; kvm__check_extensions()
 * reads each entry through its .code and .name members.
 */
53 #define DEFINE_KVM_EXT(ext) \
/* KVM capabilities that must be reported by the host for the VM to start. */
61 { DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
62 { DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
63 { DEFINE_KVM_EXT(KVM_CAP_PIT2) },
64 { DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
65 { DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
66 { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
67 { DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
70 static inline bool host_ptr_in_ram(struct kvm *self, void *p)
72 return self->ram_start <= p && p < (self->ram_start + self->ram_size);
/*
 * Convert a real-mode selector:offset pair into a flat address:
 * (selector << 4) + offset. Both operands are widened to 32 bits first so
 * the result can exceed 16 bits (the maximum is 0x10ffef).
 */
static inline uint32_t segment_to_flat(uint16_t selector, uint16_t offset)
{
	return ((uint32_t)selector << 4) + (uint32_t)offset;
}
80 static inline void *guest_flat_to_host(struct kvm *self, unsigned long offset)
82 return self->ram_start + offset;
/*
 * Translate a real-mode selector:offset pair into a host pointer: the pair
 * is first flattened with segment_to_flat() and the result is then mapped
 * through guest_flat_to_host().
 */
85 static inline void *guest_real_to_host(struct kvm *self, uint16_t selector, uint16_t offset)
87 unsigned long flat = segment_to_flat(selector, offset);
89 return guest_flat_to_host(self, flat);
/*
 * Ask the KVM system fd whether @extension is available, using the
 * KVM_CHECK_EXTENSION ioctl.
 * NOTE(review): the lines that turn the ioctl result into the bool return
 * value are not part of this listing.
 */
92 static bool kvm__supports_extension(struct kvm *self, unsigned int extension)
96 ret = ioctl(self->sys_fd, KVM_CHECK_EXTENSION, extension);
/*
 * Walk the kvm_req_ext table and report, via error(), every entry whose
 * code is rejected by kvm__supports_extension().
 * NOTE(review): the return statements are not part of this listing;
 * kvm__init() treats a non-zero result as failure.
 * NOTE(review): the message below misspells "Unsupported".
 */
103 static int kvm__check_extensions(struct kvm *self)
107 for (i = 0; i < ARRAY_SIZE(kvm_req_ext); i++) {
108 if (!kvm__supports_extension(self, kvm_req_ext[i].code)) {
109 error("Unsuppored KVM extension detected: %s",
110 kvm_req_ext[i].name);
/*
 * Allocate a zero-initialized struct kvm with calloc().
 * NOTE(review): the condition guarding die() and the return statement are
 * not part of this listing; presumably die() runs when calloc() fails.
 */
118 static struct kvm *kvm__new(void)
120 struct kvm *self = calloc(1, sizeof *self);
123 die("out of memory");
/*
 * Release resources owned by @self. The visible part frees the guest RAM
 * buffer; NOTE(review): any further cleanup is not part of this listing.
 */
128 void kvm__delete(struct kvm *self)
130 free(self->ram_start);
/*
 * Create and configure a VM: open /dev/kvm, verify the API version, create
 * the VM, check required extensions, set the TSS address, create the PIT,
 * allocate and register guest RAM, create the in-kernel irqchip and one
 * VCPU, and map the VCPU's kvm_run area. Every visible failure path ends in
 * die() or die_perror().
 * NOTE(review): several conditions, local declarations and the return
 * statement are not part of this listing.
 */
134 struct kvm *kvm__init(void)
136 struct kvm_userspace_memory_region mem;
137 struct kvm_pit_config pit_config = { .flags = 0, };
/* Open the KVM control device. */
145 self->sys_fd = open("/dev/kvm", O_RDWR);
146 if (self->sys_fd < 0)
/*
 * The API version reported by the kernel must match the one this code was
 * built against. NOTE(review): on a plain mismatch the ioctl succeeded, so
 * errno may not describe the problem that die_perror() prints.
 */
149 ret = ioctl(self->sys_fd, KVM_GET_API_VERSION, 0);
150 if (ret != KVM_API_VERSION)
151 die_perror("KVM_API_VERSION ioctl");
153 self->vm_fd = ioctl(self->sys_fd, KVM_CREATE_VM, 0);
155 die_perror("KVM_CREATE_VM ioctl");
157 if (kvm__check_extensions(self))
158 die("A required KVM extention is not supported by OS");
160 ret = ioctl(self->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
162 die_perror("KVM_SET_TSS_ADDR ioctl");
164 ret = ioctl(self->vm_fd, KVM_CREATE_PIT2, &pit_config);
166 die_perror("KVM_CREATE_PIT2 ioctl");
/* Guest RAM is fixed at 64 MiB and allocated page-aligned on the host. */
168 self->ram_size = 64UL * 1024UL * 1024UL;
170 page_size = sysconf(_SC_PAGESIZE);
171 if (posix_memalign(&self->ram_start, page_size, self->ram_size) != 0)
172 die("out of memory");
/* Register the host buffer as guest physical memory starting at address 0. */
174 mem = (struct kvm_userspace_memory_region) {
176 .guest_phys_addr = 0x0UL,
177 .memory_size = self->ram_size,
178 .userspace_addr = (unsigned long) self->ram_start,
181 ret = ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
183 die_perror("KVM_SET_USER_MEMORY_REGION ioctl");
185 ret = ioctl(self->vm_fd, KVM_CREATE_IRQCHIP);
187 die_perror("KVM_CREATE_IRQCHIP ioctl");
189 self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0);
190 if (self->vcpu_fd < 0)
191 die_perror("KVM_CREATE_VCPU ioctl");
/* The size of the shared kvm_run mapping is queried from the system fd. */
193 mmap_size = ioctl(self->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
195 die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
197 self->kvm_run = mmap(NULL, mmap_size, PROT_READ|PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0);
198 if (self->kvm_run == MAP_FAILED)
199 die("unable to mmap vcpu fd");
/*
 * Turn on guest debugging with single-stepping for the VCPU through the
 * KVM_SET_GUEST_DEBUG ioctl. A failure is only reported with warning();
 * it is not fatal.
 */
204 void kvm__enable_singlestep(struct kvm *self)
206 struct kvm_guest_debug debug = {
207 .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
210 if (ioctl(self->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0)
211 warning("KVM_SET_GUEST_DEBUG failed");
/*
 * Real-mode selector, instruction pointer and stack pointer used for the
 * loaded image; the loaders below store them in boot_selector, boot_ip and
 * boot_sp.
 */
214 #define BOOT_LOADER_SELECTOR 0x1000
215 #define BOOT_LOADER_IP 0x0000
216 #define BOOT_LOADER_SP 0x8000
/* Flat guest offset at which load_bzimage() places the kernel command line. */
217 #define BOOT_CMDLINE_OFFSET 0x20000
/* Minimum boot.hdr.version accepted by load_bzimage(). */
219 #define BOOT_PROTOCOL_REQUIRED 0x202
/* NOTE(review): LOAD_HIGH is not referenced by any line visible in this listing. */
220 #define LOAD_HIGH 0x01
/*
 * Load @fd as a raw flat binary: rewind the file, read it in chunks of up
 * to 65536 bytes into guest memory at BOOT_LOADER_SELECTOR:BOOT_LOADER_IP,
 * and record the boot selector, ip and sp.
 * NOTE(review): the read-loop body (presumably advancing p), the error
 * returns and local declarations are not part of this listing.
 */
222 static int load_flat_binary(struct kvm *self, int fd)
227 if (lseek(fd, 0, SEEK_SET) < 0)
230 p = guest_real_to_host(self, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
232 while ((nr = read(fd, p, 65536)) > 0)
235 self->boot_selector = BOOT_LOADER_SELECTOR;
236 self->boot_ip = BOOT_LOADER_IP;
237 self->boot_sp = BOOT_LOADER_SP;
243 * The protected mode kernel part of a modern bzImage is loaded at 1 MB by
/* Flat guest address (1 MiB) where load_bzimage() copies the kernel proper. */
246 #define BZ_KERNEL_START 0x100000UL
/* Signature compared against boot.hdr.header to recognize a bzImage. */
248 static const char *BZIMAGE_MAGIC = "HdrS";
/* Value substituted when the image header reports zero setup sectors. */
250 #define BZ_DEFAULT_SETUP_SECTS 4
/*
 * Load @fd as a Linux bzImage: validate the header, copy the real-mode
 * setup sectors to BOOT_LOADER_SELECTOR:BOOT_LOADER_IP and the rest of the
 * file to BZ_KERNEL_START, place the command line at BOOT_CMDLINE_OFFSET,
 * patch a few setup-header fields in guest memory, record the boot
 * registers, and install a fake real-mode interrupt vector table.
 * NOTE(review): return statements, several local declarations and other
 * lines are not part of this listing.
 */
252 static bool load_bzimage(struct kvm *self, int fd, const char *kernel_cmdline)
254 struct real_intr_desc intr;
255 struct boot_params boot;
256 unsigned long setup_sects;
257 unsigned int intr_addr;
264 * See Documentation/x86/boot.txt for details no bzImage on-disk and
268 if (lseek(fd, 0, SEEK_SET) < 0)
/*
 * NOTE(review): the result of this read() is ignored, so a short read
 * leaves part of `boot` uninitialized before the checks below.
 */
271 read(fd, &boot, sizeof(boot));
/* Reject files that lack the "HdrS" signature or use too old a protocol. */
273 if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)) != 0)
276 if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED) {
277 warning("Too old kernel");
/* Rewind so the setup sectors can be copied from the start of the file. */
281 if (lseek(fd, 0, SEEK_SET) < 0)
284 if (!boot.hdr.setup_sects)
285 boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
/* One extra sector is added to the header's count; size is sectors * 512. */
286 setup_sects = boot.hdr.setup_sects + 1;
288 setup_size = setup_sects << 9;
289 p = guest_real_to_host(self, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
291 if (read(fd, p, setup_size) != setup_size)
/* Everything after the setup sectors goes to BZ_KERNEL_START. */
294 p = guest_flat_to_host(self, BZ_KERNEL_START);
296 while ((nr = read(fd, p, 65536)) > 0)
/*
 * Copy the command line, clamped to the size the header allows. The target
 * area is zeroed first and only cmdline_size - 1 bytes are copied, so the
 * copy stays NUL-terminated even when it is truncated.
 */
299 p = guest_flat_to_host(self, BOOT_CMDLINE_OFFSET);
300 if (kernel_cmdline) {
301 cmdline_size = strlen(kernel_cmdline) + 1;
302 if (cmdline_size > boot.hdr.cmdline_size)
303 cmdline_size = boot.hdr.cmdline_size;
305 memset(p, 0, boot.hdr.cmdline_size);
306 memcpy(p, kernel_cmdline, cmdline_size - 1);
/*
 * hdr_offset(): byte offset of a setup_header member within boot_params.
 * guest_hdr(): host pointer to a header member of the image copy in guest
 * memory. NOTE(review): the last line of guest_hdr is not in this listing.
 */
309 #define hdr_offset(member) \
310 offsetof(struct boot_params, hdr) + \
311 offsetof(struct setup_header, member)
312 #define guest_hdr(kvm, member) \
313 guest_real_to_host(kvm, \
314 BOOT_LOADER_SELECTOR, \
317 /* some fields in guest header have to be updated */
318 p = guest_hdr(self, cmd_line_ptr);
319 *(uint32_t *)p = BOOT_CMDLINE_OFFSET;
321 p = guest_hdr(self, type_of_loader);
322 *(uint8_t *)p = 0xff;
324 p = guest_hdr(self, heap_end_ptr);
325 *(uint16_t *)p = 0xfe00;
327 p = guest_hdr(self, loadflags);
328 *(uint8_t *)p |= CAN_USE_HEAP;
330 self->boot_selector = BOOT_LOADER_SELECTOR;
332 * The real-mode setup code starts at offset 0x200 of a bzImage. See
333 * Documentation/x86/boot.txt for details.
335 self->boot_ip = BOOT_LOADER_IP + 0x200;
336 self->boot_sp = BOOT_LOADER_SP;
339 * Setup a *fake* real mode vector table, it has only
340 * one real hadler which does just iret
342 * This is where the BIOS lives -- BDA area
/*
 * Copy the intfake stub into guest memory and pass its descriptor to
 * interrupt_table__setup(). NOTE(review): presumably this makes it the
 * handler for every vector; confirm against interrupt_table__setup().
 */
344 intr_addr = BIOS_INTR_NEXT(BDA_START + 0, 16);
345 p = guest_flat_to_host(self, intr_addr);
346 memcpy(p, intfake, intfake_end - intfake);
347 intr = (struct real_intr_desc) {
348 .segment = REAL_SEGMENT(intr_addr),
351 interrupt_table__setup(&self->interrupt_table, &intr);
/* Place the int10 stub after the intfake stub and bind it to vector 0x10. */
353 intr_addr = BIOS_INTR_NEXT(BDA_START + (intfake_end - intfake), 16);
354 p = guest_flat_to_host(self, intr_addr);
355 memcpy(p, int10, int10_end - int10);
356 intr = (struct real_intr_desc) {
357 .segment = REAL_SEGMENT(intr_addr),
360 interrupt_table__set(&self->interrupt_table, &intr, 0x10);
/* Write the finished table to guest flat address 0. */
362 p = guest_flat_to_host(self, 0);
363 interrupt_table__copy(&self->interrupt_table, p, REAL_INTR_SIZE);
/*
 * Open @kernel_filename read-only and load it into the guest, calling
 * load_bzimage() and load_flat_binary(). Dies if the file cannot be opened
 * or if it is neither a valid bzImage nor a flat binary.
 * NOTE(review): the conditions that connect these steps and the return
 * statement are not part of this listing.
 */
368 bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
369 const char *kernel_cmdline)
374 fd = open(kernel_filename, O_RDONLY);
376 die("unable to open kernel");
378 ret = load_bzimage(kvm, fd, kernel_cmdline);
382 ret = load_flat_binary(kvm, fd);
386 die("%s is not a valid bzImage or flat binary", kernel_filename);
/*
 * Convert a flat address back to an offset relative to the current code
 * segment by subtracting (cs selector << 4) as cached in self->sregs.
 */
392 static inline uint64_t ip_flat_to_real(struct kvm *self, uint64_t ip)
394 uint64_t cs = self->sregs.cs.selector;
396 return ip - (cs << 4);
/*
 * Return true when bit 0 of the cached CR0 value in self->sregs is set.
 * The result reflects whatever sregs were last fetched into @self.
 */
399 static inline bool is_in_protected_mode(struct kvm *self)
401 return self->sregs.cr0 & 0x01;
/*
 * Convert an instruction pointer to a flat address. Outside protected mode
 * the result is ip + (cs selector << 4).
 * NOTE(review): the statement executed in the protected-mode branch is not
 * part of this listing; the comment below suggests ip is used unchanged.
 */
404 static inline uint64_t ip_to_flat(struct kvm *self, uint64_t ip)
409 * NOTE! We should take code segment base address into account here.
410 * Luckily it's usually zero because Linux uses flat memory model.
412 if (is_in_protected_mode(self))
415 cs = self->sregs.cs.selector;
417 return ip + (cs << 4);
/*
 * Compute the segment base that corresponds to a real-mode selector.
 */
static inline uint32_t selector_to_base(uint16_t selector)
{
	/*
	 * KVM on Intel requires 'base' to be 'selector * 16' in real mode.
	 */
	return (uint32_t)selector * 16;
}
/*
 * Allocate a zeroed struct kvm_msrs followed by room for @nmsrs
 * struct kvm_msr_entry elements in one calloc() block.
 * NOTE(review): the condition guarding die() and the return statement are
 * not part of this listing.
 */
428 static struct kvm_msrs *kvm_msrs__new(size_t nmsrs)
430 struct kvm_msrs *self = calloc(1, sizeof(*self) + (sizeof(struct kvm_msr_entry) * nmsrs));
433 die("out of memory");
/* MSR indices written by kvm__setup_msrs(). */
438 #define MSR_IA32_TIME_STAMP_COUNTER 0x10
440 #define MSR_IA32_SYSENTER_CS 0x174
441 #define MSR_IA32_SYSENTER_ESP 0x175
442 #define MSR_IA32_SYSENTER_EIP 0x176
444 #define MSR_IA32_STAR 0xc0000081
445 #define MSR_IA32_LSTAR 0xc0000082
446 #define MSR_IA32_CSTAR 0xc0000083
447 #define MSR_IA32_FMASK 0xc0000084
448 #define MSR_IA32_KERNEL_GS_BASE 0xc0000102
/* Compound literal for one kvm_msr_entry with the given index and data. */
450 #define KVM_MSR_ENTRY(_index, _data) \
451 (struct kvm_msr_entry) { .index = _index, .data = _data }
/*
 * Build a list of nine MSRs, all with data 0, and load it into the VCPU
 * with KVM_SET_MSRS; dies if the ioctl fails. The list is allocated with
 * room for 100 entries, and nmsrs records how many are actually filled.
 * NOTE(review): each call stores a fresh allocation in self->msrs; no free
 * of a previous list is visible here.
 */
453 static void kvm__setup_msrs(struct kvm *self)
455 unsigned long ndx = 0;
457 self->msrs = kvm_msrs__new(100);
459 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS, 0x0);
460 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP, 0x0);
461 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP, 0x0);
463 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_STAR, 0x0);
464 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_CSTAR, 0x0);
465 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_KERNEL_GS_BASE, 0x0);
466 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_FMASK, 0x0);
467 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_LSTAR, 0x0);
469 self->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TIME_STAMP_COUNTER, 0x0);
471 self->msrs->nmsrs = ndx;
473 if (ioctl(self->vcpu_fd, KVM_SET_MSRS, self->msrs) < 0)
474 die_perror("KVM_SET_MSRS failed");
/*
 * Reinitialize self->fpu and load it into the VCPU with KVM_SET_FPU; dies
 * if the ioctl fails.
 * NOTE(review): the initializer's fields are not part of this listing.
 */
477 static void kvm__setup_fpu(struct kvm *self)
479 self->fpu = (struct kvm_fpu) {
484 if (ioctl(self->vcpu_fd, KVM_SET_FPU, &self->fpu) < 0)
485 die_perror("KVM_SET_FPU failed");
/*
 * Initialize the general-purpose registers for boot: rflags is 0x2, rip
 * comes from boot_ip, and both rsp and rbp come from boot_sp. All other
 * fields are zeroed by the compound literal. Dies if rip does not fit in
 * 16 bits or if KVM_SET_REGS fails.
 */
488 static void kvm__setup_regs(struct kvm *self)
490 self->regs = (struct kvm_regs) {
491 /* We start the guest in 16-bit real mode */
492 .rflags = 0x0000000000000002ULL,
494 .rip = self->boot_ip,
495 .rsp = self->boot_sp,
496 .rbp = self->boot_sp,
/* A real-mode instruction pointer must fit in 16 bits. */
499 if (self->regs.rip > USHRT_MAX)
500 die("ip 0x%" PRIx64 " is too high for real mode", (uint64_t) self->regs.rip);
502 if (ioctl(self->vcpu_fd, KVM_SET_REGS, &self->regs) < 0)
503 die_perror("KVM_SET_REGS failed");
/*
 * Fetch the VCPU's special registers, point cs, ss, ds, es, fs and gs at
 * boot_selector (with the matching base from selector_to_base()), and write
 * them back. Fields not assigned here keep the values KVM returned. Dies if
 * either ioctl fails.
 */
506 static void kvm__setup_sregs(struct kvm *self)
509 if (ioctl(self->vcpu_fd, KVM_GET_SREGS, &self->sregs) < 0)
510 die_perror("KVM_GET_SREGS failed");
512 self->sregs.cs.selector = self->boot_selector;
513 self->sregs.cs.base = selector_to_base(self->boot_selector);
514 self->sregs.ss.selector = self->boot_selector;
515 self->sregs.ss.base = selector_to_base(self->boot_selector);
516 self->sregs.ds.selector = self->boot_selector;
517 self->sregs.ds.base = selector_to_base(self->boot_selector);
518 self->sregs.es.selector = self->boot_selector;
519 self->sregs.es.base = selector_to_base(self->boot_selector);
520 self->sregs.fs.selector = self->boot_selector;
521 self->sregs.fs.base = selector_to_base(self->boot_selector);
522 self->sregs.gs.selector = self->boot_selector;
523 self->sregs.gs.base = selector_to_base(self->boot_selector);
525 if (ioctl(self->vcpu_fd, KVM_SET_SREGS, &self->sregs) < 0)
526 die_perror("KVM_SET_SREGS failed");
/*
 * Put the VCPU into its boot state by programming, in this order, the
 * special registers, the general registers, the FPU state and the MSRs.
 * Each helper dies on failure.
 */
529 void kvm__reset_vcpu(struct kvm *self)
531 kvm__setup_sregs(self);
533 kvm__setup_regs(self);
535 kvm__setup_fpu(self);
537 kvm__setup_msrs(self);
/*
 * Enter the guest through the KVM_RUN ioctl on the VCPU fd. Any negative
 * return from the ioctl is fatal.
 */
540 void kvm__run(struct kvm *self)
542 if (ioctl(self->vcpu_fd, KVM_RUN, 0) < 0)
543 die_perror("KVM_RUN failed");
/*
 * Print one descriptor-table line: @name, then the table base as 16 hex
 * digits and the limit as a zero-padded hex field.
 */
546 static void print_dtable(const char *name, struct kvm_dtable *dtable)
548 printf(" %s %016" PRIx64 " %08" PRIx16 "\n",
549 name, (uint64_t) dtable->base, (uint16_t) dtable->limit);
/*
 * Print one segment-register line: @name, selector, base, limit and type,
 * followed by the present, dpl, db, s, l, g and avl attributes, all in hex.
 */
552 static void print_segment(const char *name, struct kvm_segment *seg)
554 printf(" %s %04" PRIx16 " %016" PRIx64 " %08" PRIx32 " %02" PRIx8 " %x %x %x %x %x %x %x\n",
555 name, (uint16_t) seg->selector, (uint64_t) seg->base, (uint32_t) seg->limit,
556 (uint8_t) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl);
/*
 * Dump the VCPU state to stdout: general-purpose registers, control
 * registers, segment registers, descriptor tables, EFER / APIC base / NMI
 * state, and the pending-interrupt bitmap. Register values are read fresh
 * with KVM_GET_REGS and KVM_GET_SREGS into locals; either ioctl failing is
 * fatal.
 * NOTE(review): a few lines of this function (e.g. the declaration of the
 * loop index) are not part of this listing.
 */
559 void kvm__show_registers(struct kvm *self)
561 unsigned long cr0, cr2, cr3;
562 unsigned long cr4, cr8;
563 unsigned long rax, rbx, rcx;
564 unsigned long rdx, rsi, rdi;
565 unsigned long rbp, r8, r9;
566 unsigned long r10, r11, r12;
567 unsigned long r13, r14, r15;
568 unsigned long rip, rsp;
569 struct kvm_sregs sregs;
570 unsigned long rflags;
571 struct kvm_regs regs;
574 if (ioctl(self->vcpu_fd, KVM_GET_REGS, &regs) < 0)
575 die("KVM_GET_REGS failed");
577 rflags = regs.rflags;
579 rip = regs.rip; rsp = regs.rsp;
580 rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx;
581 rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi;
582 rbp = regs.rbp; r8 = regs.r8; r9 = regs.r9;
583 r10 = regs.r10; r11 = regs.r11; r12 = regs.r12;
584 r13 = regs.r13; r14 = regs.r14; r15 = regs.r15;
586 printf("Registers:\n");
587 printf(" rip: %016lx rsp: %016lx flags: %016lx\n", rip, rsp, rflags);
588 printf(" rax: %016lx rbx: %016lx rcx: %016lx\n", rax, rbx, rcx);
589 printf(" rdx: %016lx rsi: %016lx rdi: %016lx\n", rdx, rsi, rdi);
590 printf(" rbp: %016lx r8: %016lx r9: %016lx\n", rbp, r8, r9);
591 printf(" r10: %016lx r11: %016lx r12: %016lx\n", r10, r11, r12);
592 printf(" r13: %016lx r14: %016lx r15: %016lx\n", r13, r14, r15);
/* The failure message names the ioctl that actually failed. */
594 if (ioctl(self->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
595 die("KVM_GET_SREGS failed");
597 cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3;
598 cr4 = sregs.cr4; cr8 = sregs.cr8;
600 printf(" cr0: %016lx cr2: %016lx cr3: %016lx\n", cr0, cr2, cr3);
601 printf(" cr4: %016lx cr8: %016lx\n", cr4, cr8);
602 printf("Segment registers:\n");
603 printf(" register selector base limit type p dpl db s l g avl\n");
604 print_segment("cs ", &sregs.cs);
605 print_segment("ss ", &sregs.ss);
606 print_segment("ds ", &sregs.ds);
607 print_segment("es ", &sregs.es);
608 print_segment("fs ", &sregs.fs);
609 print_segment("gs ", &sregs.gs);
610 print_segment("tr ", &sregs.tr);
611 print_segment("ldt", &sregs.ldt);
612 print_dtable("gdt", &sregs.gdt);
613 print_dtable("idt", &sregs.idt);
614 printf(" [ efer: %016" PRIx64 " apic base: %016" PRIx64 " nmi: %s ]\n",
615 (uint64_t) sregs.efer, (uint64_t) sregs.apic_base,
616 (self->nmi_disabled ? "disabled" : "enabled"));
617 printf("Interrupt bitmap:\n");
/* One 64-bit word of the bitmap per iteration, rounding the count up. */
619 for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++)
620 printf("%016" PRIx64 " ", (uint64_t) sregs.interrupt_bitmap[i]);
/*
 * Dump the guest code bytes around the current instruction pointer, then
 * 32 bytes of guest memory starting at rsp. Registers are refreshed into
 * self->regs and self->sregs first; either ioctl failing is fatal.
 * NOTE(review): the output header, the byte fetch, the action taken when a
 * pointer leaves guest RAM, and the non-highlighted print are not part of
 * this listing.
 */
624 void kvm__show_code(struct kvm *self)
/* 64 bytes in total, 43 of them (64 * 43 / 64) before the current rip. */
626 unsigned int code_bytes = 64;
627 unsigned int code_prologue = code_bytes * 43 / 64;
628 unsigned int code_len = code_bytes;
633 if (ioctl(self->vcpu_fd, KVM_GET_REGS, &self->regs) < 0)
634 die("KVM_GET_REGS failed");
636 if (ioctl(self->vcpu_fd, KVM_GET_SREGS, &self->sregs) < 0)
637 die("KVM_GET_SREGS failed");
/* Start code_prologue bytes before the flat address of rip. */
639 ip = guest_flat_to_host(self, ip_to_flat(self, self->regs.rip) - code_prologue);
643 for (i = 0; i < code_len; i++, ip++) {
644 if (!host_ptr_in_ram(self, ip))
/* The byte at the current rip is printed inside angle brackets. */
649 if (ip == guest_flat_to_host(self, ip_to_flat(self, self->regs.rip)))
650 printf("<%02x> ", c);
658 kvm__dump_mem(self, self->regs.rsp, 32);
/*
 * Print the first entry of each page-table level reachable from CR3. Each
 * level's table address is the previous entry with its low 12 bits cleared,
 * and every pointer is checked against guest RAM before it is dereferenced.
 * NOTE(review): the statements taken when a check fails, and the "else"
 * between the two printf() calls, are not part of this listing.
 * NOTE(review): the protected-mode test reads self->sregs before they are
 * refreshed by KVM_GET_SREGS below, so it may see stale state; confirm the
 * intended order.
 */
661 void kvm__show_page_tables(struct kvm *self)
668 if (!is_in_protected_mode(self))
671 if (ioctl(self->vcpu_fd, KVM_GET_SREGS, &self->sregs) < 0)
672 die("KVM_GET_SREGS failed");
674 pte4 = guest_flat_to_host(self, self->sregs.cr3);
675 if (!host_ptr_in_ram(self, pte4))
678 pte3 = guest_flat_to_host(self, (*pte4 & ~0xfff));
679 if (!host_ptr_in_ram(self, pte3))
682 pte2 = guest_flat_to_host(self, (*pte3 & ~0xfff));
683 if (!host_ptr_in_ram(self, pte2))
686 pte1 = guest_flat_to_host(self, (*pte2 & ~0xfff));
687 if (!host_ptr_in_ram(self, pte1))
690 printf("Page Tables:\n");
/* When bit 7 of the level-2 entry is set, the level-1 entry is not printed. */
691 if (*pte2 & (1 << 7))
692 printf(" pte4: %016" PRIx64 " pte3: %016" PRIx64
693 " pte2: %016" PRIx64 "\n",
694 *pte4, *pte3, *pte2);
696 printf(" pte4: %016" PRIx64 " pte3: %016" PRIx64 " pte2: %016"
697 PRIx64 " pte1: %016" PRIx64 "\n",
698 *pte4, *pte3, *pte2, *pte1);
/*
 * Hex-dump guest memory starting at flat address @addr, eight bytes per
 * output line. @size is rounded down to a multiple of 8 first.
 * NOTE(review): local declarations and the action taken when a row starts
 * outside guest RAM are not part of this listing. Only the first byte of
 * each row (p + n) is checked against guest RAM.
 */
701 void kvm__dump_mem(struct kvm *self, unsigned long addr, unsigned long size)
706 size &= ~7; /* mod 8 */
710 p = guest_flat_to_host(self, addr);
712 for (n = 0; n < size; n+=8) {
713 if (!host_ptr_in_ram(self, p + n))
716 printf(" 0x%08lx: %02x %02x %02x %02x %02x %02x %02x %02x\n",
717 addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
718 p[n + 4], p[n + 5], p[n + 6], p[n + 7]);