]> git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge remote-tracking branch 'kvmtool/master'
authorStephen Rothwell <sfr@canb.auug.org.au>
Wed, 14 Sep 2011 05:31:28 +0000 (15:31 +1000)
committerStephen Rothwell <sfr@canb.auug.org.au>
Wed, 14 Sep 2011 05:31:28 +0000 (15:31 +1000)
Conflicts:
include/net/9p/9p.h

179 files changed:
MAINTAINERS
tools/kvm/.gitignore [new file with mode: 0644]
tools/kvm/CREDITS-Git [new file with mode: 0644]
tools/kvm/Documentation/kvm-balloon.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-debug.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-list.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-pause.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-resume.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-run.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-setup.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-stat.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-stop.txt [new file with mode: 0644]
tools/kvm/Documentation/kvm-version.txt [new file with mode: 0644]
tools/kvm/Makefile [new file with mode: 0644]
tools/kvm/README [new file with mode: 0644]
tools/kvm/bios.c [new file with mode: 0644]
tools/kvm/bios/.gitignore [new file with mode: 0644]
tools/kvm/bios/bios-rom.S [new file with mode: 0644]
tools/kvm/bios/e820.c [new file with mode: 0644]
tools/kvm/bios/entry.S [new file with mode: 0644]
tools/kvm/bios/gen-offsets.sh [new file with mode: 0644]
tools/kvm/bios/int10.c [new file with mode: 0644]
tools/kvm/bios/int15.c [new file with mode: 0644]
tools/kvm/bios/local.S [new file with mode: 0644]
tools/kvm/bios/macro.S [new file with mode: 0644]
tools/kvm/bios/rom.ld.S [new file with mode: 0644]
tools/kvm/builtin-balloon.c [new file with mode: 0644]
tools/kvm/builtin-debug.c [new file with mode: 0644]
tools/kvm/builtin-help.c [new file with mode: 0644]
tools/kvm/builtin-list.c [new file with mode: 0644]
tools/kvm/builtin-pause.c [new file with mode: 0644]
tools/kvm/builtin-resume.c [new file with mode: 0644]
tools/kvm/builtin-run.c [new file with mode: 0644]
tools/kvm/builtin-setup.c [new file with mode: 0644]
tools/kvm/builtin-stat.c [new file with mode: 0644]
tools/kvm/builtin-stop.c [new file with mode: 0644]
tools/kvm/builtin-version.c [new file with mode: 0644]
tools/kvm/code16gcc.h [new file with mode: 0644]
tools/kvm/command-list.txt [new file with mode: 0644]
tools/kvm/config/feature-tests.mak [new file with mode: 0644]
tools/kvm/config/utilities.mak [new file with mode: 0644]
tools/kvm/cpuid.c [new file with mode: 0644]
tools/kvm/disk/blk.c [new file with mode: 0644]
tools/kvm/disk/core.c [new file with mode: 0644]
tools/kvm/disk/qcow.c [new file with mode: 0644]
tools/kvm/disk/raw.c [new file with mode: 0644]
tools/kvm/framebuffer.c [new file with mode: 0644]
tools/kvm/guest/init.c [new file with mode: 0644]
tools/kvm/guest/setnet.sh [new file with mode: 0755]
tools/kvm/guest_compat.c [new file with mode: 0644]
tools/kvm/hw/i8042.c [new file with mode: 0644]
tools/kvm/hw/pci-shmem.c [new file with mode: 0644]
tools/kvm/hw/rtc.c [new file with mode: 0644]
tools/kvm/hw/serial.c [new file with mode: 0644]
tools/kvm/hw/vesa.c [new file with mode: 0644]
tools/kvm/include/asm/hweight.h [new file with mode: 0644]
tools/kvm/include/kvm/8250-serial.h [new file with mode: 0644]
tools/kvm/include/kvm/apic.h [new file with mode: 0644]
tools/kvm/include/kvm/assembly.h [new file with mode: 0644]
tools/kvm/include/kvm/barrier.h [new file with mode: 0644]
tools/kvm/include/kvm/bios-export.h [new file with mode: 0644]
tools/kvm/include/kvm/bios.h [new file with mode: 0644]
tools/kvm/include/kvm/boot-protocol.h [new file with mode: 0644]
tools/kvm/include/kvm/brlock.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-balloon.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-debug.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-help.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-list.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-pause.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-resume.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-run.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-setup.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-stat.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-stop.h [new file with mode: 0644]
tools/kvm/include/kvm/builtin-version.h [new file with mode: 0644]
tools/kvm/include/kvm/compiler.h [new file with mode: 0644]
tools/kvm/include/kvm/cpufeature.h [new file with mode: 0644]
tools/kvm/include/kvm/disk-image.h [new file with mode: 0644]
tools/kvm/include/kvm/e820.h [new file with mode: 0644]
tools/kvm/include/kvm/framebuffer.h [new file with mode: 0644]
tools/kvm/include/kvm/guest_compat.h [new file with mode: 0644]
tools/kvm/include/kvm/i8042.h [new file with mode: 0644]
tools/kvm/include/kvm/interrupt.h [new file with mode: 0644]
tools/kvm/include/kvm/ioeventfd.h [new file with mode: 0644]
tools/kvm/include/kvm/ioport.h [new file with mode: 0644]
tools/kvm/include/kvm/irq.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm-cmd.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm-cpu.h [new file with mode: 0644]
tools/kvm/include/kvm/kvm.h [new file with mode: 0644]
tools/kvm/include/kvm/mptable.h [new file with mode: 0644]
tools/kvm/include/kvm/mutex.h [new file with mode: 0644]
tools/kvm/include/kvm/parse-options.h [new file with mode: 0644]
tools/kvm/include/kvm/pci-shmem.h [new file with mode: 0644]
tools/kvm/include/kvm/pci.h [new file with mode: 0644]
tools/kvm/include/kvm/qcow.h [new file with mode: 0644]
tools/kvm/include/kvm/rbtree-interval.h [new file with mode: 0644]
tools/kvm/include/kvm/read-write.h [new file with mode: 0644]
tools/kvm/include/kvm/rtc.h [new file with mode: 0644]
tools/kvm/include/kvm/rwsem.h [new file with mode: 0644]
tools/kvm/include/kvm/sdl.h [new file with mode: 0644]
tools/kvm/include/kvm/segment.h [new file with mode: 0644]
tools/kvm/include/kvm/strbuf.h [new file with mode: 0644]
tools/kvm/include/kvm/symbol.h [new file with mode: 0644]
tools/kvm/include/kvm/term.h [new file with mode: 0644]
tools/kvm/include/kvm/threadpool.h [new file with mode: 0644]
tools/kvm/include/kvm/types.h [new file with mode: 0644]
tools/kvm/include/kvm/uip.h [new file with mode: 0644]
tools/kvm/include/kvm/util.h [new file with mode: 0644]
tools/kvm/include/kvm/vesa.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-9p.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-balloon.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-blk.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-console.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-net.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-pci-dev.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-pci.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio-rng.h [new file with mode: 0644]
tools/kvm/include/kvm/virtio.h [new file with mode: 0644]
tools/kvm/include/kvm/vnc.h [new file with mode: 0644]
tools/kvm/include/linux/bitops.h [new file with mode: 0644]
tools/kvm/include/linux/byteorder.h [new file with mode: 0644]
tools/kvm/include/linux/kernel.h [new file with mode: 0644]
tools/kvm/include/linux/module.h [new file with mode: 0644]
tools/kvm/include/linux/prefetch.h [new file with mode: 0644]
tools/kvm/include/linux/types.h [new file with mode: 0644]
tools/kvm/interrupt.c [new file with mode: 0644]
tools/kvm/ioeventfd.c [new file with mode: 0644]
tools/kvm/ioport.c [new file with mode: 0644]
tools/kvm/irq.c [new file with mode: 0644]
tools/kvm/kvm-cmd.c [new file with mode: 0644]
tools/kvm/kvm-cpu.c [new file with mode: 0644]
tools/kvm/kvm.c [new file with mode: 0644]
tools/kvm/main.c [new file with mode: 0644]
tools/kvm/mmio.c [new file with mode: 0644]
tools/kvm/mptable.c [new file with mode: 0644]
tools/kvm/net/uip/arp.c [new file with mode: 0644]
tools/kvm/net/uip/buf.c [new file with mode: 0644]
tools/kvm/net/uip/core.c [new file with mode: 0644]
tools/kvm/net/uip/csum.c [new file with mode: 0644]
tools/kvm/net/uip/dhcp.c [new file with mode: 0644]
tools/kvm/net/uip/icmp.c [new file with mode: 0644]
tools/kvm/net/uip/ipv4.c [new file with mode: 0644]
tools/kvm/net/uip/tcp.c [new file with mode: 0644]
tools/kvm/net/uip/udp.c [new file with mode: 0644]
tools/kvm/pci.c [new file with mode: 0644]
tools/kvm/read-write.c [new file with mode: 0644]
tools/kvm/symbol.c [new file with mode: 0644]
tools/kvm/term.c [new file with mode: 0644]
tools/kvm/tests/Makefile [new file with mode: 0644]
tools/kvm/tests/boot/Makefile [new file with mode: 0644]
tools/kvm/tests/boot/init.c [new file with mode: 0644]
tools/kvm/tests/kernel/.gitignore [new file with mode: 0644]
tools/kvm/tests/kernel/Makefile [new file with mode: 0644]
tools/kvm/tests/kernel/README [new file with mode: 0644]
tools/kvm/tests/kernel/kernel.S [new file with mode: 0644]
tools/kvm/tests/pit/.gitignore [new file with mode: 0644]
tools/kvm/tests/pit/Makefile [new file with mode: 0644]
tools/kvm/tests/pit/README [new file with mode: 0644]
tools/kvm/tests/pit/tick.S [new file with mode: 0644]
tools/kvm/threadpool.c [new file with mode: 0644]
tools/kvm/ui/sdl.c [new file with mode: 0644]
tools/kvm/ui/vnc.c [new file with mode: 0644]
tools/kvm/util.c [new file with mode: 0644]
tools/kvm/util/KVMTOOLS-VERSION-GEN [new file with mode: 0755]
tools/kvm/util/generate-cmdlist.sh [new file with mode: 0755]
tools/kvm/util/kvm-ifup-vbr0 [new file with mode: 0755]
tools/kvm/util/parse-options.c [new file with mode: 0644]
tools/kvm/util/rbtree-interval.c [new file with mode: 0644]
tools/kvm/util/set_private_br.sh [new file with mode: 0755]
tools/kvm/util/strbuf.c [new file with mode: 0644]
tools/kvm/virtio/9p-pdu.c [new file with mode: 0644]
tools/kvm/virtio/9p.c [new file with mode: 0644]
tools/kvm/virtio/balloon.c [new file with mode: 0644]
tools/kvm/virtio/blk.c [new file with mode: 0644]
tools/kvm/virtio/console.c [new file with mode: 0644]
tools/kvm/virtio/core.c [new file with mode: 0644]
tools/kvm/virtio/net.c [new file with mode: 0644]
tools/kvm/virtio/pci.c [new file with mode: 0644]
tools/kvm/virtio/rng.c [new file with mode: 0644]

index d73c7f468ab7259fdc6ab94d5cf5c33ab3df0fc8..0e19758aa0176958ea7ff603fdb535d4dd0f50e2 100644 (file)
@@ -4378,6 +4378,14 @@ L:       alsa-devel@alsa-project.org
 W:     http://www.native-instruments.com
 F:     sound/usb/caiaq/
 
+NATIVE LINUX KVM TOOL
+M:     Pekka Enberg <penberg@kernel.org>
+M:     Sasha Levin <levinsasha928@gmail.com>
+M:     Asias He <asias.hejun@gmail.com>
+L:     kvm@vger.kernel.org
+S:     Maintained
+F:     tools/kvm/
+
 NCP FILESYSTEM
 M:     Petr Vandrovec <petr@vandrovec.name>
 S:     Odd Fixes
diff --git a/tools/kvm/.gitignore b/tools/kvm/.gitignore
new file mode 100644 (file)
index 0000000..6ace4ec
--- /dev/null
@@ -0,0 +1,10 @@
+/kvm
+*.o
+*.d
+.cscope
+tags
+include/common-cmds.h
+tests/boot/boot_test.iso
+tests/boot/rootfs/
+guest/init
+KVMTOOLS-VERSION-FILE
diff --git a/tools/kvm/CREDITS-Git b/tools/kvm/CREDITS-Git
new file mode 100644 (file)
index 0000000..c2ddcb3
--- /dev/null
@@ -0,0 +1,30 @@
+Most of the infrastructure that 'kvm' uses here has been reused
+from the Git project, as of version:
+
+    66996ec: Sync with 1.6.2.4
+
+Here is an (incomplete!) list of main contributors to those files
+in util/* and elsewhere:
+
+ Alex Riesen
+ Christian Couder
+ Dmitry Potapov
+ Jeff King
+ Johannes Schindelin
+ Johannes Sixt
+ Junio C Hamano
+ Linus Torvalds
+ Matthias Kestenholz
+ Michal Ostrowski
+ Miklos Vajna
+ Petr Baudis
+ Pierre Habouzit
+ René Scharfe
+ Samuel Tardieu
+ Shawn O. Pearce
+ Steffen Prohaska
+ Steve Haslam
+
+Thanks guys!
+
+The full history of the files can be found in the upstream Git commits.
diff --git a/tools/kvm/Documentation/kvm-balloon.txt b/tools/kvm/Documentation/kvm-balloon.txt
new file mode 100644 (file)
index 0000000..a29a201
--- /dev/null
@@ -0,0 +1,24 @@
+kvm-balloon(1)
+================
+
+NAME
+----
+kvm-balloon - Inflate or deflate the virtio balloon
+
+SYNOPSIS
+--------
+[verse]
+'kvm balloon [command] [size] [instance]'
+
+DESCRIPTION
+-----------
+The command inflates or deflates the virtio balloon located in the
+specified instance.
+For a list of running instances see 'kvm list'.
+
+Command can be either 'inflate' or 'deflate'. Inflate increases the
+size of the balloon, thus decreasing the amount of virtual RAM available
+for the guest. Deflation returns previously inflated memory back to the
+guest.
+
+The size is specified in MB.
diff --git a/tools/kvm/Documentation/kvm-debug.txt b/tools/kvm/Documentation/kvm-debug.txt
new file mode 100644 (file)
index 0000000..273af15
--- /dev/null
@@ -0,0 +1,16 @@
+kvm-debug(1)
+============
+
+NAME
+----
+kvm-debug - Print debug information from a running instance
+
+SYNOPSIS
+--------
+[verse]
+'kvm debug [instance]'
+
+DESCRIPTION
+-----------
+The command prints debug information from a running instance.
+For a list of running instances see 'kvm list'.
diff --git a/tools/kvm/Documentation/kvm-list.txt b/tools/kvm/Documentation/kvm-list.txt
new file mode 100644 (file)
index 0000000..121e45c
--- /dev/null
@@ -0,0 +1,16 @@
+kvm-list(1)
+===========
+
+NAME
+----
+kvm-list - Print a list of running instances on the host.
+
+SYNOPSIS
+--------
+[verse]
+'kvm list'
+
+DESCRIPTION
+-----------
+This command prints a list of running instances on the host which
+belong to the user who currently ran 'kvm list'.
diff --git a/tools/kvm/Documentation/kvm-pause.txt b/tools/kvm/Documentation/kvm-pause.txt
new file mode 100644 (file)
index 0000000..770bcfe
--- /dev/null
@@ -0,0 +1,16 @@
+kvm-pause(1)
+============
+
+NAME
+----
+kvm-pause - Pause the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'kvm pause [instance]'
+
+DESCRIPTION
+-----------
+The command pauses a virtual machine.
+For a list of running instances see 'kvm list'.
diff --git a/tools/kvm/Documentation/kvm-resume.txt b/tools/kvm/Documentation/kvm-resume.txt
new file mode 100644 (file)
index 0000000..918648e
--- /dev/null
@@ -0,0 +1,16 @@
+kvm-resume(1)
+=============
+
+NAME
+----
+kvm-resume - Resume the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'kvm resume [instance]'
+
+DESCRIPTION
+-----------
+The command resumes a virtual machine.
+For a list of running instances see 'kvm list'.
diff --git a/tools/kvm/Documentation/kvm-run.txt b/tools/kvm/Documentation/kvm-run.txt
new file mode 100644 (file)
index 0000000..6513427
--- /dev/null
@@ -0,0 +1,62 @@
+kvm-run(1)
+==========
+
+NAME
+----
+kvm-run - Start the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'kvm run' [-k <kernel image> | --kernel <kernel image>]
+
+DESCRIPTION
+-----------
+The command starts a virtual machine.
+
+OPTIONS
+-------
+-m::
+--mem=::
+       Virtual machine memory size in MiB.
+
+-p::
+--params::
+       Additional kernel command line arguments.
+
+-r::
+--initrd=::
+       Initial RAM disk image.
+
+-k::
+--kernel=::
+       The virtual machine kernel.
+
+--dev=::
+       KVM device file.
+
+-i::
+--image=::
+       A disk image file.
+
+-s::
+--single-step::
+       Enable single stepping.
+
+-g::
+--ioport-debug::
+       Enable ioport debugging.
+
+-c::
+--enable-virtio-console::
+       Enable the virtual IO console.
+
+--cpus::
+       The number of virtual CPUs to run.
+
+--debug::
+       Enable debug messages.
+
+SEE ALSO
+--------
+linkkvm:
diff --git a/tools/kvm/Documentation/kvm-setup.txt b/tools/kvm/Documentation/kvm-setup.txt
new file mode 100644 (file)
index 0000000..c845d17
--- /dev/null
@@ -0,0 +1,15 @@
+kvm-setup(1)
+============
+
+NAME
+----
+kvm-setup - Set up a new virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'kvm setup <name>'
+
+DESCRIPTION
+-----------
+The command sets up a virtual machine.
diff --git a/tools/kvm/Documentation/kvm-stat.txt b/tools/kvm/Documentation/kvm-stat.txt
new file mode 100644 (file)
index 0000000..ce5ab54
--- /dev/null
@@ -0,0 +1,19 @@
+kvm-stat(1)
+===========
+
+NAME
+----
+kvm-stat - Print statistics about a running instance
+
+SYNOPSIS
+--------
+[verse]
+'kvm stat [command] [-n instance] [-p instance pid] [--all]'
+
+DESCRIPTION
+-----------
+The command prints statistics about a running instance.
+For a list of running instances see 'kvm list'.
+
+Commands:
+ --memory, -m  Display memory statistics
diff --git a/tools/kvm/Documentation/kvm-stop.txt b/tools/kvm/Documentation/kvm-stop.txt
new file mode 100644 (file)
index 0000000..5267081
--- /dev/null
@@ -0,0 +1,16 @@
+kvm-stop(1)
+===========
+
+NAME
+----
+kvm-stop - Stop a running instance
+
+SYNOPSIS
+--------
+[verse]
+'kvm stop [instance]'
+
+DESCRIPTION
+-----------
+The command stops a running instance.
+For a list of running instances see 'kvm list'.
diff --git a/tools/kvm/Documentation/kvm-version.txt b/tools/kvm/Documentation/kvm-version.txt
new file mode 100644 (file)
index 0000000..bf51540
--- /dev/null
@@ -0,0 +1,21 @@
+kvm-version(1)
+================
+
+NAME
+----
+kvm-version - Print the version of the kernel tree kvm tools
+was built on.
+
+SYNOPSIS
+--------
+[verse]
+'kvm version'
+
+DESCRIPTION
+-----------
+The command prints the version of the kernel that was used to build
+kvm tools.
+
+Note that the version is not the version of the kernel which is currently
+running on the host, but is the version of the kernel tree from which kvm
+tools was built.
diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile
new file mode 100644 (file)
index 0000000..efa032d
--- /dev/null
@@ -0,0 +1,285 @@
+#
+# Define WERROR=0 to disable -Werror.
+#
+
+ifeq ($(strip $(V)),)
+       E = @echo
+       Q = @
+else
+       E = @\#
+       Q =
+endif
+export E Q
+
+include config/utilities.mak
+include config/feature-tests.mak
+
+FIND   := find
+CSCOPE := cscope
+TAGS   := ctags
+
+PROGRAM        := kvm
+
+GUEST_INIT := guest/init
+
+OBJS   += builtin-balloon.o
+OBJS   += builtin-debug.o
+OBJS   += builtin-help.o
+OBJS   += builtin-list.o
+OBJS   += builtin-stat.o
+OBJS   += builtin-pause.o
+OBJS   += builtin-resume.o
+OBJS   += builtin-run.o
+OBJS   += builtin-setup.o
+OBJS   += builtin-stop.o
+OBJS   += builtin-version.o
+OBJS   += cpuid.o
+OBJS   += disk/core.o
+OBJS   += framebuffer.o
+OBJS   += guest_compat.o
+OBJS   += hw/rtc.o
+OBJS   += hw/serial.o
+OBJS   += interrupt.o
+OBJS   += ioport.o
+OBJS   += kvm-cpu.o
+OBJS   += kvm.o
+OBJS   += main.o
+OBJS   += mmio.o
+OBJS   += pci.o
+OBJS   += read-write.o
+OBJS   += term.o
+OBJS   += util.o
+OBJS   += virtio/blk.o
+OBJS   += virtio/console.o
+OBJS   += virtio/core.o
+OBJS   += virtio/net.o
+OBJS   += virtio/rng.o
+OBJS    += virtio/balloon.o
+OBJS   += virtio/pci.o
+OBJS   += disk/blk.o
+OBJS   += disk/qcow.o
+OBJS   += disk/raw.o
+OBJS   += ioeventfd.o
+OBJS   += irq.o
+OBJS   += net/uip/core.o
+OBJS   += net/uip/arp.o
+OBJS   += net/uip/icmp.o
+OBJS   += net/uip/ipv4.o
+OBJS   += net/uip/tcp.o
+OBJS   += net/uip/udp.o
+OBJS   += net/uip/buf.o
+OBJS   += net/uip/csum.o
+OBJS   += net/uip/dhcp.o
+OBJS   += kvm-cmd.o
+OBJS   += mptable.o
+OBJS   += rbtree.o
+OBJS   += threadpool.o
+OBJS   += util/parse-options.o
+OBJS   += util/rbtree-interval.o
+OBJS   += util/strbuf.o
+OBJS   += virtio/9p.o
+OBJS   += virtio/9p-pdu.o
+OBJS   += hw/vesa.o
+OBJS   += hw/i8042.o
+OBJS   += hw/pci-shmem.o
+
+FLAGS_BFD := $(CFLAGS) -lbfd
+has_bfd := $(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD))
+ifeq ($(has_bfd),y)
+       CFLAGS  += -DCONFIG_HAS_BFD
+       OBJS    += symbol.o
+       LIBS    += -lbfd
+endif
+
+FLAGS_VNCSERVER := $(CFLAGS) -lvncserver
+has_vncserver := $(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER))
+ifeq ($(has_vncserver),y)
+       OBJS    += ui/vnc.o
+       CFLAGS  += -DCONFIG_HAS_VNCSERVER
+       LIBS    += -lvncserver
+endif
+
+FLAGS_SDL := $(CFLAGS) -lSDL
+has_SDL := $(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL))
+ifeq ($(has_SDL),y)
+       OBJS    += ui/sdl.o
+       CFLAGS  += -DCONFIG_HAS_SDL
+       LIBS    += -lSDL
+endif
+
+DEPS   := $(patsubst %.o,%.d,$(OBJS))
+
+# Exclude BIOS object files from header dependencies.
+OBJS   += bios.o
+OBJS   += bios/bios-rom.o
+
+LIBS   += -lrt
+LIBS   += -lpthread
+
+# Additional ARCH settings for x86
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
+                  -e s/arm.*/arm/ -e s/sa110/arm/ \
+                  -e s/s390x/s390/ -e s/parisc64/parisc/ \
+                  -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
+                  -e s/sh[234].*/sh/ )
+
+uname_M      := $(shell uname -m | sed -e s/i.86/i386/)
+ifeq ($(uname_M),i386)
+       ARCH         := x86
+       DEFINES      += -DCONFIG_X86_32
+endif
+ifeq ($(uname_M),x86_64)
+       ARCH         := x86
+       DEFINES      += -DCONFIG_X86_64
+endif
+
+DEFINES        += -D_FILE_OFFSET_BITS=64
+DEFINES        += -D_GNU_SOURCE
+DEFINES        += -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"'
+
+KVM_INCLUDE := include
+CFLAGS += $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I../../include -I../../arch/$(ARCH)/include/ -Os -g
+
+ifneq ($(WERROR),0)
+       WARNINGS += -Werror
+endif
+
+WARNINGS += -Wall
+WARNINGS += -Wcast-align
+WARNINGS += -Wformat=2
+WARNINGS += -Winit-self
+WARNINGS += -Wmissing-declarations
+WARNINGS += -Wmissing-prototypes
+WARNINGS += -Wnested-externs
+WARNINGS += -Wno-system-headers
+WARNINGS += -Wold-style-definition
+WARNINGS += -Wredundant-decls
+WARNINGS += -Wsign-compare
+WARNINGS += -Wstrict-prototypes
+WARNINGS += -Wundef
+WARNINGS += -Wvolatile-register-var
+WARNINGS += -Wwrite-strings
+WARNINGS += -Wunused-result
+
+CFLAGS += $(WARNINGS)
+
+all: $(PROGRAM) $(GUEST_INIT)
+
+KVMTOOLS-VERSION-FILE:
+       @$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT)
+-include $(OUTPUT)KVMTOOLS-VERSION-FILE
+
+$(PROGRAM): $(DEPS) $(OBJS)
+       $(E) "  LINK    " $@
+       $(Q) $(CC) $(OBJS) $(LIBS) -o $@
+
+$(GUEST_INIT): guest/init.c
+       $(E) "  LINK    " $@
+       $(Q) $(CC) -static guest/init.c -o $@
+
+$(DEPS):
+
+%.d: %.c
+       $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@
+
+# The header file common-cmds.h is needed for compilation of builtin-help.c.
+builtin-help.d: $(KVM_INCLUDE)/common-cmds.h
+
+$(OBJS):
+
+rbtree.o: ../../lib/rbtree.c
+       $(Q) $(CC) -c $(CFLAGS) $< -o $@
+
+%.o: %.c
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS) $< -o $@
+
+
+$(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+$(KVM_INCLUDE)/common-cmds.h: $(wildcard Documentation/kvm-*.txt)
+       $(E) "  GEN     " $@
+       $(Q) util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+#
+# BIOS assembly weirdness
+#
+BIOS_CFLAGS += -m32
+BIOS_CFLAGS += -march=i386
+BIOS_CFLAGS += -mregparm=3
+
+bios.o: bios/bios.bin bios/bios-rom.h
+
+bios/bios.bin.elf: bios/entry.S bios/e820.c bios/int10.c bios/int15.c bios/rom.ld.S
+       $(E) "  CC       bios/e820.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s bios/e820.c -o bios/e820.o
+       $(E) "  CC       bios/int10.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s bios/int10.c -o bios/int10.o
+       $(E) "  CC       bios/int15.o"
+       $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s bios/int15.c -o bios/int15.o
+       $(E) "  CC       bios/entry.o"
+       $(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c -s bios/entry.S -o bios/entry.o
+       $(E) "  LD      " $@
+       $(Q) ld -T bios/rom.ld.S -o bios/bios.bin.elf bios/entry.o bios/e820.o bios/int10.o bios/int15.o
+
+bios/bios.bin: bios/bios.bin.elf
+       $(E) "  OBJCOPY " $@
+       $(Q) objcopy -O binary -j .text bios/bios.bin.elf bios/bios.bin
+
+bios/bios-rom.o: bios/bios-rom.S bios/bios.bin bios/bios-rom.h
+       $(E) "  CC      " $@
+       $(Q) $(CC) -c $(CFLAGS) bios/bios-rom.S -o bios/bios-rom.o
+
+bios/bios-rom.h: bios/bios.bin.elf
+       $(E) "  NM      " $@
+       $(Q) cd bios && sh gen-offsets.sh > bios-rom.h && cd ..
+
+check: $(PROGRAM)
+       $(MAKE) -C tests
+       ./$(PROGRAM) run tests/pit/tick.bin
+       ./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init"
+.PHONY: check
+
+clean:
+       $(E) "  CLEAN"
+       $(Q) rm -f bios/*.bin
+       $(Q) rm -f bios/*.elf
+       $(Q) rm -f bios/*.o
+       $(Q) rm -f bios/bios-rom.h
+       $(Q) rm -f tests/boot/boot_test.iso
+       $(Q) rm -rf tests/boot/rootfs/
+       $(Q) rm -f $(DEPS) $(OBJS) $(PROGRAM) $(GUEST_INIT)
+       $(Q) rm -f cscope.*
+       $(Q) rm -f $(KVM_INCLUDE)/common-cmds.h
+       $(Q) rm -f KVMTOOLS-VERSION-FILE
+.PHONY: clean
+
+KVM_DEV        ?= /dev/kvm
+
+$(KVM_DEV):
+       $(E) "  MKNOD " $@
+       $(Q) mknod $@ char 10 232
+
+devices: $(KVM_DEV)
+.PHONY: devices
+
+TAGS:
+       $(E) "  GEN" $@
+       $(Q) $(RM) -f TAGS
+       $(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+.PHONY: TAGS
+
+tags:
+       $(E) "  GEN" $@
+       $(Q) $(RM) -f tags
+       $(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+.PHONY: tags
+
+cscope:
+       $(E) "  GEN" $@
+       $(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
+       $(Q) $(CSCOPE) -bkqu
+.PHONY: cscope
+
+# Deps
+-include $(DEPS)
diff --git a/tools/kvm/README b/tools/kvm/README
new file mode 100644 (file)
index 0000000..5b85461
--- /dev/null
@@ -0,0 +1,109 @@
+Native Linux KVM tool
+=====================
+The goal of this tool is to provide a clean, from-scratch, lightweight
+KVM host tool implementation that can boot Linux guest images (just a
+hobby, won't be big and professional like QEMU) with no BIOS
+dependencies and with only the minimal amount of legacy device
+emulation.
+
+It's great as a learning tool if you want to get your feet wet in
+virtualization land: it's only 5 KLOC of clean C code that can already
+boot a guest Linux image.
+
+Right now it can boot a Linux image and provide you output via a serial
+console, over the host terminal, i.e. you can use it to boot a guest
+Linux image in a terminal or over ssh and log into the guest without
+much guest or host side setup work needed.
+
+1. To try out the tool, clone the git repository:
+
+  git clone git://github.com/penberg/linux-kvm.git
+
+or alternatively, if you already have a kernel source tree:
+
+  git remote add kvm-tool git://github.com/penberg/linux-kvm.git
+  git remote update
+  git checkout -b kvm-tool/master kvm-tool
+
+2. Compile the tool:
+
+  cd tools/kvm && make
+
+3. Download a raw userspace image:
+
+  wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && bunzip2 \
+    linux-0.2.img.bz2
+
+4. The guest kernel has to be built with the following configuration:
+
+ - For the default console output:
+       CONFIG_SERIAL_8250=y
+       CONFIG_SERIAL_8250_CONSOLE=y
+
+ - For running 32bit images on 64bit hosts:
+       CONFIG_IA32_EMULATION=y
+
+ - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS).
+
+ - For all virtio devices listed below:
+       CONFIG_VIRTIO=y
+       CONFIG_VIRTIO_RING=y
+       CONFIG_VIRTIO_PCI=y
+
+ - For virtio-blk devices (--disk, -d):
+       CONFIG_VIRTIO_BLK=y
+
+ - For virtio-net devices ([--network, -n] virtio):
+       CONFIG_VIRTIO_NET=y
+
+ - For virtio-9p devices (--virtio-9p):
+       CONFIG_NET_9P=y
+       CONFIG_NET_9P_VIRTIO=y
+       CONFIG_9P_FS=y
+
+ - For virtio-balloon device (--balloon):
+       CONFIG_VIRTIO_BALLOON=y
+
+ - For virtio-console device (--console virtio):
+       CONFIG_VIRTIO_CONSOLE=y
+
+ - For virtio-rng device (--rng):
+       CONFIG_HW_RANDOM_VIRTIO=y
+
+
+5. And finally, launch the hypervisor:
+
+  ./kvm run --disk linux-0.2.img \
+           --kernel ../../arch/x86/boot/bzImage
+or
+
+  sudo ./kvm run --disk linux-0.2.img \
+                --kernel ../../arch/x86/boot/bzImage \
+                --network virtio
+
+The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He,
+Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help
+on KVM internals and Ingo Molnar for all-around support and encouragement!
+
+See the following thread for original discussion for motivation of this
+project:
+
+http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620
+
+Build dependencies
+=====================
+For deb based systems:
+32-bit:
+sudo apt-get install build-essential
+64-bit:
+sudo apt-get install build-essential libc6-dev-i386
+
+For rpm based systems:
+32-bit:
+yum install glibc-devel
+64-bit:
+yum install glibc-devel glibc-devel.i386
+
+On 64-bit Arch Linux make sure the multilib repository is enabled in your
+/etc/pacman.conf and run
+pacman -Sy lib32-glibc
diff --git a/tools/kvm/bios.c b/tools/kvm/bios.c
new file mode 100644 (file)
index 0000000..6aefd1b
--- /dev/null
@@ -0,0 +1,157 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/e820.h"
+#include "kvm/interrupt.h"
+#include "kvm/util.h"
+
+#include <string.h>
+#include <asm/e820.h>
+
+#include "bios/bios-rom.h"
+
+struct irq_handler {   /* one BIOS interrupt handler to be installed into guest memory */
+       unsigned long           address;        /* guest flat address the handler code is copied to */
+       unsigned int            irq;            /* interrupt number passed to interrupt_table__set() */
+       void                    *handler;       /* host pointer to the handler code inside bios_rom */
+       size_t                  size;           /* number of bytes of handler code to copy */
+};
+
+#define BIOS_IRQ_PA_ADDR(name) (MB_BIOS_BEGIN + BIOS_OFFSET__##name) /* guest address: BIOS base plus the entry's offset */
+#define BIOS_IRQ_FUNC(name)    ((char *)&bios_rom[BIOS_OFFSET__##name]) /* host pointer to the entry inside bios_rom */
+#define BIOS_IRQ_SIZE(name)    (BIOS_ENTRY_SIZE(BIOS_OFFSET__##name)) /* entry size; BIOS_ENTRY_SIZE presumably comes from bios-rom.h -- verify */
+
+#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler)                        \
+       {                                                       \
+               .irq            = _irq,                         \
+               .address        = BIOS_IRQ_PA_ADDR(_handler),   \
+               .handler        = BIOS_IRQ_FUNC(_handler),      \
+               .size           = BIOS_IRQ_SIZE(_handler),      \
+       }
+
+static struct irq_handler bios_irq_handlers[] = {      /* handlers installed one by one from setup_bios() */
+       DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10),
+       DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15),
+};
+
+static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler)
+{      /* Copy one handler into guest memory and register it in kvm->interrupt_table. */
+       struct real_intr_desc intr_desc;
+       void *p;
+
+       p       = guest_flat_to_host(kvm, handler->address);
+       memcpy(p, handler->handler, handler->size);     /* place the handler code at its guest address */
+
+       intr_desc = (struct real_intr_desc) {
+               .segment        = REAL_SEGMENT(MB_BIOS_BEGIN),
+               .offset         = handler->address - MB_BIOS_BEGIN,
+       };
+
+       DIE_IF((handler->address - MB_BIOS_BEGIN) > (unsigned long)0xffff);     /* the offset from MB_BIOS_BEGIN must not exceed 0xffff */
+
+       interrupt_table__set(&kvm->interrupt_table, &intr_desc, handler->irq);
+}
+
+/**
+ * e820_setup - write a simple E820 memory map into guest memory at E820_MAP_START
+ * @kvm - guest system descriptor; its ram_size selects a one- or two-entry RAM layout
+ */
+static void e820_setup(struct kvm *kvm)
+{
+       struct e820map *e820;
+       struct e820entry *mem_map;
+       unsigned int i = 0;
+
+       e820            = guest_flat_to_host(kvm, E820_MAP_START);
+       mem_map         = e820->map;
+
+       mem_map[i++]    = (struct e820entry) { /* RAM from REAL_MODE_IVT_BEGIN up to EBDA_START */
+               .addr           = REAL_MODE_IVT_BEGIN,
+               .size           = EBDA_START - REAL_MODE_IVT_BEGIN,
+               .type           = E820_RAM,
+       };
+       mem_map[i++]    = (struct e820entry) { /* reserved: EBDA_START up to VGA_RAM_BEGIN */
+               .addr           = EBDA_START,
+               .size           = VGA_RAM_BEGIN - EBDA_START,
+               .type           = E820_RESERVED,
+       };
+       mem_map[i++]    = (struct e820entry) { /* reserved: the BIOS area MB_BIOS_BEGIN..MB_BIOS_END */
+               .addr           = MB_BIOS_BEGIN,
+               .size           = MB_BIOS_END - MB_BIOS_BEGIN,
+               .type           = E820_RESERVED,
+       };
+       if (kvm->ram_size < KVM_32BIT_GAP_START) {      /* all RAM ends below the 32-bit gap: one entry */
+               mem_map[i++]    = (struct e820entry) {
+                       .addr           = BZ_KERNEL_START,
+                       .size           = kvm->ram_size - BZ_KERNEL_START,
+                       .type           = E820_RAM,
+               };
+       } else {        /* RAM is split: one entry up to the gap, the remainder at 0x100000000 */
+               mem_map[i++]    = (struct e820entry) {
+                       .addr           = BZ_KERNEL_START,
+                       .size           = KVM_32BIT_GAP_START - BZ_KERNEL_START,
+                       .type           = E820_RAM,
+               };
+               mem_map[i++]    = (struct e820entry) {
+                       .addr           = 0x100000000ULL,
+                       .size           = kvm->ram_size - KVM_32BIT_GAP_START,
+                       .type           = E820_RAM,
+               };
+       }
+
+       BUILD_BUG_ON(i > E820_X_MAX);   /* NOTE(review): i is a runtime counter; confirm a build-time check is what is intended here */
+
+       e820->nr_map                    = i;    /* number of entries written above */
+}
+
+/**
+ * setup_bios - inject the BIOS ROM, E820 map and real-mode interrupt table into guest memory
+ * @kvm - guest system descriptor
+ */
+void setup_bios(struct kvm *kvm)
+{
+       unsigned long address = MB_BIOS_BEGIN; /* this initial value is reassigned below before it is read */
+       struct real_intr_desc intr_desc;
+       unsigned int i;
+       void *p;
+
+       /*
+        * Before anything else, zero some known areas (BDA, EBDA, BIOS
+        * and VGA ROM): we definitely don't want any trash here.
+        */
+       p = guest_flat_to_host(kvm, BDA_START);
+       memset(p, 0, BDA_END - BDA_START);
+
+       p = guest_flat_to_host(kvm, EBDA_START);
+       memset(p, 0, EBDA_END - EBDA_START);
+
+       p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+       memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN);
+
+       p = guest_flat_to_host(kvm, VGA_ROM_BEGIN);
+       memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN);
+
+       /* copy the BIOS ROM image into place at MB_BIOS_BEGIN */
+       p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+       memcpy(p, bios_rom, bios_rom_size);
+
+       /* E820 memory map must be present */
+       e820_setup(kvm);
+
+       /*
+        * Set up a *fake* real mode vector table: it has only
+        * one real handler, which does just iret
+        */
+       address = BIOS_IRQ_PA_ADDR(bios_intfake);
+       intr_desc = (struct real_intr_desc) {
+               .segment        = REAL_SEGMENT(MB_BIOS_BEGIN),
+               .offset         = address - MB_BIOS_BEGIN,
+       };
+       interrupt_table__setup(&kvm->interrupt_table, &intr_desc);
+
+       for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++)     /* then install the handlers listed in bios_irq_handlers */
+               setup_irq_handler(kvm, &bios_irq_handlers[i]);
+
+       /* we are almost done: copy the interrupt table to guest address 0 */
+       p = guest_flat_to_host(kvm, 0);
+       interrupt_table__copy(&kvm->interrupt_table, p, REAL_INTR_SIZE);
+}
diff --git a/tools/kvm/bios/.gitignore b/tools/kvm/bios/.gitignore
new file mode 100644 (file)
index 0000000..1f0080b
--- /dev/null
@@ -0,0 +1,3 @@
+bios-rom.bin
+bios-rom.bin.elf
+bios-rom.h
diff --git a/tools/kvm/bios/bios-rom.S b/tools/kvm/bios/bios-rom.S
new file mode 100644 (file)
index 0000000..dc52b1e
--- /dev/null
@@ -0,0 +1,12 @@
+#include <kvm/assembly.h>
+
+       .org 0
+#ifdef CONFIG_X86_64
+       .code64
+#else
+       .code32
+#endif
+
+GLOBAL(bios_rom)
+       .incbin "bios/bios.bin"
+END(bios_rom)
diff --git a/tools/kvm/bios/e820.c b/tools/kvm/bios/e820.c
new file mode 100644 (file)
index 0000000..1eafb5b
--- /dev/null
@@ -0,0 +1,73 @@
+#include "kvm/e820.h"
+
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+#include "kvm/util.h"
+
+#include <asm/processor-flags.h>
+#include <asm/e820.h>
+
+/* Load the %fs segment register with @seg. */
+static inline void set_fs(u16 seg)
+{
+       asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+
+/* Read one byte at offset @addr relative to %fs (addr32-prefixed access). */
+static inline u8 rdfs8(unsigned long addr)
+{
+       u8 v;
+
+       asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+
+       return v;
+}
+
+/* Read a 32-bit value at offset @addr relative to %fs (addr32-prefixed access). */
+static inline u32 rdfs32(unsigned long addr)
+{
+       u32 v;
+
+       asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr));
+
+       return v;
+}
+
+/**
+ * e820_query_map - serve one INT 15h, EAX=E820h memory map query
+ * @regs: register frame saved by the interrupt entry code
+ *
+ * On entry EBX holds the index of the map entry to fetch and EDI the
+ * destination buffer.  The entry is read through %fs from the map stored
+ * at E820_MAP_START.  On success CF is cleared, EAX is set to SMAP, ECX to
+ * the entry size and EBX to the next index, or to 0 once the last entry
+ * has been returned.
+ *
+ * An index at or past the end of the map (which includes any query of an
+ * empty map) fails with CF set and EBX zeroed, rather than reporting
+ * success without having written an entry.
+ */
+bioscall void e820_query_map(struct biosregs *regs)
+{
+       struct e820map *e820;
+       unsigned int i;
+       u32 map_size;
+       u16 fs_seg;
+       u32 start;
+       u32 ndx;
+       u8 *p;
+
+       e820            = (struct e820map *)E820_MAP_START;
+       fs_seg          = flat_to_seg16(E820_MAP_START);
+       set_fs(fs_seg);
+
+       ndx             = regs->ebx;
+
+       map_size        = rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg));
+
+       if (ndx >= map_size) {
+               /* Nothing to copy: report failure instead of faking success. */
+               regs->ebx       = 0;
+               regs->eflags    |= X86_EFLAGS_CF;
+               return;
+       }
+
+       /* %fs still addresses the map; copy the entry out byte by byte. */
+       start   = (u32)&e820->map[ndx];
+       p       = (void *) regs->edi;
+
+       for (i = 0; i < sizeof(struct e820entry); i++)
+               *p++    = rdfs8(flat_to_off16(start + i, fs_seg));
+
+       regs->eax       = SMAP;
+       regs->ecx       = sizeof(struct e820entry);
+       regs->ebx       = ++ndx;
+
+       /* Clear CF to indicate success.  */
+       regs->eflags    &= ~X86_EFLAGS_CF;
+
+       if (ndx >= map_size)
+               regs->ebx       = 0;    /* end of map */
+}
diff --git a/tools/kvm/bios/entry.S b/tools/kvm/bios/entry.S
new file mode 100644 (file)
index 0000000..2ee21a9
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * Our pretty trivial BIOS emulation
+ */
+
+#include <kvm/bios.h>
+#include <kvm/assembly.h>
+
+       .org 0
+       .code16gcc
+
+#define EFLAGS_CF      (1 << 0)
+
+#include "macro.S"
+
+/* If you change these macros, remember to update 'struct biosregs' */
+.macro SAVE_BIOSREGS
+       pushl   %fs
+       pushl   %es
+       pushl   %edi
+       pushl   %esi
+       pushl   %ebp
+       pushl   %esp
+       pushl   %edx
+       pushl   %ecx
+       pushl   %ebx
+       pushl   %eax
+.endm
+
+.macro RESTORE_BIOSREGS
+       popl    %eax
+       popl    %ebx
+       popl    %ecx
+       popl    %edx
+       popl    %esp
+       popl    %ebp
+       popl    %esi
+       popl    %edi
+       popl    %es
+       popl    %fs
+.endm
+
+/*
+ * fake interrupt handler, nothing can be faster ever
+ */
+ENTRY(bios_intfake)
+       /*
+        * Set CF to indicate failure. We don't want callers to think that the
+        * interrupt handler succeeded and then treat the return values in
+        * registers as valid data.
+        */
+       orl     $EFLAGS_CF, 0x4(%esp)
+
+       IRET
+ENTRY_END(bios_intfake)
+
+/*
+ * int 10 - video - service
+ */
+ENTRY(bios_int10)
+       SAVE_BIOSREGS
+
+       movl            %esp, %eax
+       /* this is way easier than doing it in assembly */
+       /* just push all the regs and jump to a C handler */
+       call    int10_handler
+
+       RESTORE_BIOSREGS
+
+       /* Clear CF to indicate success.  */
+       andl    $~EFLAGS_CF, 0x4(%esp)
+
+       IRET
+ENTRY_END(bios_int10)
+
+/*
+ * int 15 - handled in C by int15_handler
+ *
+ * The saved register frame is passed to the handler in %eax.  Unlike
+ * bios_int10, CF is not modified here: int15_handler and e820_query_map
+ * report success or failure through the eflags field of that frame.
+ */
+ENTRY(bios_int15)
+       SAVE_BIOSREGS
+
+       movl    %esp, %eax
+       call    int15_handler
+
+       RESTORE_BIOSREGS
+
+       IRET
+ENTRY_END(bios_int15)
+
+GLOBAL(__locals)
+
+#include "local.S"
+
+END(__locals)
diff --git a/tools/kvm/bios/gen-offsets.sh b/tools/kvm/bios/gen-offsets.sh
new file mode 100644 (file)
index 0000000..8771bbe
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+echo "/* Autogenerated file, don't edit */"
+echo "#ifndef BIOS_OFFSETS_H"
+echo "#define BIOS_OFFSETS_H"
+
+echo ""
+echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)"
+echo ""
+
+nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }'
+
+echo ""
+echo "#endif"
diff --git a/tools/kvm/bios/int10.c b/tools/kvm/bios/int10.c
new file mode 100644 (file)
index 0000000..22da9fa
--- /dev/null
@@ -0,0 +1,151 @@
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+#include "kvm/util.h"
+#include "kvm/vesa.h"
+#include <stdint.h>
+
+#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
+
+/* VESA General Information table */
+struct vesa_general_info {
+       u32     signature;              /* 0 Magic number = "VESA" */
+       u16     version;                /* 4 */
+       void    *vendor_string;         /* 6 */
+       u32     capabilities;           /* 10 */
+       void    *video_mode_ptr;        /* 14 */
+       u16     total_memory;           /* 18 */
+       u16     modes[2];               /* 20 */
+       char    oem_string[11];         /* 24 */
+
+       u8      reserved[223];          /* 35 */
+} __attribute__ ((packed));
+
+struct vminfo {
+       u16     mode_attr;              /* 0 */
+       u8      win_attr[2];            /* 2 */
+       u16     win_grain;              /* 4 */
+       u16     win_size;               /* 6 */
+       u16     win_seg[2];             /* 8 */
+       u32     win_scheme;             /* 12 */
+       u16     logical_scan;           /* 16 */
+
+       u16     h_res;                  /* 18 */
+       u16     v_res;                  /* 20 */
+       u8      char_width;             /* 22 */
+       u8      char_height;            /* 23 */
+       u8      memory_planes;          /* 24 */
+       u8      bpp;                    /* 25 */
+       u8      banks;                  /* 26 */
+       u8      memory_layout;          /* 27 */
+       u8      bank_size;              /* 28 */
+       u8      image_planes;           /* 29 */
+       u8      page_function;          /* 30 */
+
+       u8      rmask;                  /* 31 */
+       u8      rpos;                   /* 32 */
+       u8      gmask;                  /* 33 */
+       u8      gpos;                   /* 34 */
+       u8      bmask;                  /* 35 */
+       u8      bpos;                   /* 36 */
+       u8      resv_mask;              /* 37 */
+       u8      resv_pos;               /* 38 */
+       u8      dcm_info;               /* 39 */
+
+       u32     lfb_ptr;                /* 40 Linear frame buffer address */
+       u32     offscreen_ptr;          /* 44 Offscreen memory address */
+       u16     offscreen_size;         /* 48 */
+
+       u8      reserved[206];          /* 50 */
+};
+
+static inline void outb(unsigned short port, unsigned char val)
+{
+       asm volatile("outb %0, %1" : : "a"(val), "Nd"(port));
+}
+
+/*
+ * It's probably much more useful to make this print to the serial
+ * line rather than print to a non-displayed VGA memory
+ */
+static inline void int10_putchar(struct biosregs *args)
+{
+       u8 al = args->eax & 0xFF;
+
+       outb(0x3f8, al);
+}
+
+/*
+ * Fill the struct vminfo that args->edi points at with the description of
+ * the one mode offered here: VESA_WIDTH x VESA_HEIGHT at VESA_BPP bits per
+ * pixel, with the frame buffer at VESA_MEM_ADDR.  Fields not listed in the
+ * initializer are zeroed.
+ */
+static void vbe_get_mode(struct biosregs *args)
+{
+       struct vminfo *info = (struct vminfo *) args->edi;
+
+       *info = (struct vminfo) {
+               .mode_attr              = 0xd9, /* 11011001 */
+               .logical_scan           = VESA_WIDTH*4,
+               .h_res                  = VESA_WIDTH,
+               .v_res                  = VESA_HEIGHT,
+               .bpp                    = VESA_BPP,
+               .memory_layout          = 6,
+               .memory_planes          = 1,
+               .lfb_ptr                = VESA_MEM_ADDR,
+               .rmask                  = 8,
+               .gmask                  = 8,
+               .bmask                  = 8,
+               .resv_mask              = 8,
+               .resv_pos               = 24,
+               .bpos                   = 16,
+               .gpos                   = 8,
+       };
+}
+
+/*
+ * Fill the struct vesa_general_info that args->edi points at.  The vendor
+ * string and mode list pointers refer back into the same structure
+ * (oem_string and modes); the mode list holds 0x0112 followed by the
+ * 0xffff terminator.
+ */
+static void vbe_get_info(struct biosregs *args)
+{
+       struct vesa_general_info *info = (struct vesa_general_info *) args->edi;
+
+       *info = (struct vesa_general_info) {
+               .signature              = VESA_MAGIC,
+               .version                = 0x102,
+               .vendor_string          = &info->oem_string,
+               .capabilities           = 0x10,
+               .video_mode_ptr         = &info->modes,
+               .total_memory           = (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000,
+               .oem_string             = "KVM VESA",
+               .modes                  = { 0x0112, 0xffff },
+       };
+}
+
+#define VBE_STATUS_OK          0x004F
+
+/*
+ * Dispatch a VBE call on AL: 0x00 -> vbe_get_info(), 0x01 -> vbe_get_mode().
+ *
+ * EAX is set to VBE_STATUS_OK for every value of AL, including the ones
+ * that are not handled here, so all other subfunctions appear to succeed
+ * without doing anything.
+ */
+static void int10_vesa(struct biosregs *args)
+{
+       u8 al;
+
+       al = args->eax & 0xff;
+
+       switch (al) {
+       case 0x00:
+               vbe_get_info(args);
+               break;
+       case 0x01:
+               vbe_get_mode(args);
+               break;
+       }
+
+       args->eax = VBE_STATUS_OK;
+}
+
+/*
+ * C handler for int 0x10, dispatching on AH: 0x0e emits AL through
+ * int10_putchar() and 0x4f is forwarded to int10_vesa().  Any other
+ * function number is silently ignored.
+ */
+bioscall void int10_handler(struct biosregs *args)
+{
+       u8 ah;
+
+       ah = (args->eax & 0xff00) >> 8;
+
+       switch (ah) {
+       case 0x0e:
+               int10_putchar(args);
+               break;
+       case 0x4f:
+               int10_vesa(args);
+               break;
+       }
+
+}
diff --git a/tools/kvm/bios/int15.c b/tools/kvm/bios/int15.c
new file mode 100644 (file)
index 0000000..faf5343
--- /dev/null
@@ -0,0 +1,18 @@
+#include "kvm/bios.h"
+
+#include "kvm/e820.h"
+
+#include <asm/processor-flags.h>
+
+/*
+ * C handler for int 0x15.  Only EAX == 0xe820 is serviced, by
+ * e820_query_map(); note that the whole 32-bit EAX is compared.  Every
+ * other value fails with CF set in the saved eflags.
+ */
+bioscall void int15_handler(struct biosregs *regs)
+{
+       switch (regs->eax) {
+       case 0xe820:
+               e820_query_map(regs);
+               break;
+       default:
+               /* Set CF to indicate failure.  */
+               regs->eflags    |= X86_EFLAGS_CF;
+               break;
+       }
+}
diff --git a/tools/kvm/bios/local.S b/tools/kvm/bios/local.S
new file mode 100644 (file)
index 0000000..f2cdbf4
--- /dev/null
@@ -0,0 +1,7 @@
+/*
+ * Local variables for almost every BIOS irq handler
+ * Must be put somewhere inside irq handler body
+ */
+__CALLER_SS:           .int  0
+__CALLER_SP:           .long 0
+__CALLER_CLOBBER:      .long 0
diff --git a/tools/kvm/bios/macro.S b/tools/kvm/bios/macro.S
new file mode 100644 (file)
index 0000000..0d5e567
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * handy BIOS macros
+ */
+
+/*
+ * switch to BIOS stack
+ */
+.macro stack_swap
+       movw %ss, %cs:(__CALLER_SS)
+       movl %esp, %cs:(__CALLER_SP)
+       movl %edx, %cs:(__CALLER_CLOBBER)
+       movw $MB_BIOS_SS, %dx
+       movw %dx, %ss
+       movw $MB_BIOS_SP, %sp
+       movl %cs:(__CALLER_CLOBBER), %edx
+.endm
+
+/*
+ * restore the original stack
+ */
+.macro stack_restore
+       movl %cs:(__CALLER_SP), %esp
+       movw %cs:(__CALLER_SS), %ss
+.endm
+
diff --git a/tools/kvm/bios/rom.ld.S b/tools/kvm/bios/rom.ld.S
new file mode 100644 (file)
index 0000000..f122b97
--- /dev/null
@@ -0,0 +1,17 @@
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+PHDRS {
+       text PT_LOAD FLAGS(5);          /* R_E */
+       data PT_LOAD FLAGS(7);          /* RWE */
+       user PT_LOAD FLAGS(5);          /* R_E */
+       percpu PT_LOAD FLAGS(6);        /* RW_ */
+       init PT_LOAD FLAGS(7);          /* RWE */
+       note PT_NOTE FLAGS(0);          /* ___ */
+}
+
+SECTIONS {
+       . = 0;
+       .text : { *(.text) } :text = 0x9090
+}
+
diff --git a/tools/kvm/builtin-balloon.c b/tools/kvm/builtin-balloon.c
new file mode 100644 (file)
index 0000000..7329063
--- /dev/null
@@ -0,0 +1,75 @@
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-balloon.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm.h>
+
+static pid_t instance_pid;
+static const char *instance_name;
+static u64 inflate;
+static u64 deflate;
+
+static const char * const balloon_usage[] = {
+       "kvm balloon [-n name] [-p pid] [-i amount] [-d amount]",
+       NULL
+};
+
+static const struct option balloon_options[] = {
+       OPT_GROUP("Instance options:"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_INTEGER('p', "pid", &instance_pid, "Instance pid"),
+       OPT_GROUP("Balloon options:"),
+       OPT_U64('i', "inflate", &inflate, "Amount to inflate"),
+       OPT_U64('d', "deflate", &deflate, "Amount to deflate"),
+       OPT_END(),
+};
+
+void kvm_balloon_help(void)
+{
+       usage_with_options(balloon_usage, balloon_options);
+}
+
+static void parse_balloon_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, balloon_options, balloon_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_balloon_help();
+       }
+}
+
+/*
+ * "kvm balloon" entry point.
+ *
+ * Sends SIGKVMADDMEM once per unit of --inflate, or SIGKVMDELMEM once per
+ * unit of --deflate, to the instance selected by name or pid.  When both
+ * amounts are given only the inflate request is carried out.  The usage
+ * text is shown when no amount or no instance was specified.  The result
+ * of kill() is not checked; the function returns 0.
+ */
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix)
+{
+       u64 i;
+
+       parse_balloon_options(argc, argv);
+
+       if (inflate == 0 && deflate == 0)
+               kvm_balloon_help();
+
+       if (instance_name == NULL &&
+           instance_pid == 0)
+               kvm_balloon_help();
+
+       /* A name, when given, overrides any pid passed with -p. */
+       if (instance_name)
+               instance_pid = kvm__get_pid_by_instance(instance_name);
+
+       if (instance_pid <= 0)
+               die("Failed locating instance");
+
+       if (inflate)
+               for (i = 0; i < inflate; i++)
+                       kill(instance_pid, SIGKVMADDMEM);
+       else if (deflate)
+               for (i = 0; i < deflate; i++)
+                       kill(instance_pid, SIGKVMDELMEM);
+       else
+               kvm_balloon_help();
+
+       return 0;
+}
diff --git a/tools/kvm/builtin-debug.c b/tools/kvm/builtin-debug.c
new file mode 100644 (file)
index 0000000..4be12cc
--- /dev/null
@@ -0,0 +1,66 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-debug.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static pid_t instance_pid;
+static const char *instance_name;
+
+static const char * const debug_usage[] = {
+       "kvm debug [--all] [-n name] [-p pid]",
+       NULL
+};
+
+static const struct option debug_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Debug all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_INTEGER('p', "pid", &instance_pid, "Instance pid"),
+       OPT_END()
+};
+
+static void parse_debug_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, debug_options, debug_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_debug_help();
+       }
+}
+
+void kvm_debug_help(void)
+{
+       usage_with_options(debug_usage, debug_options);
+}
+
+static int do_debug(const char *name, int pid)
+{
+       return kill(pid, SIGQUIT);
+}
+
+/*
+ * "kvm debug" entry point: deliver SIGQUIT to every instance (--all) or
+ * to the single instance selected by name or pid.
+ */
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix)
+{
+       parse_debug_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_debug);
+
+       /* Neither a name nor a pid was given: show the usage text. */
+       if (!instance_name && !instance_pid)
+               kvm_debug_help();
+
+       /* A name, when given, overrides any pid passed with -p. */
+       if (instance_name != NULL)
+               instance_pid = kvm__get_pid_by_instance(instance_name);
+
+       if (instance_pid < 1)
+               die("Failed locating instance");
+
+       return do_debug(instance_name, instance_pid);
+}
diff --git a/tools/kvm/builtin-help.c b/tools/kvm/builtin-help.c
new file mode 100644 (file)
index 0000000..e70044a
--- /dev/null
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <string.h>
+
+/* user defined headers */
+#include <common-cmds.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-help.h>
+
+
+const char kvm_usage_string[] =
+       "kvm COMMAND [ARGS]";
+
+const char kvm_more_info_string[] =
+       "See 'kvm help COMMAND' for more information on a specific command.";
+
+
+/*
+ * Print every entry of common_cmds followed by its help text, with the
+ * command names left-aligned and padded to the width of the longest one.
+ */
+static void list_common_cmds_help(void)
+{
+       size_t longest = 0;
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+               size_t len = strlen(common_cmds[i].name);
+
+               if (longest < len)
+                       longest = len;
+       }
+
+       puts(" The most commonly used kvm commands are:");
+       for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+               /* A '*' field width consumes an int argument. */
+               printf("   %-*s   ", (int)longest, common_cmds[i].name);
+               puts(common_cmds[i].help);
+       }
+}
+
+static void kvm_help(void)
+{
+       printf("\n usage: %s\n\n", kvm_usage_string);
+       list_common_cmds_help();
+       printf("\n %s\n\n", kvm_more_info_string);
+}
+
+
+static void help_cmd(const char *cmd)
+{
+       struct cmd_struct *p;
+       p = kvm_get_command(kvm_commands, cmd);
+       if (!p)
+               kvm_help();
+       else if (p->help)
+               p->help();
+}
+
+/*
+ * "kvm help" entry point: show help for the command named by the first
+ * argument, or the general overview when no argument was given.
+ */
+int kvm_cmd_help(int argc, const char **argv, const char *prefix)
+{
+       if (argv && *argv)
+               help_cmd(argv[0]);
+       else
+               kvm_help();
+
+       return 0;
+}
diff --git a/tools/kvm/builtin-list.c b/tools/kvm/builtin-list.c
new file mode 100644 (file)
index 0000000..fcf9bb0
--- /dev/null
@@ -0,0 +1,66 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <fcntl.h>
+
+#define PROCESS_NAME "kvm"
+
+static const char * const list_usage[] = {
+       "kvm list",
+       NULL
+};
+
+static const struct option list_options[] = {
+       OPT_END()
+};
+
+void kvm_list_help(void)
+{
+       usage_with_options(list_usage, list_options);
+}
+
+/*
+ * Print one "PID GUEST" line for the instance @name running as @pid.
+ *
+ * The command name is read from /proc/<pid>/stat.  If that file cannot be
+ * opened or parsed, or the command name does not start with PROCESS_NAME,
+ * nothing is printed and the instance's pidfile is removed through
+ * kvm__remove_pidfile().  Always returns 0.
+ */
+static int print_guest(const char *name, int pid)
+{
+       char proc_name[PATH_MAX];
+       char *comm = NULL;
+       FILE *fd;
+
+       snprintf(proc_name, sizeof(proc_name), "/proc/%d/stat", pid);
+       fd = fopen(proc_name, "r");
+       if (fd == NULL)
+               goto cleanup;
+       /*
+        * fscanf() returns EOF rather than 0 when the read fails before any
+        * conversion, so anything other than exactly one converted item
+        * means comm was not filled in.  %ms is the POSIX spelling of the
+        * older glibc-only %as.
+        */
+       if (fscanf(fd, "%*u (%ms)", &comm) != 1)
+               goto cleanup;
+       if (strncmp(comm, PROCESS_NAME, strlen(PROCESS_NAME)))
+               goto cleanup;
+
+       printf("%5d %s\n", pid, name);
+
+       free(comm);
+
+       fclose(fd);
+
+       return 0;
+
+cleanup:
+       if (fd)
+               fclose(fd);
+       free(comm);     /* free(NULL) is a no-op */
+
+       kvm__remove_pidfile(name);
+       return 0;
+}
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix)
+{
+       printf("  PID GUEST\n");
+
+       return kvm__enumerate_instances(print_guest);
+}
diff --git a/tools/kvm/builtin-pause.c b/tools/kvm/builtin-pause.c
new file mode 100644 (file)
index 0000000..7b644ff
--- /dev/null
@@ -0,0 +1,66 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-pause.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static pid_t instance_pid;
+static const char *instance_name;
+
+static const char * const pause_usage[] = {
+       "kvm pause [--all] [-n name] [-p pid]",
+       NULL
+};
+
+static const struct option pause_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Pause all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_INTEGER('p', "pid", &instance_pid, "Instance pid"),
+       OPT_END()
+};
+
+static void parse_pause_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, pause_options, pause_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_pause_help();
+       }
+}
+
+void kvm_pause_help(void)
+{
+       usage_with_options(pause_usage, pause_options);
+}
+
+static int do_pause(const char *name, int pid)
+{
+       return kill(pid, SIGUSR2);
+}
+
+/*
+ * "kvm pause" entry point: deliver SIGUSR2 to every instance (--all) or
+ * to the single instance selected by name or pid.
+ */
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix)
+{
+       parse_pause_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_pause);
+
+       /* Neither a name nor a pid was given: show the usage text. */
+       if (!instance_name && !instance_pid)
+               kvm_pause_help();
+
+       /* A name, when given, overrides any pid passed with -p. */
+       if (instance_name != NULL)
+               instance_pid = kvm__get_pid_by_instance(instance_name);
+
+       if (instance_pid < 1)
+               die("Failed locating instance");
+
+       return do_pause(instance_name, instance_pid);
+}
diff --git a/tools/kvm/builtin-resume.c b/tools/kvm/builtin-resume.c
new file mode 100644 (file)
index 0000000..70de0fc
--- /dev/null
@@ -0,0 +1,66 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-resume.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static pid_t instance_pid;
+static const char *instance_name;
+
+static const char * const resume_usage[] = {
+       "kvm resume [--all] [-n name] [-p pid]",
+       NULL
+};
+
+static const struct option resume_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Resume all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_INTEGER('p', "pid", &instance_pid, "Instance pid"),
+       OPT_END()
+};
+
+static void parse_resume_options(int argc, const char **argv)
+{
+       while (argc != 0) {
+               argc = parse_options(argc, argv, resume_options, resume_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0)
+                       kvm_resume_help();
+       }
+}
+
+void kvm_resume_help(void)
+{
+       usage_with_options(resume_usage, resume_options);
+}
+
+static int do_resume(const char *name, int pid)
+{
+       return kill(pid, SIGKVMRESUME);
+}
+
+/*
+ * "kvm resume" entry point: deliver SIGKVMRESUME to every instance
+ * (--all) or to the single instance selected by name or pid.
+ */
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix)
+{
+       parse_resume_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_resume);
+
+       /* Neither a name nor a pid was given: show the usage text. */
+       if (!instance_name && !instance_pid)
+               kvm_resume_help();
+
+       /* A name, when given, overrides any pid passed with -p. */
+       if (instance_name != NULL)
+               instance_pid = kvm__get_pid_by_instance(instance_name);
+
+       if (instance_pid < 1)
+               die("Failed locating instance");
+
+       return do_resume(instance_name, instance_pid);
+}
diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
new file mode 100644 (file)
index 0000000..591fd77
--- /dev/null
@@ -0,0 +1,907 @@
+#include "kvm/builtin-run.h"
+
+#include "kvm/builtin-setup.h"
+#include "kvm/virtio-balloon.h"
+#include "kvm/virtio-console.h"
+#include "kvm/parse-options.h"
+#include "kvm/8250-serial.h"
+#include "kvm/framebuffer.h"
+#include "kvm/disk-image.h"
+#include "kvm/threadpool.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio-rng.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/barrier.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/symbol.h"
+#include "kvm/i8042.h"
+#include "kvm/mutex.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/vesa.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/rtc.h"
+#include "kvm/sdl.h"
+#include "kvm/vnc.h"
+#include "kvm/guest_compat.h"
+#include "kvm/pci-shmem.h"
+
+#include <linux/types.h>
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#define DEFAULT_KVM_DEV                "/dev/kvm"
+#define DEFAULT_CONSOLE                "serial"
+#define DEFAULT_NETWORK                "user"
+#define DEFAULT_HOST_ADDR      "192.168.33.1"
+#define DEFAULT_GUEST_ADDR     "192.168.33.15"
+#define DEFAULT_GUEST_MAC      "02:15:15:15:15:15"
+#define DEFAULT_HOST_MAC       "02:01:01:01:01:01"
+#define DEFAULT_SCRIPT         "none"
+
+#define MB_SHIFT               (20)
+#define KB_SHIFT               (10)
+#define GB_SHIFT               (30)
+#define MIN_RAM_SIZE_MB                (64ULL)
+#define MIN_RAM_SIZE_BYTE      (MIN_RAM_SIZE_MB << MB_SHIFT)
+
+struct kvm *kvm;
+struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];
+__thread struct kvm_cpu *current_kvm_cpu;
+
+static u64 ram_size;
+static u8  image_count;
+static bool virtio_rng;
+static const char *kernel_cmdline;
+static const char *kernel_filename;
+static const char *vmlinux_filename;
+static const char *initrd_filename;
+static const char *image_filename[MAX_DISK_IMAGES];
+static const char *console;
+static const char *dev;
+static const char *network;
+static const char *host_ip;
+static const char *guest_ip;
+static const char *guest_mac;
+static const char *host_mac;
+static const char *script;
+static const char *guest_name;
+static bool single_step;
+static bool readonly_image[MAX_DISK_IMAGES];
+static bool vnc;
+static bool sdl;
+static bool balloon;
+static bool using_rootfs;
+static bool custom_rootfs;
+extern bool ioport_debug;
+extern int  active_console;
+extern int  debug_iodelay;
+
+bool do_debug_print = false;
+
+static int nrcpus;
+static int vidmode = -1;
+
+static const char * const run_usage[] = {
+       "kvm run [<options>] [<kernel image>]",
+       NULL
+};
+
+/*
+ * Option callback for -d/--disk.
+ *
+ * @arg is tried in three ways, in this order:
+ *  1. an existing directory: registered as the 9p share "/dev/root";
+ *  2. a directory named @arg under HOME_DIR KVM_PID_FILE_PATH: registered
+ *     as "/dev/root", with the host "/" additionally shared as "hostfs";
+ *  3. otherwise a disk image file name, optionally followed by ",ro" to
+ *     mark the image read-only.
+ *
+ * In case 3 the comma is overwritten with a NUL, so the string behind
+ * @arg is modified in place.  Always returns 0; failures go through die().
+ */
+static int img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+       char *sep;
+       struct stat st;
+       char path[PATH_MAX];
+
+       if (stat(arg, &st) == 0 &&
+           S_ISDIR(st.st_mode)) {
+               char tmp[PATH_MAX];
+
+               if (realpath(arg, tmp) == 0 ||
+                   virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+                       die("Unable to initialize virtio 9p");
+               using_rootfs = 1;
+               return 0;
+       }
+
+       snprintf(path, PATH_MAX, "%s%s%s", HOME_DIR, KVM_PID_FILE_PATH, arg);
+
+       if (stat(path, &st) == 0 &&
+           S_ISDIR(st.st_mode)) {
+               char tmp[PATH_MAX];
+
+               if (realpath(path, tmp) == 0 ||
+                   virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+                       die("Unable to initialize virtio 9p");
+               if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+                       die("Unable to initialize virtio 9p");
+               using_rootfs = custom_rootfs = 1;
+               return 0;
+       }
+
+       /* NOTE(review): the message hard-codes 4; the limit checked is MAX_DISK_IMAGES. */
+       if (image_count >= MAX_DISK_IMAGES)
+               die("Currently only 4 images are supported");
+
+       image_filename[image_count] = arg;
+       sep = strstr(arg, ",");
+       if (sep) {
+               if (strcmp(sep + 1, "ro") == 0)
+                       readonly_image[image_count] = 1;
+               /* Cut the option suffix off the stored file name. */
+               *sep = 0;
+       }
+
+       image_count++;
+
+       return 0;
+}
+
+/*
+ * Option callback for --9p: register the directory named in @arg as a
+ * virtio 9p share.  The comma separating the optional tag is overwritten
+ * with a NUL, so the string behind @arg is modified in place.  Returns 0;
+ * failures go through die().
+ */
+static int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset)
+{
+       char *tag_name;
+       char tmp[PATH_MAX];
+
+       /*
+        * 9p dir can be of the form dirname,tag_name or
+        * just dirname. In the latter case tag_name stays NULL
+        * and is passed on as such to virtio_9p__register().
+        */
+       tag_name = strstr(arg, ",");
+       if (tag_name) {
+               *tag_name = '\0';
+               tag_name++;
+       }
+       if (realpath(arg, tmp)) {
+               if (virtio_9p__register(kvm, tmp, tag_name) < 0)
+                       die("Unable to initialize virtio 9p");
+       } else
+               die("Failed resolving 9p path");
+       return 0;
+}
+
+/* Return true when @s begins with @prefix, compared case-insensitively. */
+static bool shmem_has_prefix(const char *s, const char *prefix)
+{
+       /* strcasestr() finds the first match, which is @s itself for a prefix. */
+       return strcasestr(s, prefix) == s;
+}
+
+/*
+ * Option callback for --shmem.
+ *
+ * Parses "[pci:]<addr>:<size>[:handle=<handle>][:create]" from @arg, fills
+ * a newly allocated struct shmem_info and hands it, together with the
+ * allocated handle string, to pci_shmem__register_mem().  A missing
+ * address, size or handle falls back to SHMEM_DEFAULT_ADDR,
+ * SHMEM_DEFAULT_SIZE and SHMEM_DEFAULT_HANDLE.  Address and size are
+ * decimal unless the field itself starts with "0x"; the size accepts a K,
+ * M or G suffix in either case, optionally followed by B.  Malformed
+ * input and allocation failures are reported through die().  Returns 0.
+ *
+ * Every keyword is recognised only at the start of its own field, so text
+ * in a later field (for instance a handle containing "mem:" or a size
+ * written in hex) cannot change how an earlier field is parsed.
+ */
+static int shmem_parser(const struct option *opt, const char *arg, int unset)
+{
+       const uint64_t default_size = SHMEM_DEFAULT_SIZE;
+       const uint64_t default_phys_addr = SHMEM_DEFAULT_ADDR;
+       const char *default_handle = SHMEM_DEFAULT_HANDLE;
+       struct shmem_info *si = malloc(sizeof(*si));
+       uint64_t phys_addr;
+       uint64_t size;
+       char *handle = NULL;
+       int create = 0;
+       const char *p = arg;
+       char *next;
+       int base;
+       int skip_B = 0;
+       int verbose = 0;
+
+       if (si == NULL)
+               die("shmem: out of memory\n");
+
+       if (verbose)
+               pr_info("shmem_parser(%p,%s,%d)", opt, arg, unset);
+       /* parse out optional addr family */
+       if (shmem_has_prefix(p, "pci:")) {
+               p += strlen("pci:");
+       } else if (shmem_has_prefix(p, "mem:")) {
+               die("I can't add to E820 map yet.\n");
+       }
+       /* parse out physical addr; the values are unsigned, hence strtoull */
+       base = shmem_has_prefix(p, "0x") ? 16 : 10;
+       phys_addr = strtoull(p, &next, base);
+       if (next == p && phys_addr == 0) {
+               pr_info("shmem: no physical addr specified, using default.");
+               phys_addr = default_phys_addr;
+       }
+       if (*next != ':' && *next != '\0')
+               die("shmem: unexpected chars after phys addr.\n");
+       if (*next == '\0')
+               p = next;
+       else
+               p = next + 1;
+       /* parse out size */
+       base = shmem_has_prefix(p, "0x") ? 16 : 10;
+       size = strtoull(p, &next, base);
+       if (next == p && size == 0) {
+               pr_info("shmem: no size specified, using default.");
+               size = default_size;
+       }
+       /* look for [KMGkmg][Bb]*  uses base 2. */
+       if (strspn(next, "KMGkmg")) {   /* might have a prefix */
+               if (*(next + 1) == 'B' || *(next + 1) == 'b')
+                       skip_B = 1;
+               switch (*next) {
+               case 'K':
+               case 'k':
+                       size = size << KB_SHIFT;
+                       break;
+               case 'M':
+               case 'm':
+                       size = size << MB_SHIFT;
+                       break;
+               case 'G':
+               case 'g':
+                       size = size << GB_SHIFT;
+                       break;
+               default:
+                       die("shmem: bug in detecting size prefix.");
+                       break;
+               }
+               next += 1 + skip_B;
+       }
+       if (*next != ':' && *next != '\0') {
+               die("shmem: unexpected chars after phys size. <%c><%c>\n",
+                   *next, *p);
+       }
+       if (*next == '\0')
+               p = next;
+       else
+               p = next + 1;
+       /* parse out optional shmem handle */
+       next = strcasestr(p, "handle=");
+       if (*p && next) {
+               if (p != next)
+                       die("unexpected chars before handle\n");
+               p += strlen("handle=");
+               next = strchrnul(p, ':');
+               if (next - p) {
+                       handle = malloc(next - p + 1);
+                       if (handle == NULL)
+                               die("shmem: out of memory\n");
+                       memcpy(handle, p, next - p);
+                       handle[next - p] = '\0';
+               }
+               if (*next == '\0')
+                       p = next;
+               else
+                       p = next + 1;
+       }
+       /* parse optional create flag to see if we should create shm seg. */
+       if (shmem_has_prefix(p, "create")) {
+               create = 1;
+               p += strlen("create");
+       }
+       if (*p != '\0')
+               die("shmem: unexpected trailing chars\n");
+       if (handle == NULL) {
+               handle = strdup(default_handle);
+               if (handle == NULL)
+                       die("shmem: out of memory\n");
+       }
+       if (verbose) {
+               /* uint64_t is not unsigned long on every target; print as long long. */
+               pr_info("shmem: phys_addr = %llx", (unsigned long long)phys_addr);
+               pr_info("shmem: size      = %llx", (unsigned long long)size);
+               pr_info("shmem: handle    = %s", handle);
+               pr_info("shmem: create    = %d", create);
+       }
+
+       si->phys_addr = phys_addr;
+       si->size = size;
+       si->handle = handle;
+       si->create = create;
+       pci_shmem__register_mem(si);    /* ownership of si, etc. passed on. */
+       return 0;
+}
+
+/*
+ * Command line options understood by 'kvm run', grouped by topic.
+ *
+ * Most entries store the parsed value through the pointer they are given;
+ * the OPT_CALLBACK entries (--shmem, --disk, --9p) hand the argument string
+ * to a parser function instead.  The table is consumed by parse_options()
+ * in kvm_cmd_run() and by usage_with_options() in kvm_run_help().
+ */
+static const struct option options[] = {
+       OPT_GROUP("Basic options:"),
+       OPT_STRING('\0', "name", &guest_name, "guest name",
+                       "A name for the guest"),
+       OPT_INTEGER('c', "cpus", &nrcpus, "Number of CPUs"),
+       OPT_U64('m', "mem", &ram_size, "Virtual machine memory size in MiB."),
+       OPT_CALLBACK('\0', "shmem", NULL,
+                    "[pci:]<addr>:<size>[:handle=<handle>][:create]",
+                    "Share host shmem with guest via pci device",
+                    shmem_parser),
+       OPT_CALLBACK('d', "disk", NULL, "image or rootfs_dir", "Disk image or rootfs directory", img_name_parser),
+       OPT_BOOLEAN('\0', "balloon", &balloon, "Enable virtio balloon"),
+       OPT_BOOLEAN('\0', "vnc", &vnc, "Enable VNC framebuffer"),
+       OPT_BOOLEAN('\0', "sdl", &sdl, "Enable SDL framebuffer"),
+       OPT_BOOLEAN('\0', "rng", &virtio_rng, "Enable virtio Random Number Generator"),
+       OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name",
+                    "Enable virtio 9p to share files between host and guest", virtio_9p_rootdir_parser),
+       OPT_STRING('\0', "console", &console, "serial or virtio",
+                       "Console to use"),
+       OPT_STRING('\0', "dev", &dev, "device_file", "KVM device file"),
+
+       OPT_GROUP("Kernel options:"),
+       OPT_STRING('k', "kernel", &kernel_filename, "kernel",
+                       "Kernel to boot in virtual machine"),
+       OPT_STRING('i', "initrd", &initrd_filename, "initrd",
+                       "Initial RAM disk image"),
+       OPT_STRING('p', "params", &kernel_cmdline, "params",
+                       "Kernel command line arguments"),
+
+       OPT_GROUP("Networking options:"),
+       OPT_STRING('n', "network", &network, "user, tap, none",
+                       "Network to use"),
+       OPT_STRING('\0', "host-ip", &host_ip, "a.b.c.d",
+                       "Assign this address to the host side networking"),
+       OPT_STRING('\0', "guest-ip", &guest_ip, "a.b.c.d",
+                       "Assign this address to the guest side networking"),
+       OPT_STRING('\0', "host-mac", &host_mac, "aa:bb:cc:dd:ee:ff",
+                       "Assign this address to the host side NIC"),
+       OPT_STRING('\0', "guest-mac", &guest_mac, "aa:bb:cc:dd:ee:ff",
+                       "Assign this address to the guest side NIC"),
+       OPT_STRING('\0', "tapscript", &script, "Script path",
+                        "Assign a script to process created tap device"),
+
+       OPT_GROUP("BIOS options:"),
+       OPT_INTEGER('\0', "vidmode", &vidmode,
+                   "Video mode"),
+
+       OPT_GROUP("Debug options:"),
+       OPT_BOOLEAN('\0', "debug", &do_debug_print,
+                       "Enable debug messages"),
+       OPT_BOOLEAN('\0', "debug-single-step", &single_step,
+                       "Enable single stepping"),
+       OPT_BOOLEAN('\0', "debug-ioport", &ioport_debug,
+                       "Enable ioport debugging"),
+       OPT_INTEGER('\0', "debug-iodelay", &debug_iodelay,
+                       "Delay IO by millisecond"),
+       OPT_END()
+};
+
+/*
+ * Serialize debug printout so that the output of multiple vcpus does not
+ * get mixed up:
+ *
+ * handle_sigquit() clears this flag before signalling a vCPU thread and
+ * spins until handle_sigusr1() sets it again.
+ */
+static int printout_done;
+
+/*
+ * SIGUSR1 handler: dump the state of the vCPU recorded in current_kvm_cpu
+ * (registers, code, page tables) and then flag completion through
+ * printout_done.  Does nothing, and leaves the flag untouched, when
+ * current_kvm_cpu is NULL.
+ */
+static void handle_sigusr1(int sig)
+{
+       struct kvm_cpu *cpu = current_kvm_cpu;
+
+       if (!cpu)
+               return;
+
+       printf("\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id);
+       kvm_cpu__show_registers(cpu);
+       kvm_cpu__show_code(cpu);
+       kvm_cpu__show_page_tables(cpu);
+       /* Push the dump out before the waiter is released. */
+       fflush(stdout);
+       printout_done = 1;
+       mb();
+}
+
+/* Tracks whether the guest is currently paused (toggled by handle_sigusr2) */
+static int is_paused;
+
+/*
+ * Shared handler for SIGUSR2 and SIGKVMRESUME.
+ *
+ * SIGKVMRESUME continues a paused guest, SIGUSR2 pauses a running one; any
+ * other combination of signal and state is ignored.
+ */
+static void handle_sigusr2(int sig)
+{
+       int want_resume = (sig == SIGKVMRESUME && is_paused);
+       int want_pause  = (sig == SIGUSR2 && !is_paused);
+
+       if (!want_resume && !want_pause)
+               return;
+
+       /* A resume request wins if both conditions could ever match. */
+       if (want_resume)
+               kvm__continue();
+       else
+               kvm__pause();
+
+       is_paused = !is_paused;
+       pr_info("Guest %s\n", is_paused ? "paused" : "resumed");
+}
+
+/*
+ * SIGQUIT handler: ask every existing vCPU thread, one at a time, to dump
+ * its state (via SIGUSR1 / handle_sigusr1()), then inject a sysrq through
+ * the 8250 serial device.
+ */
+static void handle_sigquit(int sig)
+{
+       int i;
+
+       for (i = 0; i < nrcpus; i++) {
+               struct kvm_cpu *cpu = kvm_cpus[i];
+
+               if (!cpu)
+                       continue;
+
+               /* Cleared here, set again by handle_sigusr1() in the target thread. */
+               printout_done = 0;
+               pthread_kill(cpu->thread, SIGUSR1);
+               /*
+                * Wait for the vCPU to dump state before signalling
+                * the next thread. Since this is debug code it does
+                * not matter that we are burning CPU time a bit:
+                */
+               while (!printout_done)
+                       mb();
+       }
+
+       serial8250__inject_sysrq(kvm);
+}
+
+/*
+ * SIGALRM handler: inject an interrupt for both console back ends (8250
+ * serial and virtio console).
+ * NOTE(review): presumably driven by the timer armed in kvm__start_timer() --
+ * confirm.
+ */
+static void handle_sigalrm(int sig)
+{
+       serial8250__inject_interrupt(kvm);
+       virtio_console__inject_interrupt(kvm);
+}
+
+/* SIGKVMSTOP handler (installed in kvm_cmd_run()): calls kvm_cpu__reboot(). */
+static void handle_sigstop(int sig)
+{
+       kvm_cpu__reboot();
+}
+
+/*
+ * Thread body for one vCPU; 'arg' is the struct kvm_cpu to run.
+ *
+ * Records the vCPU in current_kvm_cpu (read by handle_sigusr1()), runs it
+ * with kvm_cpu__start() and deletes it afterwards.  A non-zero result from
+ * kvm_cpu__start() is treated as a panic: the KVM exit reason and the vCPU
+ * state are dumped before the vCPU is deleted.
+ *
+ * Returns NULL on a clean exit and (void *)1 after a panic; kvm_cmd_run()
+ * picks the value up through pthread_join().
+ */
+static void *kvm_cpu_thread(void *arg)
+{
+       current_kvm_cpu         = arg;
+
+       if (kvm_cpu__start(current_kvm_cpu))
+               goto panic_kvm;
+
+       kvm_cpu__delete(current_kvm_cpu);
+
+       return (void *) (intptr_t) 0;
+
+panic_kvm:
+       fprintf(stderr, "KVM exit reason: %u (\"%s\")\n",
+               current_kvm_cpu->kvm_run->exit_reason,
+               kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]);
+       /*
+        * Print the hardware exit reason in hex so that it matches the "0x"
+        * prefix, and cast explicitly so the argument matches the %llx
+        * conversion whatever the underlying 64-bit typedef is.
+        */
+       if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN)
+               fprintf(stderr, "KVM exit code: 0x%llx\n",
+                       (unsigned long long)current_kvm_cpu->kvm_run->hw.hardware_exit_reason);
+
+       kvm_cpu__show_registers(current_kvm_cpu);
+       kvm_cpu__show_code(current_kvm_cpu);
+       kvm_cpu__show_page_tables(current_kvm_cpu);
+
+       kvm_cpu__delete(current_kvm_cpu);
+
+       return (void *) (intptr_t) 1;
+}
+
+/* Scratch buffer for kernel paths built by find_kernel() and kernel_usage_with_options(). */
+static char kernel[PATH_MAX];
+
+/* Host kernel path prefixes; "-<uname release>" is appended before use. */
+static const char *host_kernels[] = {
+       "/boot/vmlinuz",
+       "/boot/bzImage",
+       NULL
+};
+
+/* Relative kernel image paths tried first, in this order. */
+static const char *default_kernels[] = {
+       "./bzImage",
+       "../../arch/x86/boot/bzImage",
+       NULL
+};
+
+/* Relative vmlinux candidates searched by find_vmlinux(), in this order. */
+static const char *default_vmlinux[] = {
+       "../../../vmlinux",
+       "../../vmlinux",
+       NULL
+};
+
+/*
+ * Tell the user, on stderr, every location that was searched for a default
+ * kernel image: the relative defaults first, then the host kernels for the
+ * running release.  Returns early (without the closing hint) if uname() or
+ * snprintf() fails.
+ */
+static void kernel_usage_with_options(void)
+{
+       struct utsname uts;
+       const char **path;
+
+       fprintf(stderr, "Fatal: could not find default kernel image in:\n");
+       for (path = default_kernels; *path; path++)
+               fprintf(stderr, "\t%s\n", *path);
+
+       if (uname(&uts) < 0)
+               return;
+
+       for (path = host_kernels; *path; path++) {
+               if (snprintf(kernel, PATH_MAX, "%s-%s", *path, uts.release) < 0)
+                       return;
+               fprintf(stderr, "\t%s\n", kernel);
+       }
+       fprintf(stderr, "\nPlease see 'kvm run --help' for more options.\n\n");
+}
+
+/*
+ * Size of the host's physical RAM, shifted down by MB_SHIFT (callers treat
+ * the result as megabytes).
+ *
+ * Returns 0, after logging a warning, when either sysconf() query fails.
+ */
+static u64 host_ram_size(void)
+{
+       long page_size;
+       long nr_pages;
+
+       nr_pages        = sysconf(_SC_PHYS_PAGES);
+       if (nr_pages < 0) {
+               pr_warning("sysconf(_SC_PHYS_PAGES) failed");
+               return 0;
+       }
+
+       page_size       = sysconf(_SC_PAGE_SIZE);
+       if (page_size < 0) {
+               pr_warning("sysconf(_SC_PAGE_SIZE) failed");
+               return 0;
+       }
+
+       /*
+        * Widen before multiplying: the product of two longs overflows on
+        * hosts where long is 32 bits wide.
+        */
+       return ((u64)nr_pages * (u64)page_size) >> MB_SHIFT;
+}
+
+/*
+ * Fraction of host RAM that a guest may get when the user did not ask for
+ * a specific amount, so that the default never fills the whole host.
+ */
+#define RAM_SIZE_RATIO         0.8
+
+/*
+ * Default guest RAM size for nr_cpus vCPUs: 64 units per vCPU plus a fixed
+ * 192, capped at RAM_SIZE_RATIO of host RAM.  If the host size cannot be
+ * determined the cap falls back to MIN_RAM_SIZE_MB.
+ */
+static u64 get_ram_size(int nr_cpus)
+{
+       u64 wanted = 64 * (nr_cpus + 3);
+       u64 cap = host_ram_size() * RAM_SIZE_RATIO;
+
+       if (cap == 0)
+               cap = MIN_RAM_SIZE_MB;
+
+       if (wanted > cap)
+               return cap;
+
+       return wanted;
+}
+
+/*
+ * Locate a kernel image to boot when none was given on the command line.
+ *
+ * The relative default_kernels paths are tried first, then each host_kernels
+ * prefix with "-<uname release>" appended.  Only regular files count.
+ *
+ * Returns a pointer to the static 'kernel' buffer holding the path, or NULL
+ * if nothing was found or uname()/snprintf() failed.
+ */
+static const char *find_kernel(void)
+{
+       struct utsname uts;
+       const char **cand;
+       struct stat st;
+
+       for (cand = default_kernels; *cand; cand++) {
+               if (stat(*cand, &st) >= 0 && S_ISREG(st.st_mode)) {
+                       strncpy(kernel, *cand, PATH_MAX);
+                       return kernel;
+               }
+       }
+
+       if (uname(&uts) < 0)
+               return NULL;
+
+       for (cand = host_kernels; *cand; cand++) {
+               if (snprintf(kernel, PATH_MAX, "%s-%s", *cand, uts.release) < 0)
+                       return NULL;
+
+               if (stat(kernel, &st) >= 0 && S_ISREG(st.st_mode))
+                       return kernel;
+       }
+
+       return NULL;
+}
+
+/*
+ * Return the first default_vmlinux entry that names an existing regular
+ * file, or NULL when none does.
+ */
+static const char *find_vmlinux(void)
+{
+       const char **cand;
+       struct stat st;
+
+       for (cand = default_vmlinux; *cand; cand++) {
+               if (stat(*cand, &st) >= 0 && S_ISREG(st.st_mode))
+                       return *cand;
+       }
+
+       return NULL;
+}
+
+/* Print the 'kvm run' usage text together with the option table above. */
+void kvm_run_help(void)
+{
+       usage_with_options(run_usage, options);
+}
+
+/*
+ * Entry point of 'kvm run': parse the command line, fill in defaults,
+ * create the VM and its devices, start one thread per vCPU and wait for
+ * them to finish.
+ *
+ * The first non-option argument is taken as the kernel image; a second one
+ * is an error.
+ *
+ * Returns 0 when the session ended normally, 1 when a vCPU thread reported
+ * a failure or vCPU #0 could not be joined, and EINVAL on bad arguments or
+ * when no kernel image could be found.
+ */
+int kvm_cmd_run(int argc, const char **argv, const char *prefix)
+{
+       struct virtio_net_parameters net_params;
+       static char real_cmdline[2048], default_name[20];
+       struct framebuffer *fb = NULL;
+       unsigned int nr_online_cpus;
+       int exit_code = 0;
+       int max_cpus, recommended_cpus;
+       int i;
+       void *ret = NULL;
+
+       signal(SIGALRM, handle_sigalrm);
+       signal(SIGQUIT, handle_sigquit);
+       signal(SIGUSR1, handle_sigusr1);
+       signal(SIGUSR2, handle_sigusr2);
+       signal(SIGKVMSTOP, handle_sigstop);
+       signal(SIGKVMRESUME, handle_sigusr2);
+       /* ignore the balloon signals by default, in case the balloon option is not enabled */
+       signal(SIGKVMADDMEM, SIG_IGN);
+       signal(SIGKVMDELMEM, SIG_IGN);
+
+       nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+       while (argc != 0) {
+               argc = parse_options(argc, argv, options, run_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (argc != 0) {
+                       if (kernel_filename) {
+                               fprintf(stderr, "Cannot handle parameter: "
+                                               "%s\n", argv[0]);
+                               usage_with_options(run_usage, options);
+                               return EINVAL;
+                       }
+                       /* first unhandled parameter is treated as a kernel
+                          image
+                        */
+                       kernel_filename = argv[0];
+                       argv++;
+                       argc--;
+               }
+
+       }
+
+       if (!kernel_filename)
+               kernel_filename = find_kernel();
+
+       if (!kernel_filename) {
+               kernel_usage_with_options();
+               return EINVAL;
+       }
+
+       vmlinux_filename = find_vmlinux();
+
+       if (nrcpus == 0)
+               nrcpus = nr_online_cpus;
+       else if (nrcpus < 1 || nrcpus > KVM_NR_CPUS)
+               die("Number of CPUs %d is out of [1;%d] range", nrcpus, KVM_NR_CPUS);
+
+       if (!ram_size)
+               ram_size        = get_ram_size(nrcpus);
+
+       if (ram_size < MIN_RAM_SIZE_MB)
+               die("Not enough memory specified: %lluMB (min %lluMB)", ram_size, MIN_RAM_SIZE_MB);
+
+       if (ram_size > host_ram_size())
+               pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB", ram_size, host_ram_size());
+
+       /* From here on ram_size is in bytes rather than megabytes. */
+       ram_size <<= MB_SHIFT;
+
+       if (!dev)
+               dev = DEFAULT_KVM_DEV;
+
+       if (!console)
+               console = DEFAULT_CONSOLE;
+
+       if (!strncmp(console, "virtio", 6))
+               active_console  = CONSOLE_VIRTIO;
+       else
+               active_console  = CONSOLE_8250;
+
+       if (!host_ip)
+               host_ip = DEFAULT_HOST_ADDR;
+
+       if (!guest_ip)
+               guest_ip = DEFAULT_GUEST_ADDR;
+
+       if (!guest_mac)
+               guest_mac = DEFAULT_GUEST_MAC;
+
+       if (!host_mac)
+               host_mac = DEFAULT_HOST_MAC;
+
+       if (!script)
+               script = DEFAULT_SCRIPT;
+
+       symbol__init(vmlinux_filename);
+
+       term_init();
+
+       if (!guest_name) {
+               sprintf(default_name, "guest-%u", getpid());
+               guest_name = default_name;
+       }
+
+       kvm = kvm__init(dev, ram_size, guest_name);
+
+       irq__init(kvm);
+
+       kvm->single_step = single_step;
+
+       ioeventfd__init();
+
+       max_cpus = kvm__max_cpus(kvm);
+       recommended_cpus = kvm__recommended_cpus(kvm);
+
+       if (nrcpus > max_cpus) {
+               printf("  # Limit the number of CPUs to %d\n", max_cpus);
+               /*
+                * Clamp nrcpus itself: it is copied into kvm->nrcpus just
+                * below and bounds every vCPU loop in this function.
+                */
+               nrcpus          = max_cpus;
+       } else if (nrcpus > recommended_cpus) {
+               printf("  # Warning: The maximum recommended amount of VCPUs"
+                       " is %d\n", recommended_cpus);
+       }
+
+       kvm->nrcpus = nrcpus;
+
+       /*
+        * With a framebuffer, vidmode is either the value given by the
+        * user or a default; without one it is forced to 0.
+        */
+       if (vnc || sdl) {
+               if (vidmode == -1)
+                       vidmode = 0x312;
+       } else
+               vidmode = 0;
+
+       /*
+        * Build the guest kernel command line.  Every append is bounded by
+        * the buffer size because the user supplied part may be arbitrarily
+        * long.
+        */
+       memset(real_cmdline, 0, sizeof(real_cmdline));
+       strcpy(real_cmdline, "notsc noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1");
+       if (vnc || sdl) {
+               strlcat(real_cmdline, " video=vesafb console=tty0", sizeof(real_cmdline));
+       } else
+               strlcat(real_cmdline, " console=ttyS0 earlyprintk=serial", sizeof(real_cmdline));
+       strlcat(real_cmdline, " ", sizeof(real_cmdline));
+       if (kernel_cmdline)
+               strlcat(real_cmdline, kernel_cmdline, sizeof(real_cmdline));
+
+       if (!using_rootfs && !image_filename[0]) {
+               char tmp[PATH_MAX];
+
+               /*
+                * NOTE(review): the result is ignored; do_setup() fails when
+                * the "default" directory already exists, so this looks
+                * deliberate -- confirm.
+                */
+               kvm_setup_create_new("default");
+
+               snprintf(tmp, PATH_MAX, "%s%s%s", HOME_DIR, KVM_PID_FILE_PATH, "default");
+               if (virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+                       die("Unable to initialize virtio 9p");
+               if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+                       die("Unable to initialize virtio 9p");
+               using_rootfs = custom_rootfs = 1;
+
+               if (!strstr(real_cmdline, "init="))
+                       strlcat(real_cmdline, " init=/bin/sh ", sizeof(real_cmdline));
+       }
+
+       if (using_rootfs) {
+               strlcat(real_cmdline, " root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p", sizeof(real_cmdline));
+               if (custom_rootfs)
+                       strlcat(real_cmdline, " init=/virt/init", sizeof(real_cmdline));
+       } else if (!strstr(real_cmdline, "root=")) {
+               strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline));
+       }
+
+       if (image_count) {
+               kvm->nr_disks = image_count;
+               kvm->disks    = disk_image__open_all(image_filename, readonly_image, image_count);
+               if (!kvm->disks)
+                       die("Unable to load all disk images.");
+
+               virtio_blk__init_all(kvm);
+       }
+
+       printf("  # kvm run -k %s -m %llu -c %d --name %s\n", kernel_filename,
+               (unsigned long long)(ram_size / 1024 / 1024), nrcpus, guest_name);
+
+       if (!kvm__load_kernel(kvm, kernel_filename, initrd_filename,
+                               real_cmdline, vidmode))
+               die("unable to load kernel %s", kernel_filename);
+
+       kvm->vmlinux            = vmlinux_filename;
+
+       ioport__setup_legacy();
+
+       rtc__init();
+
+       serial8250__init(kvm);
+
+       pci__init();
+
+       if (active_console == CONSOLE_VIRTIO)
+               virtio_console__init(kvm);
+
+       if (virtio_rng)
+               virtio_rng__init(kvm);
+
+       if (balloon)
+               virtio_bln__init(kvm);
+
+       if (!network)
+               network = DEFAULT_NETWORK;
+
+       virtio_9p__init(kvm);
+
+       if (strncmp(network, "none", 4)) {
+               net_params.guest_ip = guest_ip;
+               net_params.host_ip = host_ip;
+               net_params.kvm = kvm;
+               net_params.script = script;
+               /* All six octets must parse, else the MAC is left partly unset. */
+               if (sscanf(guest_mac, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+                       net_params.guest_mac,
+                       net_params.guest_mac+1,
+                       net_params.guest_mac+2,
+                       net_params.guest_mac+3,
+                       net_params.guest_mac+4,
+                       net_params.guest_mac+5) != 6)
+                       die("Invalid guest MAC address %s", guest_mac);
+               if (sscanf(host_mac, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+                       net_params.host_mac,
+                       net_params.host_mac+1,
+                       net_params.host_mac+2,
+                       net_params.host_mac+3,
+                       net_params.host_mac+4,
+                       net_params.host_mac+5) != 6)
+                       die("Invalid host MAC address %s", host_mac);
+
+               if (!strncmp(network, "user", 4))
+                       net_params.mode = NET_MODE_USER;
+               else if (!strncmp(network, "tap", 3))
+                       net_params.mode = NET_MODE_TAP;
+               else
+                       die("Unknown network mode %s, please use --network user, tap, none", network);
+               virtio_net__init(&net_params);
+       }
+
+       kvm__start_timer(kvm);
+
+       kvm__setup_bios(kvm);
+
+       for (i = 0; i < nrcpus; i++) {
+               kvm_cpus[i] = kvm_cpu__init(kvm, i);
+               if (!kvm_cpus[i])
+                       die("unable to initialize KVM VCPU");
+       }
+
+       kvm__init_ram(kvm);
+
+       kbd__init(kvm);
+
+       pci_shmem__init(kvm);
+
+       if (vnc || sdl)
+               fb = vesa__init(kvm);
+
+       if (vnc) {
+               if (fb)
+                       vnc__init(fb);
+       }
+
+       if (sdl) {
+               if (fb)
+                       sdl__init(fb);
+       }
+
+       fb__start();
+
+       thread_pool__init(nr_online_cpus);
+       ioeventfd__start();
+
+       for (i = 0; i < nrcpus; i++) {
+               if (pthread_create(&kvm_cpus[i]->thread, NULL, kvm_cpu_thread, kvm_cpus[i]) != 0)
+                       die("unable to create KVM VCPU thread");
+       }
+
+       /*
+        * Only VCPU #0 is going to exit by itself when shutting down.
+        * Its thread result is checked right away so that a failure is
+        * reported even when it is the only vCPU.
+        */
+       if (pthread_join(kvm_cpus[0]->thread, &ret) != 0 || ret != NULL)
+               exit_code = 1;
+
+       for (i = 1; i < nrcpus; i++) {
+               if (!kvm_cpus[i]->is_running)
+                       continue;
+
+               pthread_kill(kvm_cpus[i]->thread, SIGKVMEXIT);
+               if (pthread_join(kvm_cpus[i]->thread, &ret) != 0)
+                       die("pthread_join");
+               /* Only a result that was just collected is meaningful. */
+               if (ret != NULL)
+                       exit_code = 1;
+       }
+
+       compat__print_all_messages();
+
+       fb__stop();
+
+       virtio_blk__delete_all(kvm);
+       virtio_rng__delete_all(kvm);
+
+       disk_image__close_all(kvm->disks, image_count);
+       kvm__delete(kvm);
+
+       if (!exit_code)
+               printf("\n  # KVM session ended normally.\n");
+
+       return exit_code;
+}
diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
new file mode 100644 (file)
index 0000000..c93eec3
--- /dev/null
@@ -0,0 +1,218 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-setup.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KVM_PID_FILE_PATH      "/.kvm-tools/"
+#define HOME_DIR               getenv("HOME")
+
+/* Value of -n/--name; stays NULL when the option is not given. */
+static const char *instance_name;
+
+/* Usage synopsis printed by kvm_setup_help(). */
+static const char * const setup_usage[] = {
+       "kvm setup [-n name]",
+       NULL
+};
+
+/* Options accepted by 'kvm setup'. */
+static const struct option setup_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_END()
+};
+
+/*
+ * Run the option parser over the 'kvm setup' arguments.  Anything left
+ * over after parsing (a non-option argument) triggers the usage text.
+ */
+static void parse_setup_options(int argc, const char **argv)
+{
+       while (argc) {
+               argc = parse_options(argc, argv, setup_options, setup_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (!argc)
+                       break;
+
+               kvm_setup_help();
+       }
+}
+
+/* Print the 'kvm setup' usage text and option list. */
+void kvm_setup_help(void)
+{
+       usage_with_options(setup_usage, setup_options);
+}
+
+/*
+ * Copy the file 'from' to 'to'.  The destination is created or truncated
+ * and opened with the permission bits of the source.
+ *
+ * The data is copied through two shared mappings.  An empty source is
+ * handled separately because mmap() rejects a zero length.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int copy_file(const char *from, const char *to)
+{
+       int in_fd, out_fd;
+       void *src, *dst;
+       struct stat st;
+       int err = -1;
+
+       in_fd = open(from, O_RDONLY);
+       if (in_fd < 0)
+               return err;
+
+       if (fstat(in_fd, &st) < 0)
+               goto error_close_in;
+
+       out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO));
+       if (out_fd < 0)
+               goto error_close_in;
+
+       /* O_TRUNC already left the destination empty: nothing to map. */
+       if (st.st_size == 0) {
+               err = 0;
+               goto error_close_out;
+       }
+
+       src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0);
+       if (src == MAP_FAILED)
+               goto error_close_out;
+
+       /* Grow the destination first so that the mapping below is backed. */
+       if (ftruncate(out_fd, st.st_size) < 0)
+               goto error_munmap_src;
+
+       dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0);
+       if (dst == MAP_FAILED)
+               goto error_munmap_src;
+
+       memcpy(dst, src, st.st_size);
+
+       if (fsync(out_fd) < 0)
+               goto error_munmap_dst;
+
+       err = 0;
+
+       /* The success path falls through the same cleanup labels. */
+error_munmap_dst:
+       munmap(dst, st.st_size);
+error_munmap_src:
+       munmap(src, st.st_size);
+error_close_out:
+       close(out_fd);
+error_close_in:
+       close(in_fd);
+
+       return err;
+}
+
+/* Directories created inside a new guest filesystem by do_setup(). */
+static const char *guestfs_dirs[] = {
+       "/dev",
+       "/etc",
+       "/home",
+       "/host",
+       "/proc",
+       "/root",
+       "/sys",
+       "/tmp",
+       "/var",
+       "/var/lib",
+       "/virt",
+};
+
+/*
+ * Paths created as symlinks inside a new guest filesystem; each one points
+ * at the same path below /host (see make_guestfs_symlink()).
+ */
+static const char *guestfs_symlinks[] = {
+       "/bin",
+       "/lib",
+       "/lib64",
+       "/sbin",
+       "/usr",
+};
+
+/*
+ * Copy "guest/init" (relative to the current directory) to virt/init
+ * inside the named guest filesystem.  Returns copy_file()'s result.
+ */
+static int copy_init(const char *guestfs_name)
+{
+       char dest[PATH_MAX];
+
+       snprintf(dest, sizeof(dest), "%s%s%s/virt/init",
+                HOME_DIR, KVM_PID_FILE_PATH, guestfs_name);
+
+       return copy_file("guest/init", dest);
+}
+
+/*
+ * Copy "guest/setnet.sh" (relative to the current directory) to
+ * virt/setnet.sh inside the named guest filesystem.  Returns copy_file()'s
+ * result.
+ */
+static int copy_net(const char *guestfs_name)
+{
+       char dest[PATH_MAX];
+
+       snprintf(dest, sizeof(dest), "%s%s%s/virt/setnet.sh",
+                HOME_DIR, KVM_PID_FILE_PATH, guestfs_name);
+
+       return copy_file("guest/setnet.sh", dest);
+}
+
+/*
+ * Create <guest fs>/<path> as a symlink to "/host<path>".  Returns
+ * symlink()'s result.
+ */
+static int make_guestfs_symlink(const char *guestfs_name, const char *path)
+{
+       char link_target[PATH_MAX];
+       char link_name[PATH_MAX];
+
+       snprintf(link_name, sizeof(link_name), "%s%s%s%s",
+                HOME_DIR, KVM_PID_FILE_PATH, guestfs_name, path);
+       snprintf(link_target, sizeof(link_target), "/host%s", path);
+
+       return symlink(link_target, link_name);
+}
+
+/*
+ * Create the top-level directory ($HOME followed by KVM_PID_FILE_PATH).
+ * The result of mkdir() is not checked.
+ */
+static void make_root_dir(void)
+{
+       char root[PATH_MAX];
+
+       snprintf(root, sizeof(root), "%s%s", HOME_DIR, KVM_PID_FILE_PATH);
+       mkdir(root, 0777);
+}
+
+/*
+ * Create directory 'dir' below the top-level directory.  Returns mkdir()'s
+ * result.
+ */
+static int make_dir(const char *dir)
+{
+       char full[PATH_MAX];
+
+       snprintf(full, sizeof(full), "%s%s%s", HOME_DIR, KVM_PID_FILE_PATH, dir);
+
+       return mkdir(full, 0777);
+}
+
+/*
+ * Create directory 'dir' inside the named guest filesystem.  The result of
+ * make_dir() is not checked.
+ */
+static void make_guestfs_dir(const char *guestfs_name, const char *dir)
+{
+       char rel[PATH_MAX];
+
+       snprintf(rel, sizeof(rel), "%s%s", guestfs_name, dir);
+       make_dir(rel);
+}
+
+/*
+ * Build a new guest filesystem called guestfs_name: its directory, the
+ * guestfs_dirs entries, the guestfs_symlinks entries and finally the
+ * setnet.sh and init files.
+ *
+ * Fails with make_dir()'s negative result if the guest directory cannot be
+ * created, or with the result of copy_net()/copy_init().  Failures while
+ * creating the sub-directories and symlinks are not reported.
+ */
+static int do_setup(const char *guestfs_name)
+{
+       unsigned int idx;
+       int err;
+
+       make_root_dir();
+
+       err = make_dir(guestfs_name);
+       if (err < 0)
+               return err;
+
+       for (idx = 0; idx < ARRAY_SIZE(guestfs_dirs); idx++)
+               make_guestfs_dir(guestfs_name, guestfs_dirs[idx]);
+
+       for (idx = 0; idx < ARRAY_SIZE(guestfs_symlinks); idx++)
+               make_guestfs_symlink(guestfs_name, guestfs_symlinks[idx]);
+
+       err = copy_net(guestfs_name);
+
+       return err < 0 ? err : copy_init(guestfs_name);
+}
+
+/* Public wrapper around do_setup(); called from kvm_cmd_run() with "default". */
+int kvm_setup_create_new(const char *guestfs_name)
+{
+       return do_setup(guestfs_name);
+}
+
+/*
+ * Entry point of 'kvm setup': create the guest filesystem named by
+ * -n/--name.  Returns do_setup()'s result.
+ */
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix)
+{
+       parse_setup_options(argc, argv);
+
+       /*
+        * NOTE(review): do_setup() below would receive NULL if the help
+        * function returned; presumably usage_with_options() does not
+        * return -- confirm.
+        */
+       if (instance_name == NULL)
+               kvm_setup_help();
+
+       return do_setup(instance_name);
+}
diff --git a/tools/kvm/builtin-stat.c b/tools/kvm/builtin-stat.c
new file mode 100644 (file)
index 0000000..be12d15
--- /dev/null
@@ -0,0 +1,79 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stat.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+/* Set by -m/--memory: memory statistics were requested. */
+static bool mem;
+/* Set by -a/--all: act on every instance. */
+static bool all;
+/* Target pid, from -p/--pid or looked up from instance_name. */
+static pid_t instance_pid;
+/* Target instance name from -n/--name, NULL when not given. */
+static const char *instance_name;
+
+/* Usage synopsis printed by kvm_stat_help(). */
+static const char * const stat_usage[] = {
+       "kvm stat [command] [--all] [-n name] [-p pid]",
+       NULL
+};
+
+/* Options accepted by 'kvm stat'. */
+static const struct option stat_options[] = {
+       OPT_GROUP("Commands options:"),
+       OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"),
+       OPT_GROUP("Instance options:"),
+       OPT_BOOLEAN('a', "all", &all, "All instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_INTEGER('p', "pid", &instance_pid, "Instance pid"),
+       OPT_END()
+};
+
+/*
+ * Run the option parser over the 'kvm stat' arguments.  Anything left over
+ * after parsing (a non-option argument) triggers the usage text.
+ */
+static void parse_stat_options(int argc, const char **argv)
+{
+       while (argc) {
+               argc = parse_options(argc, argv, stat_options, stat_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (!argc)
+                       break;
+
+               kvm_stat_help();
+       }
+}
+
+/* Print the 'kvm stat' usage text and option list. */
+void kvm_stat_help(void)
+{
+       usage_with_options(stat_usage, stat_options);
+}
+
+/*
+ * Per-instance callback passed to kvm__enumerate_instances(): announce the
+ * request and send SIGKVMMEMSTAT to 'pid'.  Returns kill()'s result.
+ */
+static int do_memstat(const char *name, int pid)
+{
+       printf("Sending memstat command to %s, output should be on the targets' terminal.\n", name);
+       return kill(pid, SIGKVMMEMSTAT);
+}
+
+/*
+ * Entry point of 'kvm stat'.  Memory statistics (-m) are the only command:
+ * the selected instance, or every instance with --all, is sent
+ * SIGKVMMEMSTAT.
+ *
+ * Returns the result of kill() or kvm__enumerate_instances(); dies when the
+ * requested instance cannot be located.
+ */
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix)
+{
+       parse_stat_options(argc, argv);
+
+       /*
+        * NOTE(review): the checks below read as if usage_with_options()
+        * does not return -- confirm.
+        */
+       if (!mem)
+               usage_with_options(stat_usage, stat_options);
+
+       if (mem && all)
+               return kvm__enumerate_instances(do_memstat);
+
+       if (instance_name == NULL &&
+           instance_pid == 0)
+               kvm_stat_help();
+
+       /* A name, when given, takes precedence over an explicit pid. */
+       if (instance_name)
+               instance_pid = kvm__get_pid_by_instance(instance_name);
+
+       if (instance_pid <= 0)
+               die("Failed locating instance");
+
+       if (mem) {
+               printf("Sending memstat command to designated instance, output should be on the targets' terminal.\n");
+
+               return kill(instance_pid, SIGKVMMEMSTAT);
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/builtin-stop.c b/tools/kvm/builtin-stop.c
new file mode 100644 (file)
index 0000000..fd0500e
--- /dev/null
@@ -0,0 +1,66 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stop.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+/* Set by -a/--all: stop every instance. */
+static bool all;
+/* Target pid, from -p/--pid or looked up from instance_name. */
+static pid_t instance_pid;
+/* Target instance name from -n/--name, NULL when not given. */
+static const char *instance_name;
+
+/* Usage synopsis printed by kvm_stop_help(). */
+static const char * const stop_usage[] = {
+       "kvm stop [--all] [-n name] [-p pid]",
+       NULL
+};
+
+/* Options accepted by 'kvm stop'. */
+static const struct option stop_options[] = {
+       OPT_GROUP("General options:"),
+       OPT_BOOLEAN('a', "all", &all, "Stop all instances"),
+       OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+       OPT_INTEGER('p', "pid", &instance_pid, "Instance pid"),
+       OPT_END()
+};
+
+/*
+ * Run the option parser over the 'kvm stop' arguments.  Anything left over
+ * after parsing (a non-option argument) triggers the usage text.
+ */
+static void parse_stop_options(int argc, const char **argv)
+{
+       while (argc) {
+               argc = parse_options(argc, argv, stop_options, stop_usage,
+                               PARSE_OPT_STOP_AT_NON_OPTION);
+               if (!argc)
+                       break;
+
+               kvm_stop_help();
+       }
+}
+
+/* Print the 'kvm stop' usage text and option list. */
+void kvm_stop_help(void)
+{
+       usage_with_options(stop_usage, stop_options);
+}
+
+/*
+ * Per-instance callback passed to kvm__enumerate_instances(): send
+ * SIGKVMSTOP to 'pid'.  'name' is unused.  Returns kill()'s result.
+ */
+static int do_stop(const char *name, int pid)
+{
+       return kill(pid, SIGKVMSTOP);
+}
+
+/*
+ * Entry point of 'kvm stop': send SIGKVMSTOP to the selected instance, or
+ * to every instance with --all.
+ *
+ * Returns the result of kill() or kvm__enumerate_instances(); dies when the
+ * requested instance cannot be located.
+ */
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix)
+{
+       parse_stop_options(argc, argv);
+
+       if (all)
+               return kvm__enumerate_instances(do_stop);
+
+       if (instance_name == NULL &&
+           instance_pid == 0)
+               kvm_stop_help();
+
+       /* A name, when given, takes precedence over an explicit pid. */
+       if (instance_name)
+               instance_pid = kvm__get_pid_by_instance(instance_name);
+
+       if (instance_pid <= 0)
+               die("Failed locating instance");
+
+       return kill(instance_pid, SIGKVMSTOP);
+}
diff --git a/tools/kvm/builtin-version.c b/tools/kvm/builtin-version.c
new file mode 100644 (file)
index 0000000..b8bb859
--- /dev/null
@@ -0,0 +1,15 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-version.h>
+#include <kvm/kvm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+/* Entry point of 'kvm version': print KVMTOOLS_VERSION; always returns 0. */
+int kvm_cmd_version(int argc, const char **argv, const char *prefix)
+{
+       printf("kvm tool %s\n", KVMTOOLS_VERSION);
+
+       return 0;
+}
diff --git a/tools/kvm/code16gcc.h b/tools/kvm/code16gcc.h
new file mode 100644 (file)
index 0000000..d93e480
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * code16gcc.h
+ *
+ * This file is -include'd when compiling 16-bit C code.
+ * Note: this asm() needs to be emitted before gcc emits any code.
+ * Depending on gcc version, this requires -fno-unit-at-a-time or
+ * -fno-toplevel-reorder.
+ *
+ * Hopefully gcc will eventually have a real -m16 option so we can
+ * drop this hack long term.
+ */
+
+#ifndef __ASSEMBLY__
+asm(".code16gcc");
+#endif
diff --git a/tools/kvm/command-list.txt b/tools/kvm/command-list.txt
new file mode 100644 (file)
index 0000000..0d16c62
--- /dev/null
@@ -0,0 +1,14 @@
+#
+# List of known kvm commands.
+# command name                 category [deprecated] [common]
+#
+kvm-run                                mainporcelain common
+kvm-setup                      mainporcelain common
+kvm-pause                      common
+kvm-resume                     common
+kvm-version                    common
+kvm-list                       common
+kvm-debug                      common
+kvm-balloon                    common
+kvm-stop                       common
+kvm-stat                       common
diff --git a/tools/kvm/config/feature-tests.mak b/tools/kvm/config/feature-tests.mak
new file mode 100644 (file)
index 0000000..bfd10ca
--- /dev/null
@@ -0,0 +1,148 @@
+# Smallest possible C program (prints "hi" with puts).
+# The double quotes are backslash-escaped; presumably because the text is
+# expanded inside a double-quoted shell string by the caller -- confirm.
+define SOURCE_HELLO
+#include <stdio.h>
+int main(void)
+{
+       return puts(\"hi\");
+}
+endef
+
+# C snippet that includes the elfutils libdw headers, requires the
+# _ELFUTILS_PREREQ macro to exist and calls dwarf_begin().
+# Only defined when NO_DWARF is not set.
+ifndef NO_DWARF
+define SOURCE_DWARF
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+#ifndef _ELFUTILS_PREREQ
+#error
+#endif
+
+int main(void)
+{
+       Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+       return (long)dbg;
+}
+endef
+endif
+
+# C snippet that includes <libelf.h> and calls elf_begin() with ELF_C_READ.
+define SOURCE_LIBELF
+#include <libelf.h>
+
+int main(void)
+{
+       Elf *elf = elf_begin(0, ELF_C_READ, 0);
+       return (long)elf;
+}
+endef
+
+# C snippet that includes <gnu/libc-version.h> and calls
+# gnu_get_libc_version().
+define SOURCE_GLIBC
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+       const char *version = gnu_get_libc_version();
+       return (long)version;
+}
+endef
+
+# Same as the plain libelf snippet but uses ELF_C_READ_MMAP, so it only
+# builds when libelf provides that command constant.
+define SOURCE_ELF_MMAP
+#include <libelf.h>
+int main(void)
+{
+       Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+       return (long)elf;
+}
+endef
+
+# C snippet that includes <newt.h> and calls newtInit(), newtCls() and
+# newtFinished(). Only defined when NO_NEWT is not set.
+ifndef NO_NEWT
+define SOURCE_NEWT
+#include <newt.h>
+
+int main(void)
+{
+       newtInit();
+       newtCls();
+       return newtFinished();
+}
+endef
+endif
+
+# C snippet that includes the perl embedding headers and calls perl_alloc().
+# Only defined when NO_LIBPERL is not set.
+ifndef NO_LIBPERL
+define SOURCE_PERL_EMBED
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+perl_alloc();
+return 0;
+}
+endef
+endif
+
+# Python snippets; only defined when NO_LIBPYTHON is not set.
+ifndef NO_LIBPYTHON
+# Fails to compile (#error) when PY_VERSION_HEX is 0x03000000 or newer,
+# i.e. it only builds against Python versions below 3.
+define SOURCE_PYTHON_VERSION
+#include <Python.h>
+#if PY_VERSION_HEX >= 0x03000000
+       #error
+#endif
+int main(void){}
+endef
+# Includes <Python.h> and calls Py_Initialize().
+define SOURCE_PYTHON_EMBED
+#include <Python.h>
+int main(void)
+{
+       Py_Initialize();
+       return 0;
+}
+endef
+endif
+
+# C snippet that includes <bfd.h> and calls bfd_demangle().
+define SOURCE_BFD
+#include <bfd.h>
+
+int main(void)
+{
+       bfd_demangle(0, 0, 0);
+       return 0;
+}
+endef
+
+# C snippet that declares cplus_demangle() itself (no header is included)
+# and calls it, so it only links when some library provides the symbol.
+define SOURCE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+       cplus_demangle(0, 0);
+       return 0;
+}
+endef
+
+# C snippet that declares strlcpy() itself and calls it, so it only links
+# when the C library provides the symbol. <stdlib.h> supplies size_t/NULL.
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+       strlcpy(NULL, NULL, 0);
+       return 0;
+}
+endef
+
+# C snippet that includes <rfb/rfb.h> and calls rfbIsActive().
+define SOURCE_VNCSERVER
+#include <rfb/rfb.h>
+
+int main(void)
+{
+       rfbIsActive((void *)0);
+       return 0;
+}
+endef
+
+# C snippet that includes <SDL/SDL.h> and calls SDL_Init(SDL_INIT_VIDEO).
+define SOURCE_SDL
+#include <SDL/SDL.h>
+
+int main(void)
+{
+       SDL_Init(SDL_INIT_VIDEO);
+       return 0;
+}
+endef
diff --git a/tools/kvm/config/utilities.mak b/tools/kvm/config/utilities.mak
new file mode 100644 (file)
index 0000000..8046182
--- /dev/null
@@ -0,0 +1,188 @@
+# This allows us to work with the newline character:
+# the define body below consists of two empty lines, which GNU make
+# stores as a single newline character.
+define newline
+
+
+endef
+# Re-assign with := so the value is expanded once, right here.
+newline := $(newline)
+
+# nl-escape
+#
+# Usage: escape = $(call nl-escape[,escape])
+#
+# This is used as the common way to specify
+# what should replace a newline when escaping
+# newlines; the default is a bizarre string.
+#
+# Expands to $(1) when it is non-empty, otherwise
+# to the default string below.
+#
+nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+
+# escape-nl
+#
+# Usage: escaped-text = $(call escape-nl,text[,escape])
+#
+# GNU make's $(shell ...) function converts each
+# newline character in the output produced during
+# the expansion to a single space; this may not be
+# desirable.
+#
+# The only solution is to change each newline into
+# something that won't be converted, so that the
+# information can be recovered later with
+# $(call unescape-nl...)
+#
+# Every newline in $(1) is replaced by the escape
+# string selected through nl-escape from $(2).
+#
+escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1))
+
+# unescape-nl
+#
+# Usage: text = $(call unescape-nl,escaped-text[,escape])
+#
+# Inverse of escape-nl: every occurrence of the escape
+# string in $(1) is replaced by a real newline.
+#
+# See escape-nl.
+#
+unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1))
+
+# shell-escape-nl
+#
+# Usage: $(shell some-command | $(call shell-escape-nl[,escape]))
+#
+# Use this to escape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# The awk program joins all input lines with the escape
+# string and prints the result without a trailing newline.
+# The text is printed through an explicit "%s" format so
+# that `%' characters in the input are emitted verbatim
+# instead of being interpreted as printf conversions.
+#
+# NOTE: The escape is used directly as a string constant
+#       in an `awk' program that is delimited by shell
+#       single-quotes, so be wary of the characters
+#       that are chosen.
+#
+define shell-escape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf "%s", t}'
+endef
+
+# shell-unescape-nl
+#
+# Usage: $(shell some-command | $(call shell-unescape-nl[,escape]))
+#
+# Use this to unescape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# The awk program collects all input, replaces every
+# occurrence of the escape with a newline and prints the
+# result without a trailing newline. The text is printed
+# through an explicit "%s" format so that `%' characters
+# in the input are emitted verbatim instead of being
+# interpreted as printf conversions.
+#
+# NOTE: The escape is used directly as an extended regular
+#       expression constant in an `awk' program that is
+#       delimited by shell single-quotes, so be wary
+#       of the characters that are chosen.
+#
+# (The bash shell has a bug where `{gsub(...),...}' is
+#  misinterpreted as a brace expansion; this can be
+#  overcome by putting a space between `{' and `gsub').
+#
+define shell-unescape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf "%s", t }'
+endef
+
+# escape-for-shell-sq
+#
+# Usage: embeddable-text = $(call escape-for-shell-sq,text)
+#
+# This function produces text that is suitable for
+# embedding in a shell string that is delimited by
+# single-quotes.
+#
+# Each single quote in $(1) becomes '\'' (close the
+# quoted string, emit an escaped quote, reopen it).
+#
+escape-for-shell-sq =  $(subst ','\'',$(1))
+
+# shell-sq
+#
+# Usage: single-quoted-and-escaped-text = $(call shell-sq,text)
+#
+# Wraps the escaped text in single quotes. escape-for-shell-sq
+# is referenced without $(call ...), so it sees this call's $(1).
+#
+shell-sq = '$(escape-for-shell-sq)'
+
+# shell-wordify
+#
+# Usage: wordified-text = $(call shell-wordify,text)
+#
+# For instance:
+#
+#  |define text
+#  |hello
+#  |world
+#  |endef
+#  |
+#  |target:
+#  |   echo $(call shell-wordify,$(text))
+#
+# At least GNU make gets confused by expanding a newline
+# within the context of a command line of a makefile rule
+# (this is in contrast to a `$(shell ...)' function call,
+# which can handle it just fine).
+#
+# This function avoids the problem by producing a string
+# that works as a shell word, regardless of whether or
+# not it contains a newline.
+#
+# If the text to be wordified contains a newline, then
+# an intricate shell command substitution is constructed
+# to render the text as a single line; when the shell
+# processes the resulting escaped text, it transforms
+# it into the original unescaped text.
+#
+# If the text does not contain a newline, then this function
+# produces the same results as the `$(shell-sq)' function.
+#
+shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq))
+# Helper for the newline case: single-quote the text, escape its
+# newlines, and let the shell restore them via shell-unescape-nl
+# inside a double-quoted command substitution.
+define _sw-esc-nl
+"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))"
+endef
+
+# is-absolute
+#
+# Usage: bool-value = $(call is-absolute,path)
+#
+# Expands to `y' when the path starts with `/',
+# and to the empty string otherwise.
+#
+is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y)
+
+# lookup
+#
+# Usage: absolute-executable-path-or-empty = $(call lookup,path)
+#
+# Runs `command -v' on the path; the output has its newlines
+# escaped inside the shell and restored again by unescape-nl.
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+lookup = $(call unescape-nl,$(shell sh -c $(_l-sh)))
+# The single-quoted script handed to `sh -c' by lookup.
+_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,))
+
+# is-executable
+#
+# Usage: bool-value = $(call is-executable,path)
+#
+# Expands to `y' when the path is a regular file (`test -f')
+# that is executable (`test -x'), and to the empty string
+# otherwise.
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+is-executable = $(call _is-executable-helper,$(shell-sq))
+# Receives the already single-quoted path as $(1).
+_is-executable-helper = $(shell sh -c $(_is-executable-sh))
+# The single-quoted script handed to `sh -c' by the helper.
+_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y)
+
+# get-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable,path)
+#
+# The goal is to get an absolute path for an executable;
+# the `command -v' is defined by POSIX, but it's not
+# necessarily very portable, so it's only used if
+# relative path resolution is requested, as determined
+# by the absence of a leading `/'.
+#
+# An empty path expands to the empty string.
+#
+get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup)))
+# Absolute path: keep it only when it is executable.
+_ge-abspath = $(if $(is-executable),$(1))
+
+# get-executable-or-default
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
+#
+# If the variable named by $(1) is non-empty, its value is
+# resolved with get-executable; when that fails a warning is
+# printed followed by an error asking to set the variable.
+# Otherwise the default $(2) is resolved; when that fails only
+# the warning is printed and the result is empty.
+#
+define get-executable-or-default
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+endef
+# $(1) is the path to try, $(2) the (optional) variable name
+# to mention in the error.
+_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2)))
+_gea_warn = $(warning The path '$(1)' is not executable.)
+_gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
+
+# try-cc
+# Usage: option = $(call try-cc, source-to-build, cc-options)
+#
+# Pipes the source text into $(CC), which reads it as C from stdin
+# (`-x c -') and builds it with the given options. Expands to `y'
+# when the compiler exits successfully, otherwise to the empty
+# string. The output goes to a temporary file whose name ends in
+# the shell's PID (`$$$$' reaches the shell as `$$'); the file is
+# removed afterwards.
+try-cc = $(shell sh -c                                           \
+       'TMP="$(OUTPUT)$(TMPOUT).$$$$";                           \
+        echo "$(1)" |                                            \
+        $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \
+        rm -f "$$TMP"')
diff --git a/tools/kvm/cpuid.c b/tools/kvm/cpuid.c
new file mode 100644 (file)
index 0000000..c3b3d58
--- /dev/null
@@ -0,0 +1,51 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <sys/ioctl.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define CPUID_FUNC_PERFMON             0x0A
+
+#define        MAX_KVM_CPUID_ENTRIES           100
+
+/*
+ * Filter CPUID functions that are not supported by the hypervisor.
+ *
+ * Walks all 'nent' entries and clears EAX of the performance-monitoring
+ * leaf (CPUID_FUNC_PERFMON); every other entry is left untouched.
+ */
+static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
+{
+       struct kvm_cpuid_entry2 *entry = kvm_cpuid->entries;
+       struct kvm_cpuid_entry2 *end = entry + kvm_cpuid->nent;
+
+       for (; entry < end; entry++) {
+               if (entry->function == CPUID_FUNC_PERFMON)
+                       entry->eax = 0x00; /* disable it */
+       }
+}
+
+/*
+ * Program the CPUID table of a virtual CPU.
+ *
+ * Queries the host-supported CPUID entries (at most MAX_KVM_CPUID_ENTRIES)
+ * through the system fd, filters them with filter_cpuid() and installs the
+ * result on the vcpu fd. Any failure, including running out of memory for
+ * the temporary table, terminates the program via die_perror().
+ */
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu)
+{
+       struct kvm_cpuid2 *kvm_cpuid;
+
+       kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) + MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries));
+       /* The table is dereferenced right below, so a failed allocation must not get through. */
+       if (!kvm_cpuid)
+               die_perror("Allocating KVM CPUID table failed");
+
+       /* Tell the kernel how many entries the buffer can hold. */
+       kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES;
+       if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0)
+               die_perror("KVM_GET_SUPPORTED_CPUID failed");
+
+       filter_cpuid(kvm_cpuid);
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0)
+               die_perror("KVM_SET_CPUID2 failed");
+
+       free(kvm_cpuid);
+}
diff --git a/tools/kvm/disk/blk.c b/tools/kvm/disk/blk.c
new file mode 100644 (file)
index 0000000..59294e8
--- /dev/null
@@ -0,0 +1,39 @@
+#include "kvm/disk-image.h"
+
+/*
+ * Operations table for host block devices.
+ *
+ * raw image and blk dev are similar, so reuse raw image ops for reading
+ * and writing sectors and for closing.
+ */
+static struct disk_image_operations blk_dev_ops = {
+       .read_sector            = raw_image__read_sector,
+       .write_sector           = raw_image__write_sector,
+       .close                  = raw_image__close,
+};
+
+/*
+ * Probe 'filename' as a host block device.
+ *
+ * 'st' is the stat result of 'filename'. Returns a new disk image backed by
+ * the device, or NULL when the file is not a block device, cannot be opened,
+ * its size cannot be queried, or the disk image cannot be created. No file
+ * descriptor stays open on any NULL return.
+ */
+struct disk_image *blkdev__probe(const char *filename, struct stat *st)
+{
+       struct disk_image *disk;
+       u64 size;
+       int fd;
+
+       if (!S_ISBLK(st->st_mode))
+               return NULL;
+
+       /*
+        * Be careful! We are opening host block device!
+        * Open it readonly since we do not want to break user's data on disk.
+        */
+       fd                      = open(filename, O_RDONLY);
+       if (fd < 0)
+               return NULL;
+
+       if (ioctl(fd, BLKGETSIZE64, &size) < 0) {
+               close(fd);
+               return NULL;
+       }
+
+       /*
+        * FIXME: This will not work on 32-bit host because we can not
+        * mmap large disk. There is not enough virtual address space
+        * in 32-bit host. However, this works on 64-bit host.
+        */
+       disk = disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_MMAP);
+       if (!disk)
+               close(fd);      /* nobody owns the descriptor, do not leak it */
+
+       return disk;
+}
diff --git a/tools/kvm/disk/core.c b/tools/kvm/disk/core.c
new file mode 100644 (file)
index 0000000..20e1990
--- /dev/null
@@ -0,0 +1,222 @@
+#include "kvm/disk-image.h"
+#include "kvm/qcow.h"
+
+/*
+ * When non-zero, disk_image__read() and disk_image__write() pass this
+ * value to msleep() before performing the I/O.
+ */
+int debug_iodelay;
+
+/*
+ * Allocate a disk image object for an already opened file descriptor.
+ *
+ * 'size' is the image size in bytes and 'ops' the operations table. When
+ * 'use_mmap' equals DISK_IMAGE_MMAP the whole image is mapped privately and
+ * the mapping is stored in ->priv; otherwise ->priv starts out as NULL.
+ *
+ * Returns the new object, or NULL when the allocation fails. A failing
+ * mmap() terminates the program via die().
+ */
+struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int use_mmap)
+{
+       struct disk_image *disk;
+
+       disk            = malloc(sizeof *disk);
+       if (!disk)
+               return NULL;
+
+       disk->fd        = fd;
+       disk->size      = size;
+       disk->ops       = ops;
+       /* malloc() does not clear memory: never hand out an indeterminate pointer. */
+       disk->priv      = NULL;
+
+       if (use_mmap == DISK_IMAGE_MMAP) {
+               /*
+                * The write to disk image will be discarded
+                */
+               disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+               if (disk->priv == MAP_FAILED)
+                       die("mmap() failed");
+       }
+
+       return disk;
+}
+
+/*
+ * Open 'filename' as a disk image.
+ *
+ * The file is probed in this order: host block device, QCOW image (always
+ * probed read-only, with a warning), raw image. Returns the first image a
+ * probe yields, or NULL when stat()/open() fail or no probe accepts the file.
+ */
+struct disk_image *disk_image__open(const char *filename, bool readonly)
+{
+       struct disk_image *image;
+       struct stat sb;
+       int fd;
+
+       if (stat(filename, &sb) < 0)
+               return NULL;
+
+       /* blk device ?*/
+       image = blkdev__probe(filename, &sb);
+       if (image != NULL)
+               return image;
+
+       fd = open(filename, readonly ? O_RDONLY : O_RDWR);
+       if (fd < 0)
+               return NULL;
+
+       /* qcow image ?*/
+       image = qcow_probe(fd, true);
+       if (image != NULL) {
+               pr_warning("Forcing read-only support for QCOW");
+               return image;
+       }
+
+       /* raw image ?*/
+       image = raw_image__probe(fd, &sb, readonly);
+       if (image == NULL && close(fd) < 0)
+               pr_warning("close() failed");
+
+       return image;
+}
+
+/*
+ * Open 'count' disk images described by the parallel arrays 'filenames'
+ * and 'readonly'. Slots whose filename is NULL stay NULL in the result.
+ *
+ * Returns a calloc()ed array of 'count' image pointers, or NULL when
+ * 'count' is zero or above MAX_DISK_IMAGES, when the array cannot be
+ * allocated, or when any image fails to open (all images opened so far
+ * are closed again in that case).
+ */
+struct disk_image **disk_image__open_all(const char **filenames, bool *readonly, int count)
+{
+       struct disk_image **disks;
+       int i, j;
+
+       if (count == 0 || count > MAX_DISK_IMAGES)
+               return NULL;
+
+       disks = calloc(count, sizeof(*disks));
+       if (disks == NULL)
+               return NULL;
+
+       for (i = 0; i < count; i++) {
+               if (filenames[i] == NULL)
+                       continue;
+
+               disks[i] = disk_image__open(filenames[i], readonly[i]);
+               if (disks[i] != NULL)
+                       continue;
+
+               pr_error("Loading disk image '%s' failed", filenames[i]);
+
+               /* Unwind: unopened slots are NULL, which disk_image__close() accepts. */
+               for (j = 0; j < count; j++)
+                       disk_image__close(disks[j]);
+
+               free(disks);
+               return NULL;
+       }
+
+       return disks;
+}
+
+/*
+ * Flush a disk image: use the image's own flush operation when it has one,
+ * otherwise fsync() the backing file descriptor. Returns that call's result.
+ */
+int disk_image__flush(struct disk_image *disk)
+{
+       if (!disk->ops->flush)
+               return fsync(disk->fd);
+
+       return disk->ops->flush(disk);
+}
+
+/*
+ * Close a disk image. A NULL image is accepted and ignored.
+ *
+ * When the image provides its own close operation, that operation is called
+ * and its result returned. Otherwise the file descriptor is closed (a
+ * failure only produces a warning), the object is freed and 0 is returned.
+ */
+int disk_image__close(struct disk_image *disk)
+{
+       int rc = 0;
+
+       if (disk) {
+               if (disk->ops->close) {
+                       rc = disk->ops->close(disk);
+               } else {
+                       if (close(disk->fd) < 0)
+                               pr_warning("close() failed");
+
+                       free(disk);
+               }
+       }
+
+       return rc;
+}
+
+/*
+ * Close 'count' disk images, from the last array slot down to the first,
+ * and free the array itself.
+ */
+void disk_image__close_all(struct disk_image **disks, int count)
+{
+       int remaining;
+
+       for (remaining = count; remaining != 0; remaining--)
+               disk_image__close(disks[remaining - 1]);
+
+       free(disks);
+}
+
+/*
+ * Fill iov with disk data, starting from sector 'sector'.
+ * Return amount of bytes read, or -1 when an operation fails or reads less
+ * than a full buffer. Dies when the image has no read operation at all.
+ */
+ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount)
+{
+       ssize_t total = 0;
+       int i;
+
+       if (debug_iodelay)
+               msleep(debug_iodelay);
+
+       /* Preferred path: one call handles the whole vector. */
+       if (disk->ops->read_sector_iov) {
+               total = disk->ops->read_sector_iov(disk, sector, iov, iovcount);
+               if (total < 0) {
+                       pr_info("disk_image__read error: total=%ld\n", (long)total);
+                       return -1;
+               }
+
+               return total;
+       }
+
+       /* Fallback: read the buffers one at a time. */
+       if (disk->ops->read_sector) {
+               for (i = 0; i < iovcount; i++) {
+                       const struct iovec *vec = &iov[i];
+                       ssize_t nr;
+
+                       nr = disk->ops->read_sector(disk, sector, vec->iov_base, vec->iov_len);
+                       if (nr != (ssize_t)vec->iov_len) {
+                               pr_info("disk_image__read error: nr = %ld iov_len=%ld\n", (long)nr, (long)vec->iov_len);
+                               return -1;
+                       }
+
+                       sector  += vec->iov_len >> SECTOR_SHIFT;
+                       total   += nr;
+               }
+
+               return total;
+       }
+
+       die("No disk image operation for read\n");
+
+       return total;
+}
+
+/*
+ * Write iov to disk, starting from sector 'sector'.
+ * Return amount of bytes written, or -1 when an operation fails or writes
+ * less than a full buffer. Dies when the image has no write operation.
+ */
+ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount)
+{
+       ssize_t total = 0;
+       ssize_t nr;
+
+       if (debug_iodelay)
+               msleep(debug_iodelay);
+
+       if (disk->ops->write_sector_iov) {
+               /*
+                * Try writev based operation first
+                */
+               total = disk->ops->write_sector_iov(disk, sector, iov, iovcount);
+               if (total < 0) {
+                       pr_info("disk_image__write error: total=%ld\n", (long)total);
+                       return -1;
+               }
+       } else if (disk->ops->write_sector) {
+               /*
+                * Fallback to single buffer based operation
+                */
+               while (iovcount--) {
+                       nr       = disk->ops->write_sector(disk, sector, iov->iov_base, iov->iov_len);
+                       if (nr != (ssize_t)iov->iov_len) {
+                               pr_info("disk_image__write error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+                               return -1;
+                       }
+
+                       /* Advance by the number of sectors covered by this buffer. */
+                       sector  += iov->iov_len >> SECTOR_SHIFT;
+                       iov++;
+                       total   += nr;
+               }
+       } else
+               /* This is the write path: name the operation that is actually missing. */
+               die("No disk image operation for write\n");
+
+       return total;
+}
+
+/*
+ * Build a serial string for the disk from the device, rdev and inode
+ * numbers of its backing file.
+ *
+ * 'buffer' receives the string; on entry '*len' is the buffer size, on
+ * return it holds the number of characters stored (without the NUL).
+ * Returns that same count, or 0 (leaving '*len' untouched) when fstat()
+ * fails.
+ */
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len)
+{
+       struct stat st;
+       ssize_t n;
+
+       if (fstat(disk->fd, &st) != 0)
+               return 0;
+
+       n = snprintf(buffer, *len, "%llu%llu%llu", (u64)st.st_dev, (u64)st.st_rdev, (u64)st.st_ino);
+
+       /*
+        * On truncation snprintf() returns the length the full string would
+        * have had; report only what actually fits into the buffer.
+        */
+       if (n >= *len)
+               n = *len > 0 ? *len - 1 : 0;
+
+       *len = n;
+       return *len;
+}
diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
new file mode 100644 (file)
index 0000000..2471aeb
--- /dev/null
@@ -0,0 +1,1099 @@
+#include "kvm/qcow.h"
+
+#include "kvm/disk-image.h"
+#include "kvm/read-write.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/*
+ * Insert 'new' into the red-black tree of cached L2 tables, keyed by its
+ * offset. A table with the same offset already in the tree is left alone.
+ * Returns 0, or -1 when a tree node does not resolve to a table.
+ */
+static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
+{
+       struct rb_node **slot = &(root->rb_node);
+       struct rb_node *parent = NULL;
+       u64 key = new->offset;
+
+       /* Descend to the empty slot where the new node belongs. */
+       while (*slot) {
+               struct qcow_l2_table *cur;
+
+               cur = rb_entry(*slot, struct qcow_l2_table, node);
+               if (!cur)
+                       return -1;
+
+               parent = *slot;
+
+               if (cur->offset == key)
+                       return 0;
+
+               if (cur->offset > key)
+                       slot = &(*slot)->rb_left;
+               else
+                       slot = &(*slot)->rb_right;
+       }
+
+       /* Link the node at the slot and rebalance. */
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, root);
+
+       return 0;
+}
+
+/*
+ * Find the cached L2 table with the given offset in the red-black tree.
+ * Returns the table, or NULL when the tree holds no such offset.
+ */
+static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset)
+{
+       struct rb_node *cursor = root->rb_node;
+
+       while (cursor) {
+               struct qcow_l2_table *cur;
+
+               cur = rb_entry(cursor, struct qcow_l2_table, node);
+               if (!cur)
+                       break;
+
+               if (cur->offset == offset)
+                       return cur;
+
+               cursor = (cur->offset > offset) ? cursor->rb_left : cursor->rb_right;
+       }
+
+       return NULL;
+}
+
+/*
+ * Drop every cached L2 table: each one is unlinked from the LRU list,
+ * erased from the red-black tree and freed.
+ */
+static void l1_table_free_cache(struct qcow_l1_table *l1t)
+{
+       struct list_head *cur, *next;
+
+       list_for_each_safe(cur, next, &l1t->lru_list) {
+               struct qcow_l2_table *victim;
+
+               victim = list_entry(cur, struct qcow_l2_table, list);
+
+               /* Remove cache table from the list and RB tree */
+               list_del(cur);
+               rb_erase(&victim->node, &l1t->root);
+
+               free(victim);
+       }
+}
+
+/*
+ * Write a cached L2 table back to its offset in the image file if it is
+ * marked dirty, then clear the dirty flag. Clean tables are not written.
+ * Returns 0 on success and -1 when the write fails.
+ */
+static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
+{
+       struct qcow_header *header = q->header;
+       u64 nr_entries;
+
+       if (c->dirty) {
+               nr_entries = 1 << header->l2_bits;
+
+               if (pwrite_in_full(q->fd, c->table, nr_entries * sizeof(u64), c->offset) < 0)
+                       return -1;
+
+               c->dirty = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Add an L2 table to the cache (red-black tree plus LRU list).
+ *
+ * When the cache already holds MAX_CACHE_NODES tables, the table at the
+ * head of the LRU list is written back if dirty, removed and freed first.
+ * Returns 0 on success, -1 when the write-back or the tree insert fails.
+ */
+static int cache_table(struct qcow *q, struct qcow_l2_table *c)
+{
+       struct qcow_l1_table *l1t = &q->table;
+       struct rb_root *tree = &l1t->root;
+
+       if (l1t->nr_cached == MAX_CACHE_NODES) {
+               struct qcow_l2_table *victim;
+
+               /* The list head is the least recently used table. */
+               victim = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
+
+               if (qcow_l2_cache_write(q, victim) < 0)
+                       return -1;
+
+               rb_erase(&victim->node, tree);
+               list_del_init(&victim->list);
+               l1t->nr_cached--;
+
+               free(victim);
+       }
+
+       /* The tree gives fast lookup by offset. */
+       if (l2_table_insert(tree, c) < 0)
+               return -1;
+
+       /* The list tail is the most recently used position. */
+       list_add_tail(&c->list, &l1t->lru_list);
+       l1t->nr_cached++;
+
+       return 0;
+}
+
+/*
+ * Look up a cached L2 table by offset and, on a hit, mark it as most
+ * recently used by moving it to the tail of the LRU list.
+ * Returns the table or NULL when it is not cached.
+ */
+static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset)
+{
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *hit = l2_table_lookup(&l1t->root, offset);
+
+       if (hit)
+               list_move_tail(&hit->list, &l1t->lru_list);
+
+       return hit;
+}
+
+/*
+ * Allocates a zeroed cache node for an L2 table, with room for
+ * 2^l2_bits 64-bit entries, and records 'offset' in it.
+ * Returns NULL when the allocation fails.
+ */
+static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l2_table *tbl;
+       u64 nr_entries;
+
+       nr_entries = 1 << header->l2_bits;
+
+       tbl = calloc(1, sizeof(*tbl) + nr_entries * sizeof(u64));
+       if (!tbl)
+               return NULL;
+
+       tbl->offset = offset;
+       RB_CLEAR_NODE(&tbl->node);
+       INIT_LIST_HEAD(&tbl->list);
+
+       return tbl;
+}
+
+/* L1 table index of 'offset': the bits above the L2 index and cluster offset. */
+static inline u64 get_l1_index(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       u64 idx = offset >> (header->l2_bits + header->cluster_bits);
+
+       return idx;
+}
+
+/* L2 table index of 'offset': the l2_bits bits just above the cluster offset. */
+static inline u64 get_l2_index(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       u64 cluster_nr = offset >> (header->cluster_bits);
+
+       return cluster_nr & ((1 << header->l2_bits)-1);
+}
+
+/* Byte position of 'offset' inside its cluster: the low cluster_bits bits. */
+static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       u64 in_cluster = offset & ((1 << header->cluster_bits)-1);
+
+       return in_cluster;
+}
+
+/*
+ * Return the L2 table stored at 'offset' in the image file.
+ *
+ * The cache is consulted first; on a miss a new cache node is allocated,
+ * filled from disk and added to the cache. Returns NULL when allocation,
+ * the read or the cache insert fails.
+ */
+static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l2_table *tbl;
+       u64 nr_entries;
+
+       nr_entries = 1 << header->l2_bits;
+
+       /* Cache hit? */
+       tbl = l2_table_search(q, offset);
+       if (tbl)
+               return tbl;
+
+       tbl = new_cache_table(q, offset);
+       if (!tbl)
+               return NULL;
+
+       /* Fill the node from disk, then publish it in the cache. */
+       if (pread_in_full(q->fd, tbl->table, nr_entries * sizeof(u64), offset) < 0 ||
+           cache_table(q, tbl) < 0) {
+               free(tbl);
+               return NULL;
+       }
+
+       return tbl;
+}
+
+/*
+ * Read data from the cluster containing image offset 'offset' into 'dst'.
+ *
+ * At most 'dst_len' bytes are read, and never beyond the end of the
+ * cluster. A cluster whose L1 or L2 entry is zero after masking is
+ * returned as zeroes. Returns the number of bytes placed in 'dst', or -1
+ * on an out-of-range index, a compressed entry, or a failed table or
+ * data read.
+ */
+static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *l2t;
+       u64 cluster_size;
+       u64 clust_offset;
+       u64 clust_start;
+       u64 l2t_offset;
+       size_t length;
+       u64 l2t_size;
+       u64 l1_idx;
+       u64 l2_idx;
+
+       cluster_size = 1 << header->cluster_bits;
+
+       l1_idx = get_l1_index(q, offset);
+       if (l1_idx >= l1t->table_size)
+               return -1;
+
+       clust_offset = get_cluster_offset(q, offset);
+       if (clust_offset >= cluster_size)
+               return -1;
+
+       /* Clamp the request to what is left of this cluster. */
+       length = cluster_size - clust_offset;
+       if (length > dst_len)
+               length = dst_len;
+
+       /* The mutex is held while the L1 entry and the cached L2 table are consulted. */
+       mutex_lock(&q->mutex);
+
+       l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+       if (l2t_offset & QCOW_OFLAG_COMPRESSED) {
+               pr_warning("compressed sectors are not supported");
+               goto out_error;
+       }
+
+       l2t_offset &= QCOW_OFFSET_MASK;
+       if (!l2t_offset)
+               goto zero_cluster;
+
+       l2t_size = 1 << header->l2_bits;
+
+       /* read and cache level 2 table */
+       l2t = qcow_read_l2_table(q, l2t_offset);
+       if (!l2t)
+               goto out_error;
+
+       l2_idx = get_l2_index(q, offset);
+       if (l2_idx >= l2t_size)
+               goto out_error;
+
+       clust_start = be64_to_cpu(l2t->table[l2_idx]);
+       if (clust_start & QCOW_OFLAG_COMPRESSED) {
+               pr_warning("compressed sectors are not supported");
+               goto out_error;
+       }
+
+       clust_start &= QCOW_OFFSET_MASK;
+       if (!clust_start)
+               goto zero_cluster;
+
+       /* The lock is dropped before the data itself is read. */
+       mutex_unlock(&q->mutex);
+
+       if (pread_in_full(q->fd, dst, length, clust_start + clust_offset) < 0)
+               return -1;
+
+       return length;
+
+zero_cluster:
+       mutex_unlock(&q->mutex);
+       memset(dst, 0, length);
+       return length;
+
+out_error:
+       mutex_unlock(&q->mutex);
+       /* NOTE(review): this store has no effect; 'length' is not used afterwards. */
+       length = -1;
+       return -1;
+}
+
+/*
+ * Read 'dst_len' bytes starting at sector 'sector' into 'dst', one
+ * cluster-bounded chunk at a time.
+ *
+ * Returns 'dst_len' on success, or -1 when a chunk starts at or beyond the
+ * image size recorded in the header or when a cluster read fails.
+ */
+static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector, void *dst, u32 dst_len)
+{
+       struct qcow *q = disk->priv;
+       struct qcow_header *header = q->header;
+       u32 nr_read;
+       u64 offset;
+       char *buf;
+       ssize_t nr;
+
+       buf             = dst;
+       nr_read         = 0;
+
+       while (nr_read < dst_len) {
+               offset          = sector << SECTOR_SHIFT;
+               if (offset >= header->size)
+                       return -1;
+
+               /*
+                * qcow_read_cluster() returns ssize_t. The result must stay
+                * signed: stored in an unsigned variable a -1 error would turn
+                * into a huge byte count and slip past the check below.
+                */
+               nr = qcow_read_cluster(q, offset, buf, dst_len - nr_read);
+               if (nr <= 0)
+                       return -1;
+
+               nr_read         += nr;
+               buf             += nr;
+               sector          += (nr >> SECTOR_SHIFT);
+       }
+
+       return dst_len;
+}
+
+/* Size in bytes of the file behind 'fd' as reported by fstat(); 0 when fstat() fails. */
+static inline u64 file_size(int fd)
+{
+       struct stat sb;
+
+       return fstat(fd, &sb) < 0 ? 0 : sb.st_size;
+}
+
+/*
+ * Write 'count' bytes from 'buf' at 'offset' and then fdatasync() the file.
+ * Returns -1 when the write fails, otherwise the result of fdatasync().
+ */
+static inline int qcow_pwrite_sync(int fd, void *buf, size_t count, off_t offset)
+{
+       int rc = -1;
+
+       if (pwrite_in_full(fd, buf, count, offset) >= 0)
+               rc = fdatasync(fd);
+
+       return rc;
+}
+
+/*
+ * Writes a level 2 table at the end of the file, at the next
+ * cluster-aligned offset. Returns that offset, or 0 when the file size
+ * is reported as zero or the write fails.
+ */
+static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
+{
+       struct qcow_header *header = q->header;
+       u64 end_of_file = file_size(q->fd);
+       u64 cluster_bytes;
+       u64 nr_entries;
+       u64 dest;
+
+       if (end_of_file == 0)
+               return 0;
+
+       nr_entries      = 1 << header->l2_bits;
+       cluster_bytes   = 1 << header->cluster_bits;
+       dest            = ALIGN(end_of_file, cluster_bytes);
+
+       if (pwrite_in_full(q->fd, table, nr_entries * sizeof(u64), dest) < 0)
+               return 0;
+
+       return dest;
+}
+
+/*
+ * Drop every cached refcount block: each one is unlinked from the LRU
+ * list, erased from the red-black tree and freed.
+ */
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+       struct list_head *cur, *next;
+
+       list_for_each_safe(cur, next, &rft->lru_list) {
+               struct qcow_refcount_block *victim;
+
+               victim = list_entry(cur, struct qcow_refcount_block, list);
+
+               list_del(cur);
+               rb_erase(&victim->node, &rft->root);
+
+               free(victim);
+       }
+}
+
+/*
+ * Insert 'new' into the red-black tree of cached refcount blocks, keyed by
+ * its offset. A block with the same offset already in the tree is left
+ * alone. Returns 0, or -1 when a tree node does not resolve to a block.
+ */
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+       struct rb_node **slot = &(root->rb_node);
+       struct rb_node *parent = NULL;
+       u64 key = new->offset;
+
+       /* Descend to the empty slot where the new node belongs. */
+       while (*slot) {
+               struct qcow_refcount_block *cur;
+
+               cur = rb_entry(*slot, struct qcow_refcount_block, node);
+               if (!cur)
+                       return -1;
+
+               parent = *slot;
+
+               if (cur->offset == key)
+                       return 0;
+
+               if (cur->offset > key)
+                       slot = &(*slot)->rb_left;
+               else
+                       slot = &(*slot)->rb_right;
+       }
+
+       /* Link the node at the slot and rebalance. */
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, root);
+
+       return 0;
+}
+
+/*
+ * Write a cached refcount block back to its offset in the image file if it
+ * is marked dirty, then clear the dirty flag. Clean blocks are not written.
+ * Returns 0 on success and -1 when the write fails.
+ */
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+       if (rfb->dirty) {
+               if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+                       return -1;
+
+               rfb->dirty = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Add a refcount block to the cache (red-black tree plus LRU list).
+ *
+ * When the cache already holds MAX_CACHE_NODES blocks, the block at the
+ * head of the LRU list is written back if dirty, removed and freed first.
+ * Returns 0 on success, -1 when the write-back or the tree insert fails.
+ */
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct rb_root *tree = &rft->root;
+
+       if (rft->nr_cached == MAX_CACHE_NODES) {
+               struct qcow_refcount_block *victim;
+
+               /* The list head is the least recently used block. */
+               victim = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+               if (write_refcount_block(q, victim) < 0)
+                       return -1;
+
+               rb_erase(&victim->node, tree);
+               list_del_init(&victim->list);
+               rft->nr_cached--;
+
+               free(victim);
+       }
+
+       if (refcount_block_insert(tree, c) < 0)
+               return -1;
+
+       /* The list tail is the most recently used position. */
+       list_add_tail(&c->list, &rft->lru_list);
+       rft->nr_cached++;
+
+       return 0;
+}
+
+/*
+ * Allocate a cache node for the refcount block stored at 'rfb_offset'.
+ *
+ * The node has room for one cluster worth of 16-bit entries and starts out
+ * zeroed, so in particular its dirty flag is clear. Returns NULL when the
+ * allocation fails.
+ */
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_block *rfb;
+       u64 cluster_size;
+
+       cluster_size = 1 << header->cluster_bits;
+
+       /*
+        * Zero the node like new_cache_table() does: write_refcount_block()
+        * reads ->dirty, which must not be left indeterminate.
+        */
+       rfb = calloc(1, sizeof *rfb + cluster_size);
+       if (!rfb)
+               return NULL;
+
+       rfb->offset = rfb_offset;
+       rfb->size = cluster_size / sizeof(u16);
+       RB_CLEAR_NODE(&rfb->node);
+       INIT_LIST_HEAD(&rfb->list);
+
+       return rfb;
+}
+
+/*
+ * Find the cached refcount block with the given offset in the red-black
+ * tree. Returns the block, or NULL when the tree holds no such offset.
+ */
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+       struct rb_node *cursor = root->rb_node;
+
+       while (cursor) {
+               struct qcow_refcount_block *cur;
+
+               cur = rb_entry(cursor, struct qcow_refcount_block, node);
+               if (!cur)
+                       break;
+
+               if (cur->offset == offset)
+                       return cur;
+
+               cursor = (cur->offset > offset) ? cursor->rb_left : cursor->rb_right;
+       }
+
+       return NULL;
+}
+
+/*
+ * Look up a cached refcount block by offset and, on a hit, mark it as most
+ * recently used by moving it to the tail of the LRU list.
+ * Returns the block or NULL when it is not cached.
+ */
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *hit = refcount_block_lookup(&rft->root, offset);
+
+       if (hit)
+               list_move_tail(&hit->list, &rft->lru_list);
+
+       return hit;
+}
+
+/*
+ * Return the refcount block that covers cluster @clust_idx, reading it from
+ * the image and inserting it into the cache when it is not cached yet.
+ *
+ * Returns NULL when @clust_idx lies outside the refcount table, when the
+ * table entry is zero (no refcount block allocated), or when allocating,
+ * reading or caching the block fails.
+ */
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+       u64 rfb_offset;
+       u64 rft_idx;
+
+       rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+       if (rft_idx >= rft->rf_size)
+               return NULL;
+
+       rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+
+       /*
+        * A zero entry means no refcount block exists for this range. Reading
+        * "the block at offset 0" would load the image header as refcounts,
+        * and a later write-back of that block would overwrite the header.
+        */
+       if (!rfb_offset)
+               return NULL;
+
+       rfb = refcount_block_search(q, rfb_offset);
+       if (rfb)
+               return rfb;
+
+       rfb = new_refcount_block(q, rfb_offset);
+       if (!rfb)
+               return NULL;
+
+       if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+               goto error_free_rfb;
+
+       if (cache_refcount_block(q, rfb) < 0)
+               goto error_free_rfb;
+
+       return rfb;
+
+error_free_rfb:
+       free(rfb);
+
+       return NULL;
+}
+
+/*
+ * QCOW file might grow during a write operation. Not only data but metadata is
+ * also written at the end of the file. Therefore it is necessary to ensure
+ * every write is committed to disk. Hence we use qcow_pwrite_sync() to
+ * synchronize the in-core state of QCOW image to disk.
+ *
+ * We also try to restore the image to a consistent state if the metadata
+ * operation fails. The two metadata operations are: level 1 and level 2 table
+ * update. If either of them fails the image is truncated to a consistent state.
+ *
+ * Writes at most one cluster: the data from @buf is written at guest offset
+ * @offset, truncated to the end of the cluster containing @offset and to
+ * @src_len. Returns the number of bytes written, or -1 on error.
+ *
+ * The L1/L2/refcount bookkeeping runs under q->mutex; the data itself is
+ * written after the mutex has been released.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_l2_table *l2t;
+       u64 clust_start;
+       u64 clust_flags;
+       u64 l2t_offset;
+       u64 clust_off;
+       u64 l2t_size;
+       u64 clust_sz;
+       u64 l1t_idx;
+       u64 l2t_idx;
+       u64 f_sz;
+       u64 len;
+
+       l2t             = NULL;
+       l2t_size        = 1 << header->l2_bits;
+       clust_sz        = 1 << header->cluster_bits;
+
+       /* Split the guest offset into L1 index, L2 index and in-cluster offset */
+       l1t_idx = get_l1_index(q, offset);
+       if (l1t_idx >= l1t->table_size)
+               return -1;
+
+       l2t_idx = get_l2_index(q, offset);
+       if (l2t_idx >= l2t_size)
+               return -1;
+
+       clust_off = get_cluster_offset(q, offset);
+       if (clust_off >= clust_sz)
+               return -1;
+
+       /* Never write across a cluster boundary */
+       len = clust_sz - clust_off;
+       if (len > src_len)
+               len = src_len;
+
+       mutex_lock(&q->mutex);
+
+       l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
+       if (l2t_offset & QCOW_OFLAG_COMPRESSED) {
+               pr_warning("compressed clusters are not supported");
+               goto error;
+       }
+       /*
+        * NOTE(review): an all-zero (unallocated) L1 entry has no COPIED flag
+        * either, so it appears to be rejected here before the allocation
+        * branch below can run - confirm this is intended.
+        */
+       if (!(l2t_offset & QCOW_OFLAG_COPIED)) {
+               pr_warning("L2 copy-on-write clusters are not supported");
+               goto error;
+       }
+
+       l2t_offset &= QCOW_OFFSET_MASK;
+       if (l2t_offset) {
+               /* read and cache l2 table */
+               l2t = qcow_read_l2_table(q, l2t_offset);
+               if (!l2t)
+                       goto error;
+       } else {
+               l2t = new_cache_table(q, l2t_offset);
+               if (!l2t)
+                       goto error;
+
+               /* Capture the state of the consistent QCOW image */
+               f_sz = file_size(q->fd);
+               if (!f_sz)
+                       goto free_cache;
+
+               /* Write the l2 table of 0's at the end of the file */
+               l2t_offset = qcow_write_l2_table(q, l2t->table);
+               if (!l2t_offset)
+                       goto free_cache;
+
+               /* On failure, roll the file back to the size captured above */
+               if (cache_table(q, l2t) < 0) {
+                       if (ftruncate(q->fd, f_sz) < 0)
+                               goto free_cache;
+
+                       goto free_cache;
+               }
+
+               /*
+                * Update the in-core entry
+                * NOTE(review): the entry is stored without QCOW_OFLAG_COPIED,
+                * which the check above requires - confirm.
+                */
+               l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
+       }
+
+       /* Capture the state of the consistent QCOW image */
+       f_sz            = file_size(q->fd);
+       if (!f_sz)
+               goto error;
+
+       clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+
+       clust_flags = clust_start & QCOW_OFLAGS_MASK;
+       if (clust_flags & QCOW_OFLAG_COMPRESSED) {
+               pr_warning("compressed clusters are not supported");
+               goto error;
+       }
+
+       clust_start &= QCOW_OFFSET_MASK;
+       if (!clust_start) {
+               /* Unallocated: place the new cluster at the cluster-aligned end of file */
+               clust_start             = ALIGN(f_sz, clust_sz);
+               l2t->table[l2t_idx]     = cpu_to_be64(clust_start | QCOW_OFLAG_COPIED);
+               l2t->dirty              = 1;
+       }
+
+       /* The L2 entry read from the table was not marked COPIED: check the refcount */
+       if (!(clust_flags & QCOW_OFLAG_COPIED)) {
+               struct qcow_refcount_block *rfb = NULL;
+               u16 clust_refcount;
+               u64 clust_idx;
+               u64 rfb_idx;
+
+               clust_idx = (clust_start & QCOW_OFFSET_MASK) >> (header->cluster_bits);
+
+               rfb = qcow_read_refcount_block(q, clust_idx);
+               if (!rfb) {
+                       pr_warning("L1: error while reading refcount table");
+                       goto error;
+               }
+
+               /* Index of this cluster's entry inside its refcount block */
+               rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+               if (rfb_idx >= rfb->size) {
+                       pr_warning("L1: refcount block index out of bounds");
+                       goto error;
+               }
+
+               /* A zero refcount is raised to 1 in-core; the block is flagged dirty */
+               clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
+               if (!clust_refcount) {
+                       clust_refcount = 1;
+                       rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
+                       rfb->dirty = 1;
+               }
+
+               if (clust_refcount > 1) {
+                       pr_warning("L1 copy-on-write clusters are not supported");
+                       goto error;
+               }
+       }
+
+       mutex_unlock(&q->mutex);
+
+       /* Write actual data */
+       if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
+               return -1;
+
+       return len;
+
+free_cache:
+       /* Only reached for an L2 table that was allocated here and never cached */
+       free(l2t);
+error:
+       mutex_unlock(&q->mutex);
+       return -1;
+}
+
+/*
+ * Write @src_len bytes from @src to the image starting at @sector, handing the
+ * data to qcow_write_cluster() piece by piece. Returns the number of bytes
+ * written, or -1 if a piece fails or the write reaches header->size.
+ */
+static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+       struct qcow *q = disk->priv;
+       struct qcow_header *header = q->header;
+       u64 pos = sector << SECTOR_SHIFT;
+       char *cursor = src;
+       u32 done;
+
+       for (done = 0; done < src_len; ) {
+               ssize_t chunk;
+
+               /* Refuse to go past the image size recorded in the header */
+               if (pos >= header->size)
+                       return -1;
+
+               chunk = qcow_write_cluster(q, pos, cursor, src_len - done);
+               if (chunk < 0)
+                       return -1;
+
+               done    += chunk;
+               cursor  += chunk;
+               pos     += chunk;
+       }
+
+       return done;
+}
+
+/*
+ * write_sector hook for read-only images: logs the attempt and always fails.
+ */
+static ssize_t qcow_nowrite_sector(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+       const ssize_t io_error = -1;
+
+       pr_info("%s: no write support\n", __func__);
+
+       return io_error;
+}
+
+/*
+ * Flush all cached metadata to the image: first the refcount blocks, then the
+ * cached L2 tables, then the L1 table, with a sync between the stages.
+ * Returns the result of the final fsync(), or -1 if any stage fails.
+ */
+static int qcow_disk_flush(struct disk_image *disk)
+{
+       struct qcow *q = disk->priv;
+       struct qcow_header *header = q->header;
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct list_head *cur, *next;
+
+       mutex_lock(&q->mutex);
+
+       /* Stage 1: refcount blocks */
+       list_for_each_safe(cur, next, &rft->lru_list) {
+               struct qcow_refcount_block *rfb = list_entry(cur, struct qcow_refcount_block, list);
+
+               if (write_refcount_block(q, rfb) < 0)
+                       goto fail;
+       }
+
+       if (fdatasync(disk->fd) < 0)
+               goto fail;
+
+       /* Stage 2: cached L2 tables */
+       list_for_each_safe(cur, next, &l1t->lru_list) {
+               struct qcow_l2_table *l2t = list_entry(cur, struct qcow_l2_table, list);
+
+               if (qcow_l2_cache_write(q, l2t) < 0)
+                       goto fail;
+       }
+
+       if (fdatasync(disk->fd) < 0)
+               goto fail;
+
+       /* Stage 3: the L1 table itself */
+       if (pwrite_in_full(disk->fd, l1t->l1_table, l1t->table_size * sizeof(u64), header->l1_table_offset) < 0)
+               goto fail;
+
+       mutex_unlock(&q->mutex);
+
+       /* The final sync runs without the mutex held */
+       return fsync(disk->fd);
+
+fail:
+       mutex_unlock(&q->mutex);
+       return -1;
+}
+
+/*
+ * Release everything hanging off disk->priv: both metadata caches, the
+ * refcount and L1 tables, the header and the qcow descriptor itself.
+ * A NULL @disk is accepted. Always returns 0.
+ */
+static int qcow_disk_close(struct disk_image *disk)
+{
+       if (disk) {
+               struct qcow *q = disk->priv;
+
+               /* Drop the cached blocks before the tables they belong to */
+               refcount_table_free_cache(&q->refcount_table);
+               l1_table_free_cache(&q->table);
+
+               free(q->refcount_table.rf_table);
+               free(q->table.l1_table);
+               free(q->header);
+               free(q);
+       }
+
+       return 0;
+}
+
+/*
+ * Operations for images opened read-only: writes are rejected by
+ * qcow_nowrite_sector() and no flush hook is installed.
+ */
+static struct disk_image_operations qcow_disk_readonly_ops = {
+       .read_sector            = qcow_read_sector,
+       .write_sector           = qcow_nowrite_sector,
+       .close                  = qcow_disk_close,
+};
+
+/*
+ * Operations for writable images: real write support plus a flush hook that
+ * writes the cached metadata back to the image.
+ */
+static struct disk_image_operations qcow_disk_ops = {
+       .read_sector            = qcow_read_sector,
+       .write_sector           = qcow_write_sector,
+       .flush                  = qcow_disk_flush,
+       .close                  = qcow_disk_close,
+};
+
+/*
+ * Size, allocate and load the refcount table described by the header, and
+ * initialise the (empty) refcount block cache. Returns -1 if the allocation
+ * fails, otherwise the result of reading the table from the image.
+ */
+static int qcow_read_refcount_table(struct qcow *q)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_header *header = q->header;
+       u64 cluster_size = 1 << header->cluster_bits;
+
+       /* The table occupies refcount_table_size clusters of u64 entries */
+       rft->rf_size = (header->refcount_table_size * cluster_size) / sizeof(u64);
+       rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+       if (!rft->rf_table)
+               return -1;
+
+       /* Start with an empty block cache */
+       rft->root = RB_ROOT;
+       INIT_LIST_HEAD(&rft->lru_list);
+
+       return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
+/*
+ * Allocate the in-core L1 table (header->l1_size u64 entries) and load it
+ * from header->l1_table_offset. Returns -1 if the allocation fails, otherwise
+ * the result of the read.
+ */
+static int qcow_read_l1_table(struct qcow *q)
+{
+       struct qcow_l1_table *l1t = &q->table;
+       struct qcow_header *hdr = q->header;
+
+       l1t->table_size = hdr->l1_size;
+       l1t->l1_table   = calloc(l1t->table_size, sizeof(u64));
+
+       if (l1t->l1_table == NULL)
+               return -1;
+
+       return pread_in_full(q->fd, l1t->l1_table, sizeof(u64) * l1t->table_size, hdr->l1_table_offset);
+}
+
+/*
+ * Read the on-disk QCOW2 header from the start of @fd, convert its fields
+ * from big-endian, and return a newly allocated struct qcow_header holding
+ * the values this driver uses. Returns NULL on allocation or read failure.
+ */
+static void *qcow2_read_header(int fd)
+{
+       struct qcow2_header_disk disk_hdr;
+       struct qcow_header *hdr;
+
+       hdr = malloc(sizeof(struct qcow_header));
+       if (hdr == NULL)
+               return NULL;
+
+       if (pread_in_full(fd, &disk_hdr, sizeof(struct qcow2_header_disk), 0) < 0)
+               goto err_free;
+
+       /* Convert every multi-byte field to host byte order in place */
+       be32_to_cpus(&disk_hdr.magic);
+       be32_to_cpus(&disk_hdr.version);
+       be64_to_cpus(&disk_hdr.backing_file_offset);
+       be32_to_cpus(&disk_hdr.backing_file_size);
+       be32_to_cpus(&disk_hdr.cluster_bits);
+       be64_to_cpus(&disk_hdr.size);
+       be32_to_cpus(&disk_hdr.crypt_method);
+       be32_to_cpus(&disk_hdr.l1_size);
+       be64_to_cpus(&disk_hdr.l1_table_offset);
+       be64_to_cpus(&disk_hdr.refcount_table_offset);
+       be32_to_cpus(&disk_hdr.refcount_table_clusters);
+       be32_to_cpus(&disk_hdr.nb_snapshots);
+       be64_to_cpus(&disk_hdr.snapshots_offset);
+
+       /* l2_bits is derived from cluster_bits; it is not read from the header */
+       *hdr = (struct qcow_header) {
+               .size                   = disk_hdr.size,
+               .l1_table_offset        = disk_hdr.l1_table_offset,
+               .l1_size                = disk_hdr.l1_size,
+               .cluster_bits           = disk_hdr.cluster_bits,
+               .l2_bits                = disk_hdr.cluster_bits - 3,
+               .refcount_table_offset  = disk_hdr.refcount_table_offset,
+               .refcount_table_size    = disk_hdr.refcount_table_clusters,
+       };
+
+       return hdr;
+
+err_free:
+       free(hdr);
+       return NULL;
+}
+
+/*
+ * Set up a disk image for a QCOW2 file: read the header, the L1 table and
+ * the refcount table, then create the disk_image with read-only or writable
+ * operations depending on @readonly.
+ *
+ * Returns the new disk image (its priv points at the qcow descriptor), or
+ * NULL on failure, in which case everything allocated here is released.
+ */
+static struct disk_image *qcow2_probe(int fd, bool readonly)
+{
+       struct disk_image *disk_image;
+       struct qcow_l1_table *l1t;
+       struct qcow_header *h;
+       struct qcow *q;
+
+       q = calloc(1, sizeof(struct qcow));
+       if (!q)
+               goto error;
+
+       mutex_init(&q->mutex);
+       q->fd = fd;
+
+       l1t = &q->table;
+
+       l1t->root = RB_ROOT;
+       INIT_LIST_HEAD(&l1t->lru_list);
+
+       h = q->header = qcow2_read_header(fd);
+       if (!h)
+               goto error;
+
+       if (qcow_read_l1_table(q) < 0)
+               goto error;
+
+       if (qcow_read_refcount_table(q) < 0)
+               goto error;
+
+       /*
+        * Do not use mmap use read/write instead
+        */
+       if (readonly)
+               disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_NOMMAP);
+       else
+               disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_NOMMAP);
+
+       if (!disk_image)
+               goto error;
+       disk_image->priv = q;
+
+       return disk_image;
+error:
+       if (!q)
+               return NULL;
+
+       /*
+        * q was zero-initialised by calloc(), so members that were never
+        * allocated are still NULL and free() ignores them. The refcount
+        * table must be released here too, otherwise it leaks whenever
+        * reading it or creating the disk image fails.
+        */
+       free(q->refcount_table.rf_table);
+       free(q->table.l1_table);
+       free(q->header);
+       free(q);
+
+       return NULL;
+}
+
+/*
+ * Return true when @fd starts with a header carrying the QCOW magic and the
+ * QCOW2 version number; false otherwise or if the header cannot be read.
+ */
+static bool qcow2_check_image(int fd)
+{
+       struct qcow2_header_disk disk_hdr;
+
+       if (pread_in_full(fd, &disk_hdr, sizeof(struct qcow2_header_disk), 0) < 0)
+               return false;
+
+       /* Both fields are stored big-endian on disk */
+       be32_to_cpus(&disk_hdr.magic);
+       be32_to_cpus(&disk_hdr.version);
+
+       return disk_hdr.magic == QCOW_MAGIC && disk_hdr.version == QCOW2_VERSION;
+}
+
+/*
+ * Read the on-disk QCOW1 header from the start of @fd, convert its fields
+ * from big-endian, and return a newly allocated struct qcow_header.
+ *
+ * The header stores no L1 size here; it is computed as the number of L1
+ * entries needed to cover the whole image, rounded up so that a trailing
+ * partial entry is still addressable.
+ *
+ * Returns NULL on allocation or read failure, or when l2_bits + cluster_bits
+ * is too large to be a valid shift.
+ */
+static void *qcow1_read_header(int fd)
+{
+       struct qcow1_header_disk f_header;
+       struct qcow_header *header;
+       u64 l1_shift;
+       u64 l1_size;
+
+       header = malloc(sizeof(struct qcow_header));
+       if (!header)
+               return NULL;
+
+       if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) {
+               free(header);
+               return NULL;
+       }
+
+       be32_to_cpus(&f_header.magic);
+       be32_to_cpus(&f_header.version);
+       be64_to_cpus(&f_header.backing_file_offset);
+       be32_to_cpus(&f_header.backing_file_size);
+       be32_to_cpus(&f_header.mtime);
+       be64_to_cpus(&f_header.size);
+       be32_to_cpus(&f_header.crypt_method);
+       be64_to_cpus(&f_header.l1_table_offset);
+
+       /* One L1 entry covers 2^(l2_bits + cluster_bits) bytes of the image */
+       l1_shift = (u64)f_header.l2_bits + (u64)f_header.cluster_bits;
+       if (l1_shift >= 64) {
+               free(header);
+               return NULL;
+       }
+
+       /* Round up without risking overflow in "size + span - 1" */
+       l1_size = f_header.size >> l1_shift;
+       if (f_header.size & ((1ULL << l1_shift) - 1))
+               l1_size++;
+
+       *header         = (struct qcow_header) {
+               .size                   = f_header.size,
+               .l1_table_offset        = f_header.l1_table_offset,
+               .l1_size                = l1_size,
+               .cluster_bits           = f_header.cluster_bits,
+               .l2_bits                = f_header.l2_bits,
+       };
+
+       return header;
+}
+
+/*
+ * Set up a disk image for a QCOW1 file: read the header and the L1 table,
+ * then create the disk_image with read-only or writable operations depending
+ * on @readonly. Returns NULL on failure after releasing what was allocated.
+ */
+static struct disk_image *qcow1_probe(int fd, bool readonly)
+{
+       struct disk_image_operations *ops;
+       struct disk_image *image;
+       struct qcow *q;
+
+       q = calloc(1, sizeof(struct qcow));
+       if (!q)
+               return NULL;
+
+       mutex_init(&q->mutex);
+       q->fd = fd;
+
+       /* Empty L2 table cache */
+       q->table.root = RB_ROOT;
+       INIT_LIST_HEAD(&q->table.lru_list);
+
+       q->header = qcow1_read_header(fd);
+       if (!q->header)
+               goto err_free;
+
+       if (qcow_read_l1_table(q) < 0)
+               goto err_free;
+
+       /*
+        * Do not use mmap use read/write instead
+        */
+       ops = readonly ? &qcow_disk_readonly_ops : &qcow_disk_ops;
+
+       image = disk_image__new(fd, q->header->size, ops, DISK_IMAGE_NOMMAP);
+       if (!image)
+               goto err_free;
+
+       image->priv = q;
+
+       return image;
+
+err_free:
+       free(q->table.l1_table);
+       free(q->header);
+       free(q);
+
+       return NULL;
+}
+
+/*
+ * Return true when @fd starts with a header carrying the QCOW magic and the
+ * QCOW1 version number; false otherwise or if the header cannot be read.
+ */
+static bool qcow1_check_image(int fd)
+{
+       struct qcow1_header_disk disk_hdr;
+
+       if (pread_in_full(fd, &disk_hdr, sizeof(struct qcow1_header_disk), 0) < 0)
+               return false;
+
+       /* Both fields are stored big-endian on disk */
+       be32_to_cpus(&disk_hdr.magic);
+       be32_to_cpus(&disk_hdr.version);
+
+       return disk_hdr.magic == QCOW_MAGIC && disk_hdr.version == QCOW1_VERSION;
+}
+
+/*
+ * Probe @fd as a QCOW image, trying version 1 first and then version 2.
+ * Returns the disk image set up by the matching probe, or NULL when the file
+ * is neither.
+ */
+struct disk_image *qcow_probe(int fd, bool readonly)
+{
+       struct disk_image *image = NULL;
+
+       if (qcow1_check_image(fd))
+               image = qcow1_probe(fd, readonly);
+       else if (qcow2_check_image(fd))
+               image = qcow2_probe(fd, readonly);
+
+       return image;
+}
diff --git a/tools/kvm/disk/raw.c b/tools/kvm/disk/raw.c
new file mode 100644 (file)
index 0000000..7f3f8db
--- /dev/null
@@ -0,0 +1,82 @@
+#include "kvm/disk-image.h"
+
+/* Vectored read from the raw image file, starting at the byte offset of @sector */
+ssize_t raw_image__read_sector_iov(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount)
+{
+       return preadv_in_full(disk->fd, iov, iovcount, sector << SECTOR_SHIFT);
+}
+
+/* Vectored write to the raw image file, starting at the byte offset of @sector */
+ssize_t raw_image__write_sector_iov(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount)
+{
+       return pwritev_in_full(disk->fd, iov, iovcount, sector << SECTOR_SHIFT);
+}
+
+/*
+ * Copy @dst_len bytes starting at @sector out of the memory pointed to by
+ * disk->priv into @dst. Returns @dst_len, or -1 when the requested range does
+ * not lie entirely within disk->size.
+ */
+ssize_t raw_image__read_sector(struct disk_image *disk, u64 sector, void *dst, u32 dst_len)
+{
+       u64 offset = sector << SECTOR_SHIFT;
+
+       /* Reject sector numbers whose byte offset does not fit in 64 bits */
+       if ((offset >> SECTOR_SHIFT) != sector)
+               return -1;
+
+       /* Written so that "offset + dst_len" can never wrap around */
+       if (offset > disk->size || dst_len > disk->size - offset)
+               return -1;
+
+       memcpy(dst, disk->priv + offset, dst_len);
+
+       return dst_len;
+}
+
+/*
+ * Copy @src_len bytes from @src into the memory pointed to by disk->priv,
+ * starting at @sector. Returns @src_len, or -1 when the requested range does
+ * not lie entirely within disk->size.
+ */
+ssize_t raw_image__write_sector(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+       u64 offset = sector << SECTOR_SHIFT;
+
+       /* Reject sector numbers whose byte offset does not fit in 64 bits */
+       if ((offset >> SECTOR_SHIFT) != sector)
+               return -1;
+
+       /* Written so that "offset + src_len" can never wrap around */
+       if (offset > disk->size || src_len > disk->size - offset)
+               return -1;
+
+       memcpy(disk->priv + offset, src, src_len);
+
+       return src_len;
+}
+
+/*
+ * Unmap the image memory unless the mapping never succeeded. Returns the
+ * result of munmap(), or 0 when there was nothing to unmap.
+ */
+int raw_image__close(struct disk_image *disk)
+{
+       if (disk->priv == MAP_FAILED)
+               return 0;
+
+       return munmap(disk->priv, disk->size);
+}
+
+/*
+ * multiple buffer based disk image operations
+ *
+ * These go straight to the file descriptor with vectored I/O;
+ * raw_image__probe() selects them for writable images.
+ */
+static struct disk_image_operations raw_image_iov_ops = {
+       .read_sector_iov        = raw_image__read_sector_iov,
+       .write_sector_iov       = raw_image__write_sector_iov,
+};
+
+/*
+ * single buffer based disk image operations
+ *
+ * These copy to and from the memory behind disk->priv;
+ * raw_image__probe() selects them for read-only images.
+ */
+static struct disk_image_operations raw_image_ops = {
+       .read_sector            = raw_image__read_sector,
+       .write_sector           = raw_image__write_sector,
+       .close                  = raw_image__close,
+};
+
+/*
+ * Create a disk image for a raw file of st->st_size bytes, picking the
+ * access method from @readonly.
+ */
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly)
+{
+       if (!readonly) {
+               /*
+                * Use read/write instead of mmap
+                */
+               return disk_image__new(fd, st->st_size, &raw_image_iov_ops, DISK_IMAGE_NOMMAP);
+       }
+
+       /*
+        * Use mmap's MAP_PRIVATE to implement non-persistent write
+        * FIXME: This does not work on 32-bit host.
+        */
+       return disk_image__new(fd, st->st_size, &raw_image_ops, DISK_IMAGE_MMAP);
+}
diff --git a/tools/kvm/framebuffer.c b/tools/kvm/framebuffer.c
new file mode 100644 (file)
index 0000000..b6eb1ac
--- /dev/null
@@ -0,0 +1,68 @@
+#include "kvm/framebuffer.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+static LIST_HEAD(framebuffers);
+
+/* Add @fb to the global list of framebuffers and hand the same pointer back */
+struct framebuffer *fb__register(struct framebuffer *fb)
+{
+       struct list_head *link = &fb->node;
+
+       INIT_LIST_HEAD(link);
+       list_add(link, &framebuffers);
+
+       return fb;
+}
+
+/*
+ * Append @ops to the targets of @fb. Returns 0 on success, or -1 when the
+ * framebuffer already has FB_MAX_TARGETS targets.
+ */
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops)
+{
+       if (fb->nr_targets < FB_MAX_TARGETS) {
+               fb->targets[fb->nr_targets] = ops;
+               fb->nr_targets++;
+
+               return 0;
+       }
+
+       return -1;
+}
+
+/*
+ * Call the start hook of every target attached to @fb, in attach order.
+ * Targets without a start hook are skipped. Stops at, and returns, the first
+ * non-zero result; returns 0 when every hook succeeded.
+ */
+static int start_targets(struct framebuffer *fb)
+{
+       unsigned long idx = 0;
+
+       while (idx < fb->nr_targets) {
+               struct fb_target_operations *target = fb->targets[idx++];
+               int err;
+
+               if (!target->start)
+                       continue;
+
+               err = target->start(fb);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Start the targets of every registered framebuffer. Stops at, and returns,
+ * the first non-zero result; returns 0 when everything started.
+ */
+int fb__start(void)
+{
+       struct framebuffer *fb;
+       int err = 0;
+
+       list_for_each_entry(fb, &framebuffers, node) {
+               err = start_targets(fb);
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
+/* Unmap the memory of every registered framebuffer */
+void fb__stop(void)
+{
+       struct framebuffer *cur;
+
+       list_for_each_entry(cur, &framebuffers, node)
+               munmap(cur->mem, cur->mem_size);
+}
diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c
new file mode 100644 (file)
index 0000000..7733026
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * This is a simple init for shared rootfs guests. It brings up critical
+ * mountpoints and then launches /bin/sh.
+ */
+#include <sys/mount.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+
+/*
+ * Replace the current process image with @filename, passing the file name as
+ * the only argument and an empty environment. Returns only if execve() fails.
+ */
+static int run_process(char *filename)
+{
+       char *args[2];
+       char *env[1];
+
+       args[0] = filename;
+       args[1] = NULL;
+       env[0]  = NULL;
+
+       return execve(filename, args, env);
+}
+
+/*
+ * Mount the host share and the kernel pseudo filesystems, in order. The
+ * result of each mount() is not checked.
+ */
+static void do_mounts(void)
+{
+       static const struct {
+               const char      *source;
+               const char      *target;
+               const char      *fstype;
+               unsigned long   flags;
+               const void      *data;
+       } mounts[] = {
+               { "hostfs",     "/host",        "9p",           MS_RDONLY,      "trans=virtio,version=9p2000.L" },
+               { "",           "/sys",         "sysfs",        0,              NULL },
+               { "proc",       "/proc",        "proc",         0,              NULL },
+               { "devtmpfs",   "/dev",         "devtmpfs",     0,              NULL },
+       };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(mounts) / sizeof(mounts[0]); i++)
+               mount(mounts[i].source, mounts[i].target, mounts[i].fstype,
+                     mounts[i].flags, mounts[i].data);
+}
+
+/*
+ * Guest init: mount the basic filesystems, run the network setup script and
+ * then become /bin/sh. Control only comes back here if starting the shell
+ * failed, which is reported as a failure exit status.
+ */
+int main(int argc, char *argv[])
+{
+       puts("Mounting...");
+
+       do_mounts();
+
+       puts("Setting up network...");
+
+       /* Best effort: the shell is started even if network setup fails */
+       system("/bin/sh virt/setnet.sh");
+
+       puts("Starting '/bin/sh'...");
+
+       run_process("/bin/sh");
+
+       /* execve() returns only on error, with errno describing the failure */
+       printf("Init failed: %s\n", strerror(errno));
+
+       return EXIT_FAILURE;
+}
diff --git a/tools/kvm/guest/setnet.sh b/tools/kvm/guest/setnet.sh
new file mode 100755 (executable)
index 0000000..3da9c22
--- /dev/null
@@ -0,0 +1,22 @@
+for f in /sys/class/net/*; do
+       type=`cat $f/type`
+       if [ $type -eq 1 ]; then
+               f=${f#/sys/class/net/}
+
+               eval "dhcpcd -A $f 2> /dev/null"
+               if [ $? -eq 0 ]; then
+                       exit
+               fi
+
+               eval "dhclient $f 2> /dev/null"
+               if [ $? -eq 0 ]; then
+                       exit
+               fi
+
+               ifconfig $f 192.168.33.15
+               route add default 192.168.33.1
+               echo "nameserver 8.8.8.8" >> /etc/resolv.conf
+
+               exit
+       fi
+done
diff --git a/tools/kvm/guest_compat.c b/tools/kvm/guest_compat.c
new file mode 100644 (file)
index 0000000..c5bacb8
--- /dev/null
@@ -0,0 +1,104 @@
+#include "kvm/guest_compat.h"
+
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+/* One queued compatibility warning */
+struct compat_message {
+       int id;                 /* handle returned by compat__add_message() */
+       char *title;            /* heap copy made with strdup() */
+       char *desc;             /* heap copy made with strdup() */
+
+       struct list_head list;  /* link in the 'messages' list */
+};
+
+/* Next message id to hand out */
+static int id;
+/* Taken by every function in this file that touches 'messages' */
+static DEFINE_MUTEX(compat_mtx);
+/* Pending messages, oldest first (new entries are added at the tail) */
+static LIST_HEAD(messages);
+
+/*
+ * Queue a compatibility warning made of copies of @title and @desc.
+ *
+ * Returns the id of the new message (usable with compat__remove_message()),
+ * or -ENOMEM if any allocation fails.
+ */
+int compat__add_message(const char *title, const char *desc)
+{
+       struct compat_message *msg;
+       int msg_id;
+
+       mutex_lock(&compat_mtx);
+       msg = malloc(sizeof(*msg));
+       if (msg == NULL)
+               goto cleanup;
+
+       *msg = (struct compat_message) {
+               .id = id,
+               .title = strdup(title),
+               .desc = strdup(desc),
+       };
+
+       if (msg->title == NULL || msg->desc == NULL)
+               goto cleanup;
+
+       list_add_tail(&msg->list, &messages);
+
+       /*
+        * Consume the id while the mutex is still held. Incrementing it after
+        * unlocking would let two concurrent callers read the same value and
+        * hand out duplicate ids.
+        */
+       msg_id = id++;
+
+       mutex_unlock(&compat_mtx);
+
+       return msg_id;
+
+cleanup:
+       if (msg) {
+               free(msg->title);
+               free(msg->desc);
+               free(msg);
+       }
+
+       mutex_unlock(&compat_mtx);
+
+       return -ENOMEM;
+}
+
+/* Release a message together with the strings it owns */
+static void compat__free(struct compat_message *msg)
+{
+       char *title = msg->title;
+       char *desc = msg->desc;
+
+       free(title);
+       free(desc);
+       free(msg);
+}
+
+/*
+ * Remove and free the queued message whose id equals @id. Returns 0 when a
+ * message was removed, or -ENOENT when no message has that id.
+ */
+int compat__remove_message(int id)
+{
+       struct compat_message *cur, *tmp;
+       int ret = -ENOENT;
+
+       mutex_lock(&compat_mtx);
+
+       list_for_each_entry_safe(cur, tmp, &messages, list) {
+               if (cur->id != id)
+                       continue;
+
+               /* Only the first match is removed */
+               list_del(&cur->list);
+               compat__free(cur);
+               ret = 0;
+               break;
+       }
+
+       mutex_unlock(&compat_mtx);
+
+       return ret;
+}
+
+/*
+ * Print every queued message in queue order, freeing each one after it has
+ * been printed, so the queue is empty afterwards. Always returns 0.
+ */
+int compat__print_all_messages(void)
+{
+       struct compat_message *cur, *tmp;
+
+       mutex_lock(&compat_mtx);
+
+       list_for_each_entry_safe(cur, tmp, &messages, list) {
+               printf("\n\n*** Compatability Warning ***\n\n\t%s\n\n%s\n",
+                       cur->title, cur->desc);
+
+               list_del(&cur->list);
+               compat__free(cur);
+       }
+
+       mutex_unlock(&compat_mtx);
+
+       return 0;
+}
\ No newline at end of file
diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c
new file mode 100644 (file)
index 0000000..3a36425
--- /dev/null
@@ -0,0 +1,348 @@
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/i8042.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdint.h>
+
+/*
+ * IRQs
+ */
+#define KBD_IRQ                        1
+#define AUX_IRQ                        12
+
+/*
+ * Registers
+ */
+#define I8042_DATA_REG         0x60
+#define I8042_COMMAND_REG      0x64
+
+/*
+ * Commands
+ */
+#define I8042_CMD_CTL_RCTR     0x20
+#define I8042_CMD_CTL_WCTR     0x60
+#define I8042_CMD_AUX_LOOP     0xD3
+#define I8042_CMD_AUX_SEND     0xD4
+#define I8042_CMD_AUX_TEST     0xA9
+#define I8042_CMD_AUX_DISABLE  0xA7
+#define I8042_CMD_AUX_ENABLE   0xA8
+#define I8042_CMD_SYSTEM_RESET 0xFE
+
+#define RESPONSE_ACK           0xFA
+
+#define MODE_DISABLE_AUX       0x20
+
+#define AUX_ENABLE_REPORTING   0x20
+#define AUX_SCALING_FLAG       0x10
+#define AUX_DEFAULT_RESOLUTION 0x2
+#define AUX_DEFAULT_SAMPLE     100
+
+/*
+ * Status register bits
+ */
+#define I8042_STR_AUXDATA      0x20
+#define I8042_STR_KEYLOCK      0x10
+#define I8042_STR_CMDDAT       0x08
+#define I8042_STR_MUXERR       0x04
+#define I8042_STR_OBF          0x01
+
+#define KBD_MODE_KBD_INT       0x01
+#define KBD_MODE_SYS           0x02
+
+#define QUEUE_SIZE             128
+
+/*
+ * This represents the current state of the PS/2 keyboard system,
+ * including the AUX device (the mouse)
+ *
+ * The read/write indexes only ever grow; users reduce them modulo QUEUE_SIZE
+ * when indexing the queues.
+ */
+struct kbd_state {
+       struct kvm              *kvm;           /* VM whose IRQ lines are driven */
+
+       char                    kq[QUEUE_SIZE]; /* Keyboard queue */
+       int                     kread, kwrite;  /* Indexes into the queue */
+       int                     kcount;         /* number of elements in queue */
+
+       char                    mq[QUEUE_SIZE]; /* Mouse queue */
+       int                     mread, mwrite;  /* Indexes into the mouse queue */
+       int                     mcount;         /* number of elements in mouse queue */
+
+       u8                      mstatus;        /* Mouse status byte */
+       u8                      mres;           /* Current mouse resolution */
+       u8                      msample;        /* Current mouse samples/second */
+
+       u8                      mode;           /* i8042 mode register */
+       u8                      status;         /* i8042 status register */
+       /*
+        * Some commands (on port 0x64) have arguments;
+        * we store the command here while we wait for the argument
+        */
+       u32                     write_cmd;
+};
+
+/* The single emulated controller instance */
+static struct kbd_state                state;
+
+/*
+ * Recompute the output-buffer status bits from the queue fill levels and
+ * drive both IRQ lines accordingly. Pending keyboard data wins over pending
+ * mouse data, so at most one of the two lines is raised.
+ */
+static void kbd_update_irq(void)
+{
+       u8 kbd_level = (state.kcount > 0) ? 1 : 0;
+       u8 aux_level = (!kbd_level && state.mcount != 0) ? 1 : 0;
+
+       /* Start from "no data available" and set only what applies */
+       state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA);
+
+       if (kbd_level)
+               state.status |= I8042_STR_OBF;
+       else if (aux_level)
+               state.status |= I8042_STR_OBF | I8042_STR_AUXDATA;
+
+       kvm__irq_line(state.kvm, KBD_IRQ, kbd_level);
+       kvm__irq_line(state.kvm, AUX_IRQ, aux_level);
+}
+
+/*
+ * Append one byte to the mouse queue and refresh the IRQ lines.
+ * The byte is silently dropped when the queue is already full.
+ */
+void mouse_queue(u8 c)
+{
+       int slot;
+
+       if (state.mcount >= QUEUE_SIZE)
+               return;
+
+       slot = state.mwrite % QUEUE_SIZE;
+       state.mwrite++;
+       state.mq[slot] = c;
+       state.mcount++;
+
+       kbd_update_irq();
+}
+
+/*
+ * Append one byte to the keyboard queue and refresh the IRQ lines.
+ * The byte is silently dropped when the queue is already full.
+ */
+void kbd_queue(u8 c)
+{
+       int slot;
+
+       if (state.kcount >= QUEUE_SIZE)
+               return;
+
+       slot = state.kwrite % QUEUE_SIZE;
+       state.kwrite++;
+       state.kq[slot] = c;
+       state.kcount++;
+
+       kbd_update_irq();
+}
+
+/*
+ * Handle a command byte written to the command port. Commands that take an
+ * argument are only remembered in state.write_cmd; the argument arrives later
+ * through the data port. Unknown commands are ignored.
+ */
+static void kbd_write_command(struct kvm *kvm, u8 val)
+{
+       switch (val) {
+       /* Commands that expect an argument byte on the data port */
+       case I8042_CMD_CTL_WCTR:
+       case I8042_CMD_AUX_SEND:
+       case I8042_CMD_AUX_LOOP:
+               state.write_cmd = val;
+               break;
+
+       /* Commands answered through one of the queues */
+       case I8042_CMD_CTL_RCTR:
+               kbd_queue(state.mode);
+               break;
+       case I8042_CMD_AUX_TEST:
+               /* 0 means we're a normal PS/2 mouse */
+               mouse_queue(0);
+               break;
+
+       /* Commands that only toggle the AUX-disable bit of the mode register */
+       case I8042_CMD_AUX_ENABLE:
+               state.mode &= ~MODE_DISABLE_AUX;
+               break;
+       case I8042_CMD_AUX_DISABLE:
+               state.mode |= MODE_DISABLE_AUX;
+               break;
+
+       case I8042_CMD_SYSTEM_RESET:
+               kvm_cpu__reboot();
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Called when the OS reads from port 0x60 (PS/2 data)
+ *
+ * Returns the next queued keyboard byte if there is one, otherwise the next
+ * queued mouse byte. When both queues are empty the keyboard byte that was
+ * read most recently is returned again. The result is always in 0..255.
+ */
+static u32 kbd_read_data(void)
+{
+       u32 ret;
+
+       if (state.kcount != 0) {
+               /* Keyboard data gets read first */
+               ret = (u8)state.kq[state.kread++ % QUEUE_SIZE];
+               state.kcount--;
+               kvm__irq_line(state.kvm, KBD_IRQ, 0);
+               kbd_update_irq();
+       } else if (state.mcount > 0) {
+               /* Followed by the mouse */
+               ret = (u8)state.mq[state.mread++ % QUEUE_SIZE];
+               state.mcount--;
+               kvm__irq_line(state.kvm, AUX_IRQ, 0);
+               kbd_update_irq();
+       } else {
+               /*
+                * Nothing queued: repeat the last keyboard byte. kread grows
+                * without bound, so the slot before it must be reduced modulo
+                * QUEUE_SIZE; adding QUEUE_SIZE first keeps the operand
+                * non-negative when kread is still 0.
+                */
+               int last = (state.kread + QUEUE_SIZE - 1) % QUEUE_SIZE;
+
+               ret = (u8)state.kq[last];
+       }
+
+       return ret;
+}
+
+/*
+ * Called when the OS reads from port 0x64, the command port: returns the
+ * current contents of the status register.
+ */
+static u32 kbd_read_status(void)
+{
+       u32 status = state.status;
+
+       return status;
+}
+
+/*
+ * Called when the OS writes to port 0x60 (data port)
+ * Things written here are generally arguments to commands previously
+ * written to port 0x64 and stored in state.write_cmd
+ *
+ * The pending command is cleared again before returning, so every data byte
+ * is interpreted against at most one preceding command byte.
+ */
+static void kbd_write_data(u32 val)
+{
+       switch (state.write_cmd) {
+       case I8042_CMD_CTL_WCTR:
+               /* Argument is the new mode register value */
+               state.mode = val;
+               kbd_update_irq();
+               break;
+       case I8042_CMD_AUX_LOOP:
+               /* Echo the byte back through the mouse queue, followed by an ACK */
+               mouse_queue(val);
+               mouse_queue(RESPONSE_ACK);
+               break;
+       case I8042_CMD_AUX_SEND:
+               /* The OS wants to send a command to the mouse */
+               mouse_queue(RESPONSE_ACK);
+               switch (val) {
+               case 0xe6:
+                       /* set scaling = 1:1 */
+                       state.mstatus &= ~AUX_SCALING_FLAG;
+                       break;
+               case 0xe8:
+                       /*
+                        * set resolution
+                        * NOTE(review): this stores the command byte itself
+                        * (0xe8), not a following argument - confirm.
+                        */
+                       state.mres = val;
+                       break;
+               case 0xe9:
+                       /* Report mouse status/config */
+                       mouse_queue(state.mstatus);
+                       mouse_queue(state.mres);
+                       mouse_queue(state.msample);
+                       break;
+               case 0xf2:
+                       /* send ID */
+                       mouse_queue(0); /* normal mouse */
+                       break;
+               case 0xf3:
+                       /*
+                        * set sample rate
+                        * NOTE(review): this stores the command byte itself
+                        * (0xf3), not a following argument - confirm.
+                        */
+                       state.msample = val;
+                       break;
+               case 0xf4:
+                       /* enable reporting */
+                       state.mstatus |= AUX_ENABLE_REPORTING;
+                       break;
+               case 0xf5:
+                       /* disable reporting */
+                       state.mstatus &= ~AUX_ENABLE_REPORTING;
+                       break;
+               case 0xf6:
+                       /* set defaults, just fall through to reset */
+               case 0xff:
+                       /* reset */
+                       state.mstatus = 0x0;
+                       state.mres = AUX_DEFAULT_RESOLUTION;
+                       state.msample = AUX_DEFAULT_SAMPLE;
+                       break;
+               default:
+                       break;
+       }
+       break;
+       case 0:
+               /*
+                * Just send the ID
+                * (no command pending: every byte written gets this reply,
+                * whatever its value)
+                */
+               kbd_queue(RESPONSE_ACK);
+               kbd_queue(0xab);
+               kbd_queue(0x41);
+               kbd_update_irq();
+               break;
+       default:
+               /* Yeah whatever */
+               break;
+       }
+       state.write_cmd = 0;
+}
+
+/*
+ * Put the controller into its power-on state. Every field not listed below,
+ * including the kvm pointer and both queues, is zeroed.
+ */
+static void kbd_reset(void)
+{
+       const struct kbd_state power_on = {
+               .status         = I8042_STR_MUXERR | I8042_STR_CMDDAT | I8042_STR_KEYLOCK, /* 0x1c */
+               .mode           = KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */
+               .mres           = AUX_DEFAULT_RESOLUTION,
+               .msample        = AUX_DEFAULT_SAMPLE,
+       };
+
+       state = power_on;
+}
+
+/*
+ * Called when the OS reads from one of the keyboard's ports (0x60 or 0x64)
+ *
+ * Both registers are one byte wide, so exactly one byte is stored into @data.
+ * Returns false for any other port.
+ */
+static bool kbd_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       switch (port) {
+       case I8042_COMMAND_REG: {
+               u8 value = kbd_read_status();
+               ioport__write8(data, value);
+               break;
+       }
+       case I8042_DATA_REG: {
+               /*
+                * Store a single byte: a 32-bit store would write past a
+                * one-byte access in the I/O buffer.
+                */
+               u8 value = kbd_read_data();
+               ioport__write8(data, value);
+               break;
+       }
+       default:
+               return false;
+       }
+
+       return true;
+}
+
+/*
+ * Called when the OS writes to one of the keyboard's ports (0x60 or 0x64)
+ *
+ * Both registers are one byte wide, so exactly one byte is taken from @data.
+ * Returns false for any other port.
+ */
+static bool kbd_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       switch (port) {
+       case I8042_COMMAND_REG: {
+               u8 value = ioport__read8(data);
+               kbd_write_command(kvm, value);
+               break;
+       }
+       case I8042_DATA_REG: {
+               /*
+                * Load a single byte: a 32-bit load would pick up three bytes
+                * that are not part of a one-byte write, and the command
+                * matching in kbd_write_data() compares the whole value.
+                */
+               u8 value = ioport__read8(data);
+               kbd_write_data(value);
+               break;
+       }
+       default:
+               return false;
+       }
+
+       return true;
+}
+
+/* Port I/O handlers shared by the data and command registers */
+static struct ioport_operations kbd_ops = {
+       .io_in          = kbd_in,
+       .io_out         = kbd_out,
+};
+
+/*
+ * Reset the emulated controller and register its I/O port handlers.
+ */
+void kbd__init(struct kvm *kvm)
+{
+       /* kbd_reset() overwrites the whole state, so the kvm pointer is set afterwards */
+       kbd_reset();
+       state.kvm = kvm;
+       /* NOTE(review): the third argument is presumably a port count, making each range two ports wide - confirm */
+       ioport__register(I8042_DATA_REG, &kbd_ops, 2, NULL);
+       ioport__register(I8042_COMMAND_REG, &kbd_ops, 2, NULL);
+}
diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c
new file mode 100644 (file)
index 0000000..2907a66
--- /dev/null
@@ -0,0 +1,264 @@
+#include "kvm/pci-shmem.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+#include "kvm/ioport.h"
+#include "kvm/ioeventfd.h"
+
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+static struct pci_device_header pci_shmem_pci_device = {
+       .vendor_id      = PCI_VENDOR_ID_REDHAT_QUMRANET,
+       .device_id      = 0x1110,
+       .header_type    = PCI_HEADER_TYPE_NORMAL,
+       .class          = 0xFF0000,     /* misc pci device */
+       .status         = PCI_STATUS_CAP_LIST,
+       .capabilities   = (void *)&pci_shmem_pci_device.msix - (void *)&pci_shmem_pci_device,
+       .msix.cap       = PCI_CAP_ID_MSIX,
+       .msix.ctrl      = 1,
+       .msix.table_offset = 1,         /* Use BAR 1 */
+       .msix.pba_offset = 0x1001,      /* Use BAR 1 */
+};
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+       INTRMASK = 0,
+       INTRSTATUS = 4,
+       IVPOSITION = 8,
+       DOORBELL = 12,
+};
+
+static struct shmem_info *shmem_region;
+static u16 ivshmem_registers;
+static int local_fd;
+static u32 local_id;
+static u64 msix_block;
+static u64 msix_pba;
+static struct msix_table msix_table[2];
+
+/*
+ * Remember @si as the shared memory region to expose to the guest.
+ *
+ * Only one region is supported: if one is already registered, @si is freed
+ * and ignored with a warning. Always returns 0.
+ */
+int pci_shmem__register_mem(struct shmem_info *si)
+{
+       if (shmem_region != NULL) {
+               pr_warning("only single shmem currently avail. ignoring.\n");
+               free(si);
+               return 0;
+       }
+
+       shmem_region = si;
+       return 0;
+}
+
+/*
+ * io_in handler for the ivshmem register block.
+ *
+ * Only IVPOSITION returns data (this guest's id); reads of INTRMASK,
+ * INTRSTATUS, DOORBELL and any other offset leave @data untouched.
+ * Always returns true.
+ */
+static bool shmem_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       u16 reg = port - ivshmem_registers;
+
+       if (reg == IVPOSITION)
+               ioport__write32(data, local_id);
+
+       return true;
+}
+
+/*
+ * io_out handler for the ivshmem register block.
+ *
+ * Writes to every register are accepted and ignored for now; the register
+ * list is kept as a placeholder. Always returns true.
+ */
+static bool shmem_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       u16 reg = port - ivshmem_registers;
+
+       switch (reg) {
+       case INTRMASK:
+       case INTRSTATUS:
+       case IVPOSITION:
+       case DOORBELL:
+       default:
+               break;
+       }
+
+       return true;
+}
+
+static struct ioport_operations shmem_pci__io_ops = {
+       .io_in  = shmem_pci__io_in,
+       .io_out = shmem_pci__io_out,
+};
+
+/*
+ * MMIO handler for the MSI-X region starting at msix_block.
+ *
+ * Offsets below 0x1000 address the MSI-X vector table, offsets from 0x1000
+ * upwards address the pending bit array. Each access is rebased onto the
+ * object that backs it and bounds-checked against that object's size, so a
+ * guest can neither read nor write host memory outside msix_table/msix_pba.
+ * Out-of-range writes are dropped and out-of-range reads return zeroes.
+ *
+ * @addr:     guest physical address of the access
+ * @data:     buffer holding the value written, or receiving the value read
+ * @len:      access size in bytes
+ * @is_write: non-zero for a guest write, zero for a guest read
+ * @ptr:      unused
+ */
+static void callback_mmio_msix(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+       u64 offset = addr - msix_block;
+       size_t limit;
+       u8 *mem;
+
+       if (offset < 0x1000) {
+               mem     = (u8 *)msix_table;
+               limit   = sizeof(msix_table);
+       } else {
+               /* The PBA lives 0x1000 bytes into the region */
+               offset  -= 0x1000;
+               mem     = (u8 *)&msix_pba;
+               limit   = sizeof(msix_pba);
+       }
+
+       /* Reject anything that does not fit entirely inside the backing object */
+       if (offset >= limit || len > limit - offset) {
+               if (!is_write)
+                       memset(data, 0, len);
+               return;
+       }
+
+       if (is_write)
+               memcpy(mem + offset, data, len);
+       else
+               memcpy(data, mem + offset, len);
+}
+
+/*
+ * Return an irqfd which can be used by other guests to signal this guest
+ * whenever they need to poke it.
+ *
+ * The eventfd is created and bound to the device's interrupt on the first
+ * call and cached in local_fd; later calls return the cached descriptor.
+ * The interrupt is an MSI-X route built from vector 0 when MSI-X is enabled
+ * in the device's control word, the legacy IRQ line otherwise.
+ *
+ * Returns the descriptor, or a negative value if creating the eventfd or
+ * the KVM_IRQFD ioctl failed.
+ */
+int pci_shmem__get_local_irqfd(struct kvm *kvm)
+{
+       int fd, gsi, r;
+       struct kvm_irqfd irqfd;
+
+       if (local_fd == 0) {
+               fd = eventfd(0, 0);
+               if (fd < 0)
+                       return fd;
+
+               if (pci_shmem_pci_device.msix.ctrl & PCI_MSIX_FLAGS_ENABLE) {
+                       gsi = irq__add_msix_route(kvm,
+                                 msix_table[0].low,
+                                 msix_table[0].high,
+                                 msix_table[0].data);
+               } else {
+                       gsi = pci_shmem_pci_device.irq_line;
+               }
+
+               irqfd = (struct kvm_irqfd) {
+                       .fd = fd,
+                       .gsi = gsi,
+               };
+
+               r = ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd);
+               if (r < 0) {
+                       /* Nothing caches fd on this path: close it or it leaks */
+                       close(fd);
+                       return r;
+               }
+
+               local_fd = fd;
+       }
+
+       return local_fd;
+}
+
+/*
+ * Connect a new client to ivshmem: register @fd as an ioeventfd that fires
+ * when the guest writes the 32-bit value @id to the DOORBELL port.
+ *
+ * Returns the result of the KVM_IOEVENTFD ioctl.
+ */
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd)
+{
+       struct kvm_ioeventfd doorbell = {
+               .flags          = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH,
+               .fd             = fd,
+               .datamatch      = id,
+               .len            = sizeof(u32),
+               .addr           = ivshmem_registers + DOORBELL,
+       };
+
+       return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &doorbell);
+}
+
+/*
+ * Disconnect a client from ivshmem: deassign the ioeventfd that matches the
+ * 32-bit value @id on the DOORBELL port.
+ *
+ * Returns the result of the KVM_IOEVENTFD ioctl.
+ */
+int pci_shmem__remove_client(struct kvm *kvm, u32 id)
+{
+       struct kvm_ioeventfd doorbell = {
+               .flags          = KVM_IOEVENTFD_FLAG_PIO
+                               | KVM_IOEVENTFD_FLAG_DATAMATCH
+                               | KVM_IOEVENTFD_FLAG_DEASSIGN,
+               .datamatch      = id,
+               .len            = sizeof(u32),
+               .addr           = ivshmem_registers + DOORBELL,
+       };
+
+       return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &doorbell);
+}
+
+/*
+ * Open the POSIX shared memory object @key and map @len bytes of it
+ * read/write and shared.
+ *
+ * @key:      name handed to shm_open()
+ * @len:      number of bytes to map (and to size the object to when creating)
+ * @creating: non-zero to create the object if needed and resize it to @len
+ *
+ * Returns the mapping, or NULL if the object could not be opened, resized
+ * or mapped.
+ */
+static void *setup_shmem(const char *key, size_t len, int creating)
+{
+       int fd;
+       void *mem;
+       int flag = O_RDWR;
+
+       if (creating)
+               flag |= O_CREAT;
+
+       fd = shm_open(key, flag, S_IRUSR | S_IWUSR);
+       if (fd < 0) {
+               pr_warning("Failed to open shared memory file %s\n", key);
+               return NULL;
+       }
+
+       if (creating && ftruncate(fd, (off_t) len) < 0) {
+               /*
+                * Mapping past the end of an undersized object would fault
+                * on first access, so give up instead of carrying on.
+                */
+               pr_warning("Can't ftruncate(fd,%zu)\n", len);
+               close(fd);
+               return NULL;
+       }
+
+       mem = mmap(NULL, len,
+                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0);
+       /* The mapping stays valid once the descriptor is closed */
+       close(fd);
+
+       /* mmap() signals failure with MAP_FAILED, not NULL */
+       if (mem == MAP_FAILED) {
+               pr_warning("Failed to mmap shared memory file");
+               return NULL;
+       }
+
+       return mem;
+}
+
+/*
+ * Set up the ivshmem PCI device, if a shared memory region was registered
+ * with pci_shmem__register_mem().
+ *
+ * Returns 1 when the device is registered and the shared memory is plugged
+ * into the guest. Returns 0 when no region was registered, when no IRQ
+ * could be allocated, or when the shared memory could not be set up.
+ */
+int pci_shmem__init(struct kvm *kvm)
+{
+       u8 dev, line, pin;
+       char *mem;
+
+       if (shmem_region == 0)
+               return 0;
+
+       /* Register good old INTx */
+       if (irq__register_device(PCI_DEVICE_ID_PCI_SHMEM, &dev, &pin, &line) < 0)
+               return 0;
+
+       pci_shmem_pci_device.irq_pin = pin;
+       pci_shmem_pci_device.irq_line = line;
+
+       /* Register the I/O ports that back the ivshmem registers */
+       ivshmem_registers = ioport__register(IOPORT_EMPTY, &shmem_pci__io_ops, IOPORT_SIZE, NULL);
+
+       /* Register MMIO space for MSI-X */
+       msix_block = pci_get_io_space_block(0x1010);
+       kvm__register_mmio(kvm, msix_block, 0x1010, callback_mmio_msix, NULL);
+
+       /*
+        * This registers 3 BARs:
+        *
+        * 0 - ivshmem registers
+        * 1 - MSI-X MMIO space
+        * 2 - Shared memory block
+        */
+       pci_shmem_pci_device.bar[0] = ivshmem_registers | PCI_BASE_ADDRESS_SPACE_IO;
+       /* BAR 0 spans the port range registered above, not the shared memory */
+       pci_shmem_pci_device.bar_size[0] = IOPORT_SIZE;
+       pci_shmem_pci_device.bar[1] = msix_block | PCI_BASE_ADDRESS_SPACE_MEMORY;
+       pci_shmem_pci_device.bar_size[1] = 0x1010;
+       pci_shmem_pci_device.bar[2] = shmem_region->phys_addr | PCI_BASE_ADDRESS_SPACE_MEMORY;
+       pci_shmem_pci_device.bar_size[2] = shmem_region->size;
+
+       pci__register(&pci_shmem_pci_device, dev);
+
+       /* Open shared memory and plug it into the guest */
+       mem = setup_shmem(shmem_region->handle, shmem_region->size,
+                               shmem_region->create);
+       if (mem == NULL)
+               return 0;
+       kvm__register_mem(kvm, shmem_region->phys_addr, shmem_region->size,
+                         mem);
+       return 1;
+}
diff --git a/tools/kvm/hw/rtc.c b/tools/kvm/hw/rtc.c
new file mode 100644 (file)
index 0000000..c6879cc
--- /dev/null
@@ -0,0 +1,87 @@
+#include "kvm/rtc.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+
+#include <time.h>
+
+static u8 cmos_index;
+
+#define CMOS_RTC_SECONDS               0x00
+#define CMOS_RTC_MINUTES               0x02
+#define CMOS_RTC_HOURS                 0x04
+#define CMOS_RTC_DATE_OF_MONTH         0x07
+#define CMOS_RTC_MONTH                 0x08
+#define CMOS_RTC_YEAR                  0x09
+
+/*
+ * Encode a binary value as packed BCD: the tens go into the high nibble and
+ * the units into the low nibble. The result is truncated to one byte, so
+ * only inputs in the range 0-99 encode meaningfully.
+ */
+static inline unsigned char bin2bcd(unsigned val)
+{
+       unsigned tens = val / 10;
+       unsigned units = val % 10;
+
+       return (tens << 4) | units;
+}
+
+/*
+ * io_in handler for the CMOS data port.
+ *
+ * Returns the RTC field selected by the last write to the index port, taken
+ * from the host's current time (broken down with gmtime()) and encoded as
+ * BCD. A read with any other index selected leaves @data untouched.
+ * Always returns true.
+ */
+static bool cmos_ram_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       struct tm *tm;
+       time_t ti;
+
+       time(&ti);
+
+       tm = gmtime(&ti);
+
+       switch (cmos_index) {
+       case CMOS_RTC_SECONDS:
+               ioport__write8(data, bin2bcd(tm->tm_sec));
+               break;
+       case CMOS_RTC_MINUTES:
+               ioport__write8(data, bin2bcd(tm->tm_min));
+               break;
+       case CMOS_RTC_HOURS:
+               ioport__write8(data, bin2bcd(tm->tm_hour));
+               break;
+       case CMOS_RTC_DATE_OF_MONTH:
+               ioport__write8(data, bin2bcd(tm->tm_mday));
+               break;
+       case CMOS_RTC_MONTH:
+               /* tm_mon is 0-based, the register is 1-based */
+               ioport__write8(data, bin2bcd(tm->tm_mon + 1));
+               break;
+       case CMOS_RTC_YEAR:
+               /*
+                * tm_year counts years since 1900 and is above 99 from 2000
+                * on. One BCD byte holds two decimal digits, so only the
+                * year within the century is reported.
+                */
+               ioport__write8(data, bin2bcd(tm->tm_year % 100));
+               break;
+       }
+
+       return true;
+}
+
+static bool cmos_ram_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static struct ioport_operations cmos_ram_data_ioport_ops = {
+       .io_out         = cmos_ram_data_out,
+       .io_in          = cmos_ram_data_in,
+};
+
+/*
+ * io_out handler for the CMOS index port.
+ *
+ * Bit 7 of the written byte is stored as the VM's NMI-disable flag; the
+ * remaining bits select the CMOS register used by the next data port read.
+ * Always returns true.
+ */
+static bool cmos_ram_index_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       u8 written = ioport__read8(data);
+
+       kvm->nmi_disabled = written & (1UL << 7);
+       cmos_index = written & ~(1UL << 7);
+
+       return true;
+}
+
+static struct ioport_operations cmos_ram_index_ioport_ops = {
+       .io_out         = cmos_ram_index_out,
+};
+
+/*
+ * Register the RTC I/O handlers: port 0x70 selects the CMOS register
+ * (write-only handler), port 0x71 transfers its data.
+ */
+void rtc__init(void)
+{
+       /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+       ioport__register(0x0070, &cmos_ram_index_ioport_ops, 1, NULL);
+       ioport__register(0x0071, &cmos_ram_data_ioport_ops, 1, NULL);
+}
diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c
new file mode 100644 (file)
index 0000000..b3b233f
--- /dev/null
@@ -0,0 +1,358 @@
+#include "kvm/8250-serial.h"
+
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+
+#include <linux/types.h>
+#include <linux/serial_reg.h>
+
+#include <pthread.h>
+
+struct serial8250_device {
+       pthread_mutex_t         mutex;
+
+       u16                     iobase;
+       u8                      irq;
+
+       u8                      rbr;            /* receive buffer */
+       u8                      dll;
+       u8                      dlm;
+       u8                      iir;
+       u8                      ier;
+       u8                      fcr;
+       u8                      lcr;
+       u8                      mcr;
+       u8                      lsr;
+       u8                      msr;
+       u8                      scr;
+};
+
+#define SERIAL_REGS_SETTING \
+       .iir                    = UART_IIR_NO_INT, \
+       .lsr                    = UART_LSR_TEMT | UART_LSR_THRE, \
+       .msr                    = UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \
+       .mcr                    = UART_MCR_OUT2,
+
+static struct serial8250_device devices[] = {
+       /* ttyS0 */
+       [0]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .iobase                 = 0x3f8,
+               .irq                    = 4,
+
+               SERIAL_REGS_SETTING
+       },
+       /* ttyS1 */
+       [1]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .iobase                 = 0x2f8,
+               .irq                    = 3,
+
+               SERIAL_REGS_SETTING
+       },
+       /* ttyS2 */
+       [2]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .iobase                 = 0x3e8,
+               .irq                    = 4,
+
+               SERIAL_REGS_SETTING
+       },
+       /* ttyS3 */
+       [3]     = {
+               .mutex                  = PTHREAD_MUTEX_INITIALIZER,
+
+               .iobase                 = 0x2e8,
+               .irq                    = 3,
+
+               SERIAL_REGS_SETTING
+       },
+};
+
+#define SYSRQ_PENDING_NONE             0
+#define SYSRQ_PENDING_BREAK            1
+#define SYSRQ_PENDING_CMD              2
+
+static int sysrq_pending;
+
+/*
+ * Advance the pending sysrq sequence by one step.
+ *
+ * The first step flags a break condition with data ready, the second step
+ * delivers the character 'p' and ends the sequence. Nothing happens when no
+ * sysrq is pending.
+ */
+static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev)
+{
+       int step = sysrq_pending;
+
+       if (step == SYSRQ_PENDING_BREAK) {
+               dev->lsr |= UART_LSR_DR | UART_LSR_BI;
+               sysrq_pending = SYSRQ_PENDING_CMD;
+       } else if (step == SYSRQ_PENDING_CMD) {
+               dev->rbr = 'p';
+               dev->lsr |= UART_LSR_DR;
+               sysrq_pending = SYSRQ_PENDING_NONE;
+       }
+}
+
+/*
+ * Try to load one byte into the receive buffer of @dev.
+ *
+ * Nothing is done while the previous byte is still flagged as ready. A
+ * pending sysrq sequence takes precedence over terminal input; otherwise
+ * one character is fetched from the 8250 console if one is available.
+ */
+static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev)
+{
+       int ch;
+
+       /* The guest has not consumed the last byte yet */
+       if (dev->lsr & UART_LSR_DR)
+               return;
+
+       if (sysrq_pending) {
+               serial8250__sysrq(kvm, dev);
+               return;
+       }
+
+       if (!term_readable(CONSOLE_8250))
+               return;
+
+       ch = term_getc(CONSOLE_8250);
+       if (ch >= 0) {
+               dev->rbr = ch;
+               dev->lsr |= UART_LSR_DR;
+       }
+}
+
+/*
+ * Poll for input on ttyS0 and raise its interrupt if one is due.
+ *
+ * Interrupts are injected for ttyS0 only.
+ */
+void serial8250__inject_interrupt(struct kvm *kvm)
+{
+       struct serial8250_device *dev = &devices[0];
+
+       mutex_lock(&dev->mutex);
+
+       /* Pull in a new byte (or sysrq step) first so DR is up to date */
+       serial8250__receive(kvm, dev);
+
+       /* Received data takes priority over the transmitter-empty interrupt */
+       if (dev->ier & UART_IER_RDI && dev->lsr & UART_LSR_DR)
+               dev->iir                = UART_IIR_RDI;
+       else if (dev->ier & UART_IER_THRI)
+               dev->iir                = UART_IIR_THRI;
+       else
+               dev->iir                = UART_IIR_NO_INT;
+
+       if (dev->iir != UART_IIR_NO_INT) {
+               /* Drive the line to 0 and then to 1 */
+               kvm__irq_line(kvm, dev->irq, 0);
+               kvm__irq_line(kvm, dev->irq, 1);
+       }
+
+       mutex_unlock(&dev->mutex);
+}
+
+/*
+ * Arm the sysrq sequence. It is played out by serial8250__sysrq() on the
+ * following receive polls. @kvm is unused.
+ */
+void serial8250__inject_sysrq(struct kvm *kvm)
+{
+       sysrq_pending   = SYSRQ_PENDING_BREAK;
+}
+
+/*
+ * Look up the serial device whose register window contains @port. The low
+ * three bits of the port are masked off before comparing with each base.
+ *
+ * Returns the device, or NULL if no device matches.
+ */
+static struct serial8250_device *find_device(u16 port)
+{
+       const u16 base = port & ~0x7;
+       struct serial8250_device *dev;
+
+       for (dev = devices; dev != devices + ARRAY_SIZE(devices); dev++) {
+               if (dev->iobase == base)
+                       return dev;
+       }
+
+       return NULL;
+}
+
+/*
+ * io_out handler: the guest wrote to a UART register.
+ *
+ * The divisor latch registers are only reachable while DLAB is set in LCR;
+ * with DLAB clear the transmit and interrupt-enable registers are reachable
+ * instead. All remaining registers behave the same in both modes.
+ *
+ * Returns false if @port belongs to no device or the register offset is
+ * not handled, true otherwise.
+ */
+static bool serial8250_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       struct serial8250_device *dev = find_device(port);
+       bool handled = true;
+       u16 reg;
+
+       if (!dev)
+               return false;
+
+       mutex_lock(&dev->mutex);
+
+       reg = port - dev->iobase;
+
+       /* Registers whose meaning depends on the divisor latch access bit */
+       if (dev->lcr & UART_LCR_DLAB) {
+               switch (reg) {
+               case UART_DLL:
+                       dev->dll = ioport__read8(data);
+                       goto out_unlock;
+               case UART_DLM:
+                       dev->dlm = ioport__read8(data);
+                       goto out_unlock;
+               default:
+                       break;
+               }
+       } else {
+               switch (reg) {
+               case UART_TX:
+                       /* With MCR loopback set nothing reaches the terminal */
+                       if (!(dev->mcr & UART_MCR_LOOP))
+                               term_putc(CONSOLE_8250, (char *)data, size);
+
+                       dev->iir = UART_IIR_NO_INT;
+                       goto out_unlock;
+               case UART_IER:
+                       dev->ier = ioport__read8(data) & 0x3f;
+                       goto out_unlock;
+               default:
+                       break;
+               }
+       }
+
+       /* Registers that behave the same whatever DLAB says */
+       switch (reg) {
+       case UART_FCR:
+               dev->fcr = ioport__read8(data);
+               break;
+       case UART_LCR:
+               dev->lcr = ioport__read8(data);
+               break;
+       case UART_MCR:
+               dev->mcr = ioport__read8(data);
+               break;
+       case UART_LSR:  /* Factory test */
+       case UART_MSR:  /* Not used */
+               break;
+       case UART_SCR:
+               dev->scr = ioport__read8(data);
+               break;
+       default:
+               handled = false;
+               break;
+       }
+
+out_unlock:
+       mutex_unlock(&dev->mutex);
+
+       return handled;
+}
+
+/*
+ * io_in handler: the guest read from a UART register.
+ *
+ * The divisor latch registers are returned while DLAB is set in LCR; with
+ * DLAB clear the receive buffer and interrupt-enable registers are returned
+ * instead. All remaining registers are handled by the final switch.
+ *
+ * Returns false if @port belongs to no device or the register offset is
+ * not handled, true otherwise.
+ */
+static bool serial8250_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       struct serial8250_device *dev;
+       u16 offset;
+       bool ret = true;
+
+       dev             = find_device(port);
+       if (!dev)
+               return false;
+
+       mutex_lock(&dev->mutex);
+
+       offset          = port - dev->iobase;
+
+       /* Registers whose meaning depends on the divisor latch access bit */
+       if (dev->lcr & UART_LCR_DLAB) {
+               switch (offset) {
+               case UART_DLL:
+                       ioport__write8(data, dev->dll);
+                       goto out_unlock;
+
+               case UART_DLM:
+                       ioport__write8(data, dev->dlm);
+                       goto out_unlock;
+
+               default:
+                       break;
+               }
+       } else {
+               switch (offset) {
+               case UART_RX:
+                       /* Reading the byte clears data-ready and the pending interrupt */
+                       ioport__write8(data, dev->rbr);
+                       dev->lsr                &= ~UART_LSR_DR;
+                       dev->iir                = UART_IIR_NO_INT;
+                       goto out_unlock;
+
+               case UART_IER:
+                       ioport__write8(data, dev->ier);
+                       goto out_unlock;
+
+               default:
+                       break;
+               }
+       }
+
+       /* Registers that read the same whatever DLAB says */
+       switch (offset) {
+       case UART_IIR: {
+               u8 iir = dev->iir;
+
+               /* The two top bits are reported set while the FIFO is enabled */
+               if (dev->fcr & UART_FCR_ENABLE_FIFO)
+                       iir             |= 0xc0;
+
+               ioport__write8(data, iir);
+               break;
+       }
+       case UART_LCR:
+               ioport__write8(data, dev->lcr);
+               break;
+       case UART_MCR:
+               ioport__write8(data, dev->mcr);
+               break;
+       case UART_LSR:
+               /* The error and break bits are cleared once they have been read */
+               ioport__write8(data, dev->lsr);
+               dev->lsr                &= ~(UART_LSR_OE|UART_LSR_PE|UART_LSR_FE|UART_LSR_BI);
+               break;
+       case UART_MSR:
+               ioport__write8(data, dev->msr);
+               break;
+       case UART_SCR:
+               ioport__write8(data, dev->scr);
+               break;
+       default:
+               ret             = false;
+               goto out_unlock;
+       }
+out_unlock:
+       mutex_unlock(&dev->mutex);
+
+       return ret;
+}
+
+static struct ioport_operations serial8250_ops = {
+       .io_in          = serial8250_in,
+       .io_out         = serial8250_out,
+};
+
+/*
+ * Register the eight I/O ports starting at the device's base address and
+ * drive its IRQ line to level 0.
+ */
+static void serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev)
+{
+       ioport__register(dev->iobase, &serial8250_ops, 8, NULL);
+       kvm__irq_line(kvm, dev->irq, 0);
+}
+
+/* Initialise every entry of the serial device table for @kvm */
+void serial8250__init(struct kvm *kvm)
+{
+       struct serial8250_device *dev;
+
+       for (dev = devices; dev != devices + ARRAY_SIZE(devices); dev++)
+               serial8250__device_init(kvm, dev);
+}
diff --git a/tools/kvm/hw/vesa.c b/tools/kvm/hw/vesa.c
new file mode 100644 (file)
index 0000000..22b1652
--- /dev/null
@@ -0,0 +1,77 @@
+#include "kvm/vesa.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include <sys/mman.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+#include <unistd.h>
+
+static bool vesa_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static bool vesa_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static struct ioport_operations vesa_io_ops = {
+       .io_in                  = vesa_pci_io_in,
+       .io_out                 = vesa_pci_io_out,
+};
+
+static struct pci_device_header vesa_pci_device = {
+       .vendor_id              = PCI_VENDOR_ID_REDHAT_QUMRANET,
+       .device_id              = PCI_DEVICE_ID_VESA,
+       .header_type            = PCI_HEADER_TYPE_NORMAL,
+       .revision_id            = 0,
+       .class                  = 0x030000,
+       .subsys_vendor_id       = PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET,
+       .subsys_id              = PCI_SUBSYSTEM_ID_VESA,
+       .bar[1]                 = VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY,
+       .bar_size[1]            = VESA_MEM_SIZE,
+};
+
+static struct framebuffer vesafb;
+
+/*
+ * Create the VESA PCI device and its framebuffer for @kvm.
+ *
+ * Returns the result of fb__register() for the new framebuffer, or NULL if
+ * no IRQ could be allocated or the framebuffer memory could not be mapped.
+ */
+struct framebuffer *vesa__init(struct kvm *kvm)
+{
+       u16 vesa_base_addr;
+       u8 dev, line, pin;
+       char *mem;
+
+       if (irq__register_device(PCI_DEVICE_ID_VESA, &dev, &pin, &line) < 0)
+               return NULL;
+
+       vesa_pci_device.irq_pin         = pin;
+       vesa_pci_device.irq_line        = line;
+       /* BAR 0 is an I/O port range whose handlers accept and ignore everything */
+       vesa_base_addr                  = ioport__register(IOPORT_EMPTY, &vesa_io_ops, IOPORT_SIZE, NULL);
+       vesa_pci_device.bar[0]          = vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO;
+       pci__register(&vesa_pci_device, dev);
+
+       /* Host memory backing the guest-visible framebuffer at VESA_MEM_ADDR */
+       mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+       if (mem == MAP_FAILED)
+               return NULL;
+
+       kvm__register_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem);
+
+       vesafb = (struct framebuffer) {
+               .width                  = VESA_WIDTH,
+               .height                 = VESA_HEIGHT,
+               .depth                  = VESA_BPP,
+               .mem                    = mem,
+               .mem_addr               = VESA_MEM_ADDR,
+               .mem_size               = VESA_MEM_SIZE,
+       };
+       return fb__register(&vesafb);
+}
diff --git a/tools/kvm/include/asm/hweight.h b/tools/kvm/include/asm/hweight.h
new file mode 100644 (file)
index 0000000..1a43977
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef _KVM_ASM_HWEIGHT_H_
+#define _KVM_ASM_HWEIGHT_H_
+
+#include <linux/types.h>
+unsigned int hweight32(unsigned int w);
+unsigned long hweight64(__u64 w);
+
+#endif /* _KVM_ASM_HWEIGHT_H_ */
diff --git a/tools/kvm/include/kvm/8250-serial.h b/tools/kvm/include/kvm/8250-serial.h
new file mode 100644 (file)
index 0000000..7a6d3f3
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef KVM__8250_SERIAL_H
+#define KVM__8250_SERIAL_H
+
+struct kvm;
+
+void serial8250__init(struct kvm *kvm);
+void serial8250__inject_interrupt(struct kvm *kvm);
+void serial8250__inject_sysrq(struct kvm *kvm);
+
+#endif /* KVM__8250_SERIAL_H */
diff --git a/tools/kvm/include/kvm/apic.h b/tools/kvm/include/kvm/apic.h
new file mode 100644 (file)
index 0000000..2129997
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef KVM_APIC_H_
+#define KVM_APIC_H_
+
+#include <asm/apicdef.h>
+
+/*
+ * APIC, IOAPIC stuff
+ */
+#define APIC_BASE_ADDR_STEP    0x00400000
+#define IOAPIC_BASE_ADDR_STEP  0x00100000
+
+/*
+ * Base address of the Nth APIC / IOAPIC. The argument is parenthesized so
+ * that expressions such as APIC_ADDR(n + 1) expand correctly.
+ */
+#define APIC_ADDR(apic)                (APIC_DEFAULT_PHYS_BASE + (apic) * APIC_BASE_ADDR_STEP)
+#define IOAPIC_ADDR(ioapic)    (IO_APIC_DEFAULT_PHYS_BASE + (ioapic) * IOAPIC_BASE_ADDR_STEP)
+
+#define KVM_APIC_VERSION       0x14 /* xAPIC */
+
+#endif /* KVM_APIC_H_ */
diff --git a/tools/kvm/include/kvm/assembly.h b/tools/kvm/include/kvm/assembly.h
new file mode 100644 (file)
index 0000000..e70baab
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef ASSEMBLY_H_
+#define ASSEMBLY_H_
+
+#define __ALIGN        .p2align 4, 0x90
+#define ENTRY(name)    \
+       __ALIGN;        \
+       .globl name;    \
+       name:
+
+#define GLOBAL(name)   \
+       .globl name;    \
+       name:
+
+#define ENTRY_END(name)        GLOBAL(name##_end)
+#define END(name)      GLOBAL(name##_end)
+
+/*
+ * gas produces size override prefix with which
+ * we are unhappy, let's make it hardcoded for
+ * 16 bit mode
+ */
+#define IRET   .byte 0xcf
+
+#endif /* ASSEMBLY_H_ */
diff --git a/tools/kvm/include/kvm/barrier.h b/tools/kvm/include/kvm/barrier.h
new file mode 100644 (file)
index 0000000..c11a239
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+/*
+ * asm/system.h cannot be #included standalone on 32-bit x86 yet.
+ *
+ * Provide the dependencies here - we can drop these wrappers once
+ * the header is fixed upstream:
+ */
+#include <linux/stringify.h>
+#include <linux/bitops.h>
+#include <asm/alternative.h>
+#include <asm/system.h>
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/include/kvm/bios-export.h b/tools/kvm/include/kvm/bios-export.h
new file mode 100644 (file)
index 0000000..23825aa
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef BIOS_EXPORT_H_
+#define BIOS_EXPORT_H_
+
+struct kvm;
+
+extern char bios_rom[0];
+extern char bios_rom_end[0];
+
+#define bios_rom_size          (bios_rom_end - bios_rom)
+
+extern void setup_bios(struct kvm *kvm);
+
+#endif /* BIOS_EXPORT_H_ */
diff --git a/tools/kvm/include/kvm/bios.h b/tools/kvm/include/kvm/bios.h
new file mode 100644 (file)
index 0000000..469576e
--- /dev/null
@@ -0,0 +1,83 @@
+#ifndef BIOS_H_
+#define BIOS_H_
+
+/*
+ * X86-32 Memory Map (typical)
+ *                                     start      end
+ * Real Mode Interrupt Vector Table    0x00000000 0x000003FF
+ * BDA area                            0x00000400 0x000004FF
+ * Conventional Low Memory             0x00000500 0x0009FBFF
+ * EBDA area                           0x0009FC00 0x0009FFFF
+ * VIDEO RAM                           0x000A0000 0x000BFFFF
+ * VIDEO ROM (BIOS)                    0x000C0000 0x000C7FFF
+ * Motherboard BIOS                    0x000F0000 0x000FFFFF
+ * Extended Memory                     0x00100000 0xFEBFFFFF
+ * Reserved (configs, ACPI, PnP, etc)  0xFEC00000 0xFFFFFFFF
+ */
+
+#define REAL_MODE_IVT_BEGIN            0x00000000
+#define REAL_MODE_IVT_END              0x000003ff
+
+#define BDA_START                      0x00000400
+#define BDA_END                                0x000004ff
+
+#define EBDA_START                     0x0009fc00
+#define EBDA_END                       0x0009ffff
+
+#define E820_MAP_START                 EBDA_START
+
+#define MB_BIOS_BEGIN                  0x000f0000
+#define MB_BIOS_END                    0x000fffff
+
+#define VGA_RAM_BEGIN                  0x000a0000
+#define VGA_RAM_END                    0x000bffff
+
+#define VGA_ROM_BEGIN                  0x000c0000
+#define VGA_ROM_END                    0x000c7fff
+
+/* we handle one page only */
+#define VGA_RAM_SEG                    (VGA_RAM_BEGIN >> 4)
+#define VGA_PAGE_SIZE                  0x007d0 /* 80x25 */
+
+/* real mode interrupt vector table */
+#define REAL_INTR_BASE                 REAL_MODE_IVT_BEGIN
+#define REAL_INTR_VECTORS              256
+
+/*
+ * BIOS stack must be at absolute predefined memory address
+ * We reserve 64 bytes for BIOS stack
+ */
+#define MB_BIOS_SS                     0xfff7
+#define MB_BIOS_SP                     0x40
+
+/*
+ * When interfacing with assembler code we need to be sure how
+ * arguments are passed in real mode.
+ */
+#define bioscall __attribute__((regparm(3)))
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+
+struct biosregs {
+       u32                     eax;
+       u32                     ebx;
+       u32                     ecx;
+       u32                     edx;
+       u32                     esp;
+       u32                     ebp;
+       u32                     esi;
+       u32                     edi;
+       u32                     es;
+       u32                     fs;
+       u32                     eip;
+       u32                     eflags;
+};
+
+void int10_handler(struct biosregs *regs);
+void int15_handler(struct biosregs *regs);
+
+#endif
+
+#endif /* BIOS_H_ */
diff --git a/tools/kvm/include/kvm/boot-protocol.h b/tools/kvm/include/kvm/boot-protocol.h
new file mode 100644 (file)
index 0000000..85b637f
--- /dev/null
@@ -0,0 +1,16 @@
+/*
+ * Linux boot protocol specifics
+ */
+
+#ifndef BOOT_PROTOCOL_H_
+#define BOOT_PROTOCOL_H_
+
+/*
+ * The protected mode kernel part of a modern bzImage is loaded
+ * at 1 MB by default.
+ */
+#define BZ_DEFAULT_SETUP_SECTS         4
+#define BZ_KERNEL_START                        0x100000UL
+#define INITRD_START                   0x1000000UL
+
+#endif /* BOOT_PROTOCOL_H_ */
diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h
new file mode 100644 (file)
index 0000000..bd1d882
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef KVM__BRLOCK_H
+#define KVM__BRLOCK_H
+
+#include "kvm/kvm.h"
+#include "kvm/barrier.h"
+
+/*
+ * brlock is a lock which is very cheap for reads, but very expensive
+ * for writes.
+ * This lock will be used when updates are very rare and reads are common.
+ * This lock is currently implemented by stopping the guest while
+ * performing the updates. We assume that the only threads which read from
+ * the locked data are VCPU threads, and the only writer isn't a VCPU thread.
+ */
+
+#ifndef barrier
+#define barrier()              __asm__ __volatile__("": : :"memory")
+#endif
+
+#ifdef KVM_BRLOCK_DEBUG
+
+#include "kvm/rwsem.h"
+
+DECLARE_RWSEM(brlock_sem);
+
+/*
+ * Debug variant: back the brlock with a real rwsem. The macros expand to a
+ * bare expression (no trailing semicolon), like the non-debug variant, so
+ * they stay usable in unbraced if/else bodies.
+ */
+#define br_read_lock()         down_read(&brlock_sem)
+#define br_read_unlock()       up_read(&brlock_sem)
+
+#define br_write_lock()                down_write(&brlock_sem)
+#define br_write_unlock()      up_write(&brlock_sem)
+
+#else
+
+#define br_read_lock()         barrier()
+#define br_read_unlock()       barrier()
+
+#define br_write_lock()                kvm__pause()
+#define br_write_unlock()      kvm__continue()
+#endif
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-balloon.h b/tools/kvm/include/kvm/builtin-balloon.h
new file mode 100644 (file)
index 0000000..85055eb
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__BALLOON_H
+#define KVM__BALLOON_H
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix);
+void kvm_balloon_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-debug.h b/tools/kvm/include/kvm/builtin-debug.h
new file mode 100644 (file)
index 0000000..3fc2469
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__DEBUG_H
+#define KVM__DEBUG_H
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix);
+void kvm_debug_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-help.h b/tools/kvm/include/kvm/builtin-help.h
new file mode 100644 (file)
index 0000000..2946743
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __KVM_HELP_H__
+#define __KVM_HELP_H__
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-list.h b/tools/kvm/include/kvm/builtin-list.h
new file mode 100644 (file)
index 0000000..04fca22
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__LIST_H
+#define KVM__LIST_H
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix);
+void kvm_list_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-pause.h b/tools/kvm/include/kvm/builtin-pause.h
new file mode 100644 (file)
index 0000000..540cc8e
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__PAUSE_H
+#define KVM__PAUSE_H
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix);
+void kvm_pause_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-resume.h b/tools/kvm/include/kvm/builtin-resume.h
new file mode 100644 (file)
index 0000000..9e6e8d7
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__RESUME_H
+#define KVM__RESUME_H
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix);
+void kvm_resume_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-run.h b/tools/kvm/include/kvm/builtin-run.h
new file mode 100644 (file)
index 0000000..d056ad4
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef __KVM_RUN_H__
+#define __KVM_RUN_H__
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix);
+void kvm_run_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h
new file mode 100644 (file)
index 0000000..6e183a1
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef KVM__SETUP_H
+#define KVM__SETUP_H
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix);
+void kvm_setup_help(void);
+int kvm_setup_create_new(const char *guestfs_name);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stat.h b/tools/kvm/include/kvm/builtin-stat.h
new file mode 100644 (file)
index 0000000..e3ce292
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__STAT_H
+#define KVM__STAT_H
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix);
+void kvm_stat_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stop.h b/tools/kvm/include/kvm/builtin-stop.h
new file mode 100644 (file)
index 0000000..7570695
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM__STOP_H
+#define KVM__STOP_H
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix);
+void kvm_stop_help(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-version.h b/tools/kvm/include/kvm/builtin-version.h
new file mode 100644 (file)
index 0000000..83cac4d
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__VERSION_H
+#define KVM__VERSION_H
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/compiler.h b/tools/kvm/include/kvm/compiler.h
new file mode 100644 (file)
index 0000000..b203480
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM_COMPILER_H_
+#define KVM_COMPILER_H_
+
+#define notrace __attribute__((no_instrument_function))
+
+#endif /* KVM_COMPILER_H_ */
diff --git a/tools/kvm/include/kvm/cpufeature.h b/tools/kvm/include/kvm/cpufeature.h
new file mode 100644 (file)
index 0000000..bc4abbb
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef KVM__CPUFEATURE_H
+#define KVM__CPUFEATURE_H
+
+#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */
+#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */
+#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */
+
+#define CPUID_VENDOR_AMD_1   0x68747541 /* "Auth" */
+#define CPUID_VENDOR_AMD_2   0x69746e65 /* "enti" */
+#define CPUID_VENDOR_AMD_3   0x444d4163 /* "cAMD" */
+
+/*
+ * CPUID flags we need to deal with
+ */
+#define KVM__X86_FEATURE_VMX           5       /* Hardware virtualization */
+#define KVM__X86_FEATURE_SVM           2       /* Secure virtual machine */
+#define KVM__X86_FEATURE_XSAVE         26      /* XSAVE/XRSTOR/XSETBV/XGETBV */
+
+#define cpu_feature_disable(reg, feature)      \
+       ((reg) & ~(1 << (feature)))
+#define cpu_feature_enable(reg, feature)       \
+       ((reg) |  (1 << (feature)))
+
+struct cpuid_regs {
+       u32     eax;
+       u32     ebx;
+       u32     ecx;
+       u32     edx;
+};
+
+static inline void host_cpuid(struct cpuid_regs *regs)
+{
+       asm volatile("cpuid"
+               : "=a" (regs->eax),
+                 "=b" (regs->ebx),
+                 "=c" (regs->ecx),
+                 "=d" (regs->edx)
+               : "0" (regs->eax), "2" (regs->ecx));
+}
+
+#endif /* KVM__CPUFEATURE_H */
diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h
new file mode 100644 (file)
index 0000000..75b54f9
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef KVM__DISK_IMAGE_H
+#define KVM__DISK_IMAGE_H
+
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>  /* for BLKGETSIZE64 */
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define SECTOR_SHIFT           9
+#define SECTOR_SIZE            (1UL << SECTOR_SHIFT)
+
+#define DISK_IMAGE_MMAP                0
+#define DISK_IMAGE_NOMMAP      1
+#define MAX_DISK_IMAGES         4
+
+struct disk_image;
+
+struct disk_image_operations {
+       /*
+        * The following two are used for reading or writing with a single buffer.
+        * The implementation can use readv/writev or memcpy.
+        */
+       ssize_t (*read_sector)(struct disk_image *disk, u64 sector, void *dst, u32 dst_len);
+       ssize_t (*write_sector)(struct disk_image *disk, u64 sector, void *src, u32 src_len);
+       /*
+        * The following two are used for reading or writing with multiple buffers.
+        * The implementation can use readv/writev or memcpy.
+        */
+       ssize_t (*read_sector_iov)(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount);
+       ssize_t (*write_sector_iov)(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount);
+       int (*flush)(struct disk_image *disk);
+       int (*close)(struct disk_image *disk);
+};
+
+struct disk_image {
+       int                             fd;
+       u64                             size;
+       struct disk_image_operations    *ops;
+       void                            *priv;
+};
+
+struct disk_image *disk_image__open(const char *filename, bool readonly);
+struct disk_image **disk_image__open_all(const char **filenames, bool *readonly, int count);
+struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
+int disk_image__close(struct disk_image *disk);
+void disk_image__close_all(struct disk_image **disks, int count);
+int disk_image__flush(struct disk_image *disk);
+ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount);
+ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount);
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len);
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly);
+struct disk_image *blkdev__probe(const char *filename, struct stat *st);
+
+ssize_t raw_image__read_sector_iov(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount);
+ssize_t raw_image__write_sector_iov(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount);
+ssize_t raw_image__read_sector(struct disk_image *disk, u64 sector, void *dst, u32 dst_len);
+ssize_t raw_image__write_sector(struct disk_image *disk, u64 sector, void *src, u32 src_len);
+int raw_image__close(struct disk_image *disk);
+
+#endif /* KVM__DISK_IMAGE_H */
diff --git a/tools/kvm/include/kvm/e820.h b/tools/kvm/include/kvm/e820.h
new file mode 100644 (file)
index 0000000..d23c177
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef KVM_E820_H
+#define KVM_E820_H
+
+#include <linux/types.h>
+
+#define SMAP    0x534d4150      /* ASCII "SMAP" */
+
+struct biosregs;
+
+void e820_query_map(struct biosregs *regs);
+
+#endif /* KVM_E820_H */
diff --git a/tools/kvm/include/kvm/framebuffer.h b/tools/kvm/include/kvm/framebuffer.h
new file mode 100644 (file)
index 0000000..b66d0ba
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef KVM__FRAMEBUFFER_H
+#define KVM__FRAMEBUFFER_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct framebuffer;
+
+struct fb_target_operations {
+       int (*start)(struct framebuffer *fb);
+};
+
+#define FB_MAX_TARGETS                 2
+
+struct framebuffer {
+       struct list_head                node;
+
+       u32                             width;
+       u32                             height;
+       u8                              depth;
+       char                            *mem;
+       u64                             mem_addr;
+       u64                             mem_size;
+
+       unsigned long                   nr_targets;
+       struct fb_target_operations     *targets[FB_MAX_TARGETS];
+};
+
+struct framebuffer *fb__register(struct framebuffer *fb);
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops);
+int fb__start(void);
+void fb__stop(void);
+
+#endif /* KVM__FRAMEBUFFER_H */
diff --git a/tools/kvm/include/kvm/guest_compat.h b/tools/kvm/include/kvm/guest_compat.h
new file mode 100644 (file)
index 0000000..ae7abbd
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__GUEST_COMPAT_H
+#define KVM__GUEST_COMPAT_H
+
+int compat__print_all_messages(void);
+int compat__remove_message(int id);
+int compat__add_message(const char *title, const char *description);
+
+
+#endif
\ No newline at end of file
diff --git a/tools/kvm/include/kvm/i8042.h b/tools/kvm/include/kvm/i8042.h
new file mode 100644 (file)
index 0000000..13f18e2
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef KVM__PCKBD_H
+#define KVM__PCKBD_H
+
+#include <linux/types.h>
+
+struct kvm;
+
+void mouse_queue(u8 c);
+void kbd_queue(u8 c);
+void kbd__init(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/include/kvm/interrupt.h b/tools/kvm/include/kvm/interrupt.h
new file mode 100644 (file)
index 0000000..00c7ed7
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef KVM__INTERRUPT_H
+#define KVM__INTERRUPT_H
+
+#include <linux/types.h>
+#include "kvm/bios.h"
+#include "kvm/bios-export.h"
+
+struct real_intr_desc {
+       u16 offset;
+       u16 segment;
+} __attribute__((packed));
+
+#define REAL_SEGMENT_SHIFT     4
+#define REAL_SEGMENT(addr)     ((addr) >> REAL_SEGMENT_SHIFT)
+#define REAL_OFFSET(addr)      ((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1))
+#define REAL_INTR_SIZE         (REAL_INTR_VECTORS * sizeof(struct real_intr_desc))
+
+struct interrupt_table {
+       struct real_intr_desc entries[REAL_INTR_VECTORS];
+};
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size);
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry);
+void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num);
+
+#endif /* KVM__INTERRUPT_H */
diff --git a/tools/kvm/include/kvm/ioeventfd.h b/tools/kvm/include/kvm/ioeventfd.h
new file mode 100644 (file)
index 0000000..df01750
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef KVM__IOEVENTFD_H
+#define KVM__IOEVENTFD_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <sys/eventfd.h>
+
+struct kvm;
+
+struct ioevent {
+       u64                     io_addr;
+       u8                      io_len;
+       void                    (*fn)(struct kvm *kvm, void *ptr);
+       struct kvm              *fn_kvm;
+       void                    *fn_ptr;
+       int                     fd;
+       u64                     datamatch;
+
+       struct list_head        list;
+};
+
+void ioeventfd__init(void);
+void ioeventfd__start(void);
+void ioeventfd__add_event(struct ioevent *ioevent);
+void ioeventfd__del_event(u64 addr, u64 datamatch);
+
+#endif
diff --git a/tools/kvm/include/kvm/ioport.h b/tools/kvm/include/kvm/ioport.h
new file mode 100644 (file)
index 0000000..5b857dd
--- /dev/null
@@ -0,0 +1,65 @@
+#ifndef KVM__IOPORT_H
+#define KVM__IOPORT_H
+
+#include "kvm/rbtree-interval.h"
+
+#include <stdbool.h>
+#include <limits.h>
+#include <asm/types.h>
+#include <linux/types.h>
+
+/* some ports we reserve for own use */
+#define IOPORT_DBG                     0xe0
+#define IOPORT_START                   0x6200
+#define IOPORT_SIZE                    0x400
+
+#define IOPORT_EMPTY                   USHRT_MAX
+
+struct kvm;
+
+struct ioport {
+       struct rb_int_node              node;
+       struct ioport_operations        *ops;
+       void                            *priv;
+};
+
+struct ioport_operations {
+       bool (*io_in)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size);
+       bool (*io_out)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size);
+};
+
+void ioport__setup_legacy(void);
+
+u16 ioport__register(u16 port, struct ioport_operations *ops, int count, void *param);
+
+static inline u8 ioport__read8(u8 *data)
+{
+       return *data;
+}
+
+static inline u16 ioport__read16(u16 *data)
+{
+       return *data;
+}
+
+static inline u32 ioport__read32(u32 *data)
+{
+       return *data;
+}
+
+static inline void ioport__write8(u8 *data, u8 value)
+{
+       *data            = value;
+}
+
+static inline void ioport__write16(u16 *data, u16 value)
+{
+       *data            = value;
+}
+
+static inline void ioport__write32(u32 *data, u32 value)
+{
+       *data            = value;
+}
+
+#endif /* KVM__IOPORT_H */
diff --git a/tools/kvm/include/kvm/irq.h b/tools/kvm/include/kvm/irq.h
new file mode 100644 (file)
index 0000000..401bee9
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef KVM__IRQ_H
+#define KVM__IRQ_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+struct kvm;
+
+struct irq_line {
+       u8                      line;
+       struct list_head        node;
+};
+
+struct pci_dev {
+       struct rb_node          node;
+       u32                     id;
+       u8                      pin;
+       struct list_head        lines;
+};
+
+int irq__register_device(u32 dev, u8 *num, u8 *pin, u8 *line);
+
+struct rb_node *irq__get_pci_tree(void);
+
+void irq__init(struct kvm *kvm);
+int irq__add_msix_route(struct kvm *kvm, u32 low, u32 high, u32 data);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cmd.h b/tools/kvm/include/kvm/kvm-cmd.h
new file mode 100644 (file)
index 0000000..0a73bce
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef __KVM_CMD_H__
+#define __KVM_CMD_H__
+
+struct cmd_struct {
+       const char *cmd;
+       int (*fn)(int, const char **, const char *);
+       void (*help)(void);
+       int option;
+};
+
+extern struct cmd_struct kvm_commands[];
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+                const char *cmd);
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h
new file mode 100644 (file)
index 0000000..95f3f9d
--- /dev/null
@@ -0,0 +1,44 @@
+#ifndef KVM__KVM_CPU_H
+#define KVM__KVM_CPU_H
+
+#include <linux/kvm.h> /* for struct kvm_regs */
+
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+       pthread_t               thread;         /* VCPU thread */
+
+       unsigned long           cpu_id;
+
+       struct kvm              *kvm;           /* parent KVM */
+       int                     vcpu_fd;        /* For VCPU ioctls() */
+       struct kvm_run          *kvm_run;
+
+       struct kvm_regs         regs;
+       struct kvm_sregs        sregs;
+       struct kvm_fpu          fpu;
+
+       struct kvm_msrs         *msrs;          /* dynamically allocated */
+
+       u8                      is_running;
+       u8                      paused;
+
+       struct kvm_coalesced_mmio_ring  *ring;
+};
+
+struct kvm_cpu *kvm_cpu__init(struct kvm *kvm, unsigned long cpu_id);
+void kvm_cpu__delete(struct kvm_cpu *vcpu);
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu);
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu);
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu);
+void kvm_cpu__run(struct kvm_cpu *vcpu);
+void kvm_cpu__reboot(void);
+int kvm_cpu__start(struct kvm_cpu *cpu);
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu);
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu);
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu);
+
+#endif /* KVM__KVM_CPU_H */
diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
new file mode 100644 (file)
index 0000000..bb40c4c
--- /dev/null
@@ -0,0 +1,112 @@
+#ifndef KVM__KVM_H
+#define KVM__KVM_H
+
+#include "kvm/interrupt.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+#include <signal.h>
+
+#define KVM_NR_CPUS            (255)
+
+/*
+ * The hole includes VESA framebuffer and PCI memory.
+ */
+#define KVM_32BIT_GAP_SIZE     (768 << 20)
+#define KVM_32BIT_GAP_START    ((1ULL << 32) - KVM_32BIT_GAP_SIZE)
+
+#define SIGKVMEXIT             (SIGRTMIN + 0)
+#define SIGKVMPAUSE            (SIGRTMIN + 1)
+#define SIGKVMADDMEM           (SIGRTMIN + 2)
+#define SIGKVMDELMEM           (SIGRTMIN + 3)
+#define SIGKVMSTOP             (SIGRTMIN + 4)
+#define SIGKVMRESUME           (SIGRTMIN + 5)
+#define SIGKVMMEMSTAT          (SIGRTMIN + 6)
+
+#define KVM_PID_FILE_PATH      "/.kvm-tools/"
+#define HOME_DIR               getenv("HOME")
+
+struct kvm {
+       int                     sys_fd;         /* For system ioctls(), i.e. /dev/kvm */
+       int                     vm_fd;          /* For VM ioctls() */
+       timer_t                 timerid;        /* Posix timer for interrupts */
+
+       int                     nrcpus;         /* Number of cpus to run */
+
+       u32                     mem_slots;      /* for KVM_SET_USER_MEMORY_REGION */
+
+       u64                     ram_size;
+       void                    *ram_start;
+
+       bool                    nmi_disabled;
+
+       bool                    single_step;
+
+       u16                     boot_selector;
+       u16                     boot_ip;
+       u16                     boot_sp;
+
+       struct interrupt_table  interrupt_table;
+
+       const char              *vmlinux;
+       struct disk_image       **disks;
+       int                     nr_disks;
+
+       const char              *name;
+};
+
+struct kvm *kvm__init(const char *kvm_dev, u64 ram_size, const char *name);
+int kvm__recommended_cpus(struct kvm *kvm);
+int kvm__max_cpus(struct kvm *kvm);
+void kvm__init_ram(struct kvm *kvm);
+void kvm__delete(struct kvm *kvm);
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+                       const char *initrd_filename, const char *kernel_cmdline, u16 vidmode);
+void kvm__setup_bios(struct kvm *kvm);
+void kvm__start_timer(struct kvm *kvm);
+void kvm__stop_timer(struct kvm *kvm);
+void kvm__irq_line(struct kvm *kvm, int irq, int level);
+void kvm__irq_trigger(struct kvm *kvm, int irq);
+bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count);
+bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+void kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr);
+bool kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, void (*kvm_mmio_callback_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr), void *ptr);
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr);
+void kvm__pause(void);
+void kvm__continue(void);
+void kvm__notify_paused(void);
+pid_t kvm__get_pid_by_instance(const char *name);
+int kvm__enumerate_instances(int (*callback)(const char *name, int pid));
+void kvm__remove_pidfile(const char *name);
+
+/*
+ * Debugging
+ */
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size);
+
+extern const char *kvm_exit_reasons[];
+
+static inline bool host_ptr_in_ram(struct kvm *kvm, void *p)
+{
+       return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size);
+}
+
+static inline u32 segment_to_flat(u16 selector, u16 offset)
+{
+       return ((u32)selector << 4) + (u32) offset;
+}
+
+static inline void *guest_flat_to_host(struct kvm *kvm, unsigned long offset)
+{
+       return kvm->ram_start + offset;
+}
+
+static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset)
+{
+       unsigned long flat = segment_to_flat(selector, offset);
+
+       return guest_flat_to_host(kvm, flat);
+}
+
+#endif /* KVM__KVM_H */
diff --git a/tools/kvm/include/kvm/mptable.h b/tools/kvm/include/kvm/mptable.h
new file mode 100644 (file)
index 0000000..8557ae8
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef KVM_MPTABLE_H_
+#define KVM_MPTABLE_H_
+
+struct kvm;
+
+void mptable_setup(struct kvm *kvm, unsigned int ncpus);
+
+#endif /* KVM_MPTABLE_H_ */
diff --git a/tools/kvm/include/kvm/mutex.h b/tools/kvm/include/kvm/mutex.h
new file mode 100644 (file)
index 0000000..3286cea
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef KVM__MUTEX_H
+#define KVM__MUTEX_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike mutex API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DEFINE_MUTEX(mutex) pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER
+
+static inline void mutex_init(pthread_mutex_t *mutex)
+{
+       if (pthread_mutex_init(mutex, NULL) != 0)
+               die("unexpected pthread_mutex_init() failure!");
+}
+
+static inline void mutex_lock(pthread_mutex_t *mutex)
+{
+       if (pthread_mutex_lock(mutex) != 0)
+               die("unexpected pthread_mutex_lock() failure!");
+}
+
+static inline void mutex_unlock(pthread_mutex_t *mutex)
+{
+       if (pthread_mutex_unlock(mutex) != 0)
+               die("unexpected pthread_mutex_unlock() failure!");
+}
+
+#endif /* KVM__MUTEX_H */
diff --git a/tools/kvm/include/kvm/parse-options.h b/tools/kvm/include/kvm/parse-options.h
new file mode 100644 (file)
index 0000000..b65820d
--- /dev/null
@@ -0,0 +1,213 @@
+#ifndef __PARSE_OPTIONS_H__
+#define __PARSE_OPTIONS_H__
+
+#include <inttypes.h>
+
+enum parse_opt_type {
+       /* special types */
+       OPTION_END,
+       OPTION_ARGUMENT,
+       OPTION_GROUP,
+       /* options with no arguments */
+       OPTION_BIT,
+       OPTION_BOOLEAN,
+       OPTION_INCR,
+       OPTION_SET_UINT,
+       OPTION_SET_PTR,
+       /* options with arguments (usually) */
+       OPTION_STRING,
+       OPTION_INTEGER,
+       OPTION_LONG,
+       OPTION_CALLBACK,
+       OPTION_U64,
+       OPTION_UINTEGER,
+};
+
+enum parse_opt_flags {
+       PARSE_OPT_KEEP_DASHDASH = 1,
+       PARSE_OPT_STOP_AT_NON_OPTION = 2,
+       PARSE_OPT_KEEP_ARGV0 = 4,
+       PARSE_OPT_KEEP_UNKNOWN = 8,
+       PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+       PARSE_OPT_OPTARG  = 1,
+       PARSE_OPT_NOARG   = 2,
+       PARSE_OPT_NONEG   = 4,
+       PARSE_OPT_HIDDEN  = 8,
+       PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogenous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN: this option is skipped in the default usage, shown in
+ *                    the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+enum parse_opt_type type;
+int short_name;
+const char *long_name;
+void *value;
+const char *argh;
+const char *help;
+
+int flags;
+parse_opt_cb *callback;
+intptr_t defval;
+};
+
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define check_vtype(v, type) /* yields v; build breaks unless v is compatible with 'type' */ \
+       (BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(__typeof__(v), type)) + (v))
+
+#define OPT_INTEGER(s, l, v, h)             \
+{                                           \
+       .type = OPTION_INTEGER,             \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, int *),     \
+       .help = (h)                         \
+}
+
+#define OPT_U64(s, l, v, h)                 \
+{                                           \
+       .type = OPTION_U64,                 \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, u64 *),     \
+       .help = (h)                         \
+}
+
+#define OPT_STRING(s, l, v, a, h)           \
+{   /* 'a' is the argh usage token */       \
+       .type = OPTION_STRING,              \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, const char **), .argh = (a), \
+       .help = (h)                         \
+}
+
+#define OPT_BOOLEAN(s, l, v, h)             \
+{                                           \
+       .type = OPTION_BOOLEAN,             \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, bool *),    \
+       .help = (h)                         \
+}
+
+#define OPT_INCR(s, l, v, h)                \
+{                                           \
+       .type = OPTION_INCR,                \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = check_vtype(v, int *),     \
+       .help = (h)                         \
+}
+
+#define OPT_GROUP(h)                        \
+{                                           \
+       .type = OPTION_GROUP,               \
+       .help = (h)                         \
+}
+
+#define OPT_CALLBACK(s, l, v, a, h, f)      \
+{   /* 'f' is invoked for this option */   \
+       .type = OPTION_CALLBACK,            \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = (v),                       \
+       .argh = (a),                        \
+       .help = (h),                        \
+       .callback = (f)                     \
+}
+
+#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \
+{   /* callback option taking no argument */ \
+       .type = OPTION_CALLBACK,            \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = (v),                       \
+       .argh = (a),                        \
+       .help = (h),                        \
+       .callback = (f),                    \
+       .flags = PARSE_OPT_NOARG            \
+}
+
+#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \
+{   /* 'd' is stored in defval */          \
+       .type = OPTION_CALLBACK,            \
+       .short_name = (s),                  \
+       .long_name = (l),                   \
+       .value = (v), .argh = (a),          \
+       .help = (h),                        \
+       .callback = (f),                    \
+       .defval = (intptr_t)(d),            \
+       .flags = PARSE_OPT_LASTARG_DEFAULT  \
+}
+
+#define OPT_END() { .type = OPTION_END }
+
+enum {
+       PARSE_OPT_HELP = -1,
+       PARSE_OPT_DONE,
+       PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ **/
+struct parse_opt_ctx_t {
+       const char **argv;
+       const char **out;
+       int argc, cpidx;
+       const char *opt;
+       int flags;
+};
+
+/* global functions */
+void usage_with_options(const char * const *usagestr,
+               const struct option *opts);
+int parse_options(int argc, const char **argv, const struct option *options,
+               const char * const usagestr[], int flags);
+#endif
diff --git a/tools/kvm/include/kvm/pci-shmem.h b/tools/kvm/include/kvm/pci-shmem.h
new file mode 100644 (file)
index 0000000..599ab37
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef KVM__PCI_SHMEM_H
+#define KVM__PCI_SHMEM_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+#define SHMEM_DEFAULT_SIZE (16 << MB_SHIFT)
+#define SHMEM_DEFAULT_ADDR (0xc8000000)
+#define SHMEM_DEFAULT_HANDLE "/kvm_shmem"
+
+struct kvm;
+struct shmem_info;
+
+struct shmem_info {
+       u64 phys_addr;
+       u64 size;
+       char *handle;
+       int create;
+};
+
+int pci_shmem__init(struct kvm *self);
+int pci_shmem__register_mem(struct shmem_info *si);
+
+int pci_shmem__get_local_irqfd(struct kvm *kvm);
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd);
+int pci_shmem__remove_client(struct kvm *kvm, u32 id);
+
+#endif
diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h
new file mode 100644 (file)
index 0000000..5ee8005
--- /dev/null
@@ -0,0 +1,75 @@
+#ifndef KVM__PCI_H
+#define KVM__PCI_H
+
+#include <linux/types.h>
+
+#include <linux/pci_regs.h>
+
+/*
+ * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
+ * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
+ * details.
+ */
+#define PCI_CONFIG_ADDRESS     0xcf8
+#define PCI_CONFIG_DATA                0xcfc
+#define PCI_CONFIG_BUS_FORWARD 0xcfa
+#define PCI_IO_SIZE            0x100
+
+struct pci_config_address {
+       unsigned        zeros           : 2;            /* 1  .. 0  */
+       unsigned        register_number : 6;            /* 7  .. 2  */
+       unsigned        function_number : 3;            /* 10 .. 8  */
+       unsigned        device_number   : 5;            /* 15 .. 11 */
+       unsigned        bus_number      : 8;            /* 23 .. 16 */
+       unsigned        reserved        : 7;            /* 30 .. 24 */
+       unsigned        enable_bit      : 1;            /* 31       */
+};
+
+struct msix_table {
+       u32 low;
+       u32 high;
+       u32 data;
+       u32 ctrl;
+};
+
+struct msix_cap {
+       u8 cap;
+       u8 next;
+       u16 ctrl;
+       u32 table_offset;
+       u32 pba_offset;
+};
+
+struct pci_device_header {
+       u16             vendor_id;
+       u16             device_id;
+       u16             command;
+       u16             status;
+       u16             revision_id             :  8;
+       u32             class                   : 24;
+       u8              cacheline_size;
+       u8              latency_timer;
+       u8              header_type;
+       u8              bist;
+       u32             bar[6];
+       u32             card_bus;
+       u16             subsys_vendor_id;
+       u16             subsys_id;
+       u32             exp_rom_bar;
+       u32             capabilities            :  8;
+       u32             reserved1               : 24;
+       u32             reserved2;
+       u8              irq_line;
+       u8              irq_pin;
+       u8              min_gnt;
+       u8              max_lat;
+       struct msix_cap msix;
+       u8              empty[136]; /* Rest of PCI config space */
+       u32             bar_size[6];
+};
+
+void pci__init(void);
+void pci__register(struct pci_device_header *dev, u8 dev_num);
+u32 pci_get_io_space_block(u32 size);
+
+#endif /* KVM__PCI_H */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
new file mode 100644 (file)
index 0000000..46db702
--- /dev/null
@@ -0,0 +1,122 @@
+#ifndef KVM__QCOW_H
+#define KVM__QCOW_H
+
+#include "kvm/mutex.h"
+
+#include <linux/types.h>
+#include <stdbool.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+#define QCOW_MAGIC             (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW1_VERSION          1
+#define QCOW2_VERSION          2
+
+#define QCOW_OFLAG_COPIED      (1ULL << 63)
+#define QCOW_OFLAG_COMPRESSED  (1ULL << 62)
+
+#define QCOW_OFLAGS_MASK       (QCOW_OFLAG_COPIED|QCOW_OFLAG_COMPRESSED)
+
+#define QCOW_OFFSET_MASK       (~QCOW_OFLAGS_MASK)
+
+#define MAX_CACHE_NODES         32
+
+struct qcow_l2_table {
+       u64                             offset;
+       struct rb_node                  node;
+       struct list_head                list;
+       u8                              dirty;
+       u64                             table[];
+};
+
+struct qcow_l1_table {
+       u32                             table_size;
+       u64                             *l1_table;
+
+       /* Level2 caching data structures */
+       struct rb_root                  root;
+       struct list_head                lru_list;
+       int                             nr_cached;
+};
+
+#define QCOW_REFCOUNT_BLOCK_SHIFT      1
+
+struct qcow_refcount_block {
+       u64                             offset;
+       struct rb_node                  node;
+       struct list_head                list;
+       u64                             size;
+       u8                              dirty;
+       u16                             entries[];
+};
+
+struct qcow_refcount_table {
+       u32                             rf_size;
+       u64                             *rf_table;
+
+       /* Refcount block caching data structures */
+       struct rb_root                  root;
+       struct list_head                lru_list;
+       int                             nr_cached;
+};
+
+struct qcow {
+       pthread_mutex_t                 mutex;
+       void                            *header;
+       struct qcow_l1_table            table;
+       struct qcow_refcount_table      refcount_table;
+       int                             fd;
+};
+
+struct qcow_header {
+       u64                             size;   /* in bytes */
+       u64                             l1_table_offset;
+       u32                             l1_size;
+       u8                              cluster_bits;
+       u8                              l2_bits;
+       u64                             refcount_table_offset;
+       u32                             refcount_table_size;
+};
+
+struct qcow1_header_disk {
+       u32                             magic;
+       u32                             version;
+
+       u64                             backing_file_offset;
+       u32                             backing_file_size;
+       u32                             mtime;
+
+       u64                             size;   /* in bytes */
+
+       u8                              cluster_bits;
+       u8                              l2_bits;
+       u32                             crypt_method;
+
+       u64                             l1_table_offset;
+};
+
+struct qcow2_header_disk {
+       u32                             magic;
+       u32                             version;
+
+       u64                             backing_file_offset;
+       u32                             backing_file_size;
+
+       u32                             cluster_bits;
+       u64                             size;   /* in bytes */
+       u32                             crypt_method;
+
+       u32                             l1_size;
+       u64                             l1_table_offset;
+
+       u64                             refcount_table_offset;
+       u32                             refcount_table_clusters;
+
+       u32                             nb_snapshots;
+       u64                             snapshots_offset;
+};
+
+struct disk_image *qcow_probe(int fd, bool readonly);
+
+#endif /* KVM__QCOW_H */
diff --git a/tools/kvm/include/kvm/rbtree-interval.h b/tools/kvm/include/kvm/rbtree-interval.h
new file mode 100644 (file)
index 0000000..a6688c4
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef KVM__INTERVAL_RBTREE_H
+#define KVM__INTERVAL_RBTREE_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+/* Build an rb_int_node covering [l, h]; arguments are parenthesised so any expression may be passed. */
+#define RB_INT_INIT(l, h) (struct rb_int_node){.low = (l), .high = (h)}
+
+struct rb_int_node {
+       struct rb_node  node;
+       u64             low;
+       u64             high;
+
+       /* max_high will store the highest high of its 2 children. */
+       u64             max_high;
+};
+
+/* Return the rb_int_node interval in which 'point' is located. */
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point);
+
+/* Return the rb_int_node in which the range low:high is located. */
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high);
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *data);
+void rb_int_erase(struct rb_root *root, struct rb_int_node *node);
+
+#endif
diff --git a/tools/kvm/include/kvm/read-write.h b/tools/kvm/include/kvm/read-write.h
new file mode 100644 (file)
index 0000000..3351103
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef KVM_READ_WRITE_H
+#define KVM_READ_WRITE_H
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+ssize_t xread(int fd, void *buf, size_t count);
+ssize_t xwrite(int fd, const void *buf, size_t count);
+
+ssize_t read_in_full(int fd, void *buf, size_t count);
+ssize_t write_in_full(int fd, const void *buf, size_t count);
+
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt);
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt);
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+#endif /* KVM_READ_WRITE_H */
diff --git a/tools/kvm/include/kvm/rtc.h b/tools/kvm/include/kvm/rtc.h
new file mode 100644 (file)
index 0000000..0b8d9f9
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__RTC_H
+#define KVM__RTC_H
+
+void rtc__init(void);
+
+#endif /* KVM__RTC_H */
diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h
new file mode 100644 (file)
index 0000000..75a22f8
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef KVM__RWSEM_H
+#define KVM__RWSEM_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike rwsem API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER
+
+/* Take the lock for reading; any non-zero pthread return value is fatal. */
+static inline void down_read(pthread_rwlock_t *rwsem)
+{
+       int err = pthread_rwlock_rdlock(rwsem);
+
+       if (err)
+               die("unexpected pthread_rwlock_rdlock() failure!");
+}
+
+/* Take the lock for writing; any non-zero pthread return value is fatal. */
+static inline void down_write(pthread_rwlock_t *rwsem)
+{
+       int err = pthread_rwlock_wrlock(rwsem);
+
+       if (err)
+               die("unexpected pthread_rwlock_wrlock() failure!");
+}
+
+/* Drop a lock previously taken with down_read(); a failed unlock is fatal. */
+static inline void up_read(pthread_rwlock_t *rwsem)
+{
+       int err = pthread_rwlock_unlock(rwsem);
+
+       if (err)
+               die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+/* Drop a lock previously taken with down_write(); a failed unlock is fatal. */
+static inline void up_write(pthread_rwlock_t *rwsem)
+{
+       int err = pthread_rwlock_unlock(rwsem);
+
+       if (err)
+               die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+#endif /* KVM__RWSEM_H */
diff --git a/tools/kvm/include/kvm/sdl.h b/tools/kvm/include/kvm/sdl.h
new file mode 100644 (file)
index 0000000..a5aa411
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef KVM__SDL_H
+#define KVM__SDL_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_SDL
+void sdl__init(struct framebuffer *fb);
+#else
+/*
+ * Stub selected when CONFIG_HAS_SDL is not defined: it ignores 'fb' and
+ * calls die() with a message telling the user SDL support is missing.
+ */
+static inline void sdl__init(struct framebuffer *fb)
+{
+       die("SDL support not compiled in. (install the SDL-dev[el] package)");
+}
+#endif
+
+#endif /* KVM__SDL_H */
diff --git a/tools/kvm/include/kvm/segment.h b/tools/kvm/include/kvm/segment.h
new file mode 100644 (file)
index 0000000..362e46d
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef KVM_SEGMENT_H
+#define KVM_SEGMENT_H
+
+#include <linux/types.h>
+
+/* Segment part of a flat address: the address shifted right by 4, truncated to 16 bits. */
+static inline u16 flat_to_seg16(u32 address)
+{
+       u32 paragraph = address >> 4;
+
+       return paragraph;
+}
+
+/* Offset of 'address' relative to the base of 'segment' (segment << 4), truncated to 16 bits. */
+static inline u16 flat_to_off16(u32 address, u32 segment)
+{
+       u32 segment_base = segment << 4;
+
+       return address - segment_base;
+}
+
+#endif /* KVM_SEGMENT_H */
diff --git a/tools/kvm/include/kvm/strbuf.h b/tools/kvm/include/kvm/strbuf.h
new file mode 100644 (file)
index 0000000..e67ca20
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __STRBUF_H__
+#define __STRBUF_H__
+
+int prefixcmp(const char *str, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/symbol.h b/tools/kvm/include/kvm/symbol.h
new file mode 100644 (file)
index 0000000..5bc2221
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef KVM__SYMBOL_H
+#define KVM__SYMBOL_H
+
+#include <stddef.h>
+#include <string.h>
+
+struct kvm;
+
+#ifdef CONFIG_HAS_BFD
+void symbol__init(const char *vmlinux);
+char *symbol__lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size);
+#else
+static inline void symbol__init(const char *vmlinux) { }
+/*
+ * Fallback used when CONFIG_HAS_BFD is not defined: 'kvm' and 'addr' are
+ * ignored and the placeholder "<unknown>" is copied into 'sym', truncated
+ * to fit and always NUL-terminated when 'size' is non-zero.
+ *
+ * Returns 'sym'. A zero 'size' leaves the buffer untouched.
+ */
+static inline char *symbol__lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+       /* With size == 0, sym[size - 1] would wrap around and write out of bounds. */
+       if (!size)
+               return sym;
+
+       strncpy(sym, "<unknown>", size);
+       sym[size - 1] = '\0';
+       return sym;
+}
+#endif
+
+#endif /* KVM__SYMBOL_H */
diff --git a/tools/kvm/include/kvm/term.h b/tools/kvm/include/kvm/term.h
new file mode 100644 (file)
index 0000000..4d580e1
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef KVM__TERM_H
+#define KVM__TERM_H
+
+#include <stdbool.h>
+#include <sys/uio.h>
+
+#define CONSOLE_8250   1
+#define CONSOLE_VIRTIO 2
+
+int term_putc_iov(int who, struct iovec *iov, int iovcnt);
+int term_getc_iov(int who, struct iovec *iov, int iovcnt);
+int term_putc(int who, char *addr, int cnt);
+int term_getc(int who);
+
+bool term_readable(int who);
+void term_init(void);
+
+#endif /* KVM__TERM_H */
diff --git a/tools/kvm/include/kvm/threadpool.h b/tools/kvm/include/kvm/threadpool.h
new file mode 100644 (file)
index 0000000..768239f
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef KVM__THREADPOOL_H
+#define KVM__THREADPOOL_H
+
+#include "kvm/mutex.h"
+
+#include <linux/list.h>
+
+struct kvm;
+
+typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data);
+
+struct thread_pool__job {
+       kvm_thread_callback_fn_t        callback;
+       struct kvm                      *kvm;
+       void                            *data;
+
+       int                             signalcount;
+       pthread_mutex_t                 mutex;
+
+       struct list_head                queue;
+};
+
+/*
+ * Reset '*job' and bind it to 'kvm', 'callback' and 'data'. The mutex gets
+ * its static initializer; members not named here (signalcount, queue) are
+ * zero-initialised.
+ */
+static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data)
+{
+       struct thread_pool__job fresh = {
+               .callback       = callback,
+               .kvm            = kvm,
+               .data           = data,
+               .mutex          = PTHREAD_MUTEX_INITIALIZER,
+       };
+
+       *job = fresh;
+}
+
+int thread_pool__init(unsigned long thread_count);
+
+void thread_pool__do_job(struct thread_pool__job *job);
+
+#endif
diff --git a/tools/kvm/include/kvm/types.h b/tools/kvm/include/kvm/types.h
new file mode 100644 (file)
index 0000000..0cbc5fb
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef KVM_TYPES_H
+#define KVM_TYPES_H
+
+/* FIXME: include/linux/if_tun.h and include/linux/if_ether.h complain */
+#define __be16 u16
+
+#endif /* KVM_TYPES_H */
diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
new file mode 100644 (file)
index 0000000..344ec09
--- /dev/null
@@ -0,0 +1,356 @@
+#ifndef KVM__UIP_H
+#define KVM__UIP_H
+
+#include "linux/types.h"
+#include "kvm/mutex.h"
+
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#define UIP_BUF_STATUS_FREE    0
+#define UIP_BUF_STATUS_INUSE   1
+#define UIP_BUF_STATUS_USED    2
+
+#define UIP_ETH_P_IP           0X0800
+#define UIP_ETH_P_ARP          0X0806
+
+#define UIP_IP_VER_4           0X40
+#define UIP_IP_HDR_LEN         0X05
+#define UIP_IP_TTL             0X40
+#define UIP_IP_P_UDP           0X11
+#define UIP_IP_P_TCP           0X06
+
+#define UIP_TCP_HDR_LEN                0x50
+#define UIP_TCP_WIN_SIZE       14600
+#define UIP_TCP_FLAG_FIN       1
+#define UIP_TCP_FLAG_SYN       2
+#define UIP_TCP_FLAG_RST       4
+#define UIP_TCP_FLAG_PSH       8
+#define UIP_TCP_FLAG_ACK       16
+#define UIP_TCP_FLAG_URG       32
+
+#define UIP_BOOTP_VENDOR_SPECIFIC_LEN  64
+#define UIP_BOOTP_MAX_PAYLOAD_LEN      300
+#define UIP_DHCP_VENDOR_SPECIFIC_LEN   312
+#define UIP_DHCP_PORT_SERVER           67
+#define UIP_DHCP_PORT_CLIENT           68
+#define UIP_DHCP_MACPAD_LEN            10
+#define UIP_DHCP_HOSTNAME_LEN          64
+#define UIP_DHCP_FILENAME_LEN          128
+#define UIP_DHCP_MAGIC_COOKIE          0x63825363
+#define UIP_DHCP_MAGIC_COOKIE_LEN      4
+#define UIP_DHCP_LEASE_TIME            0x00003840
+#define UIP_DHCP_MAX_PAYLOAD_LEN       (UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN +  UIP_DHCP_VENDOR_SPECIFIC_LEN)
+#define UIP_DHCP_OPTION_LEN            (UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN)
+#define UIP_DHCP_DISCOVER              1
+#define UIP_DHCP_OFFER                 2
+#define UIP_DHCP_REQUEST               3
+#define UIP_DHCP_ACK                   5
+#define UIP_DHCP_MAX_DNS_SERVER_NR     3
+#define UIP_DHCP_MAX_DOMAIN_NAME_LEN   256
+#define UIP_DHCP_TAG_MSG_TYPE          53
+#define UIP_DHCP_TAG_MSG_TYPE_LEN      1
+#define UIP_DHCP_TAG_SERVER_ID         54
+#define UIP_DHCP_TAG_SERVER_ID_LEN     4
+#define UIP_DHCP_TAG_LEASE_TIME                51
+#define UIP_DHCP_TAG_LEASE_TIME_LEN    4
+#define UIP_DHCP_TAG_SUBMASK           1
+#define UIP_DHCP_TAG_SUBMASK_LEN       4
+#define UIP_DHCP_TAG_ROUTER            3
+#define UIP_DHCP_TAG_ROUTER_LEN                4
+#define UIP_DHCP_TAG_DNS_SERVER                6
+#define UIP_DHCP_TAG_DNS_SERVER_LEN    4
+#define UIP_DHCP_TAG_DOMAIN_NAME       15
+#define UIP_DHCP_TAG_END               255
+
+/*
+ * IP packet maximum len == 64 KBytes
+ * IP header == 20 Bytes
+ * TCP header == 20 Bytes
+ * UDP header == 8 Bytes
+ */
+#define UIP_MAX_TCP_PAYLOAD    (64*1024 - 20 - 20 - 1)
+#define UIP_MAX_UDP_PAYLOAD    (64*1024 - 20 -  8 - 1)
+
+struct uip_eth_addr {
+       u8 addr[6];
+};
+
+struct uip_eth {
+       struct uip_eth_addr dst;
+       struct uip_eth_addr src;
+       u16 type;
+} __attribute__((packed));
+
+struct uip_arp {
+       struct uip_eth eth;
+       u16 hwtype;
+       u16 proto;
+       u8 hwlen;
+       u8 protolen;
+       u16 op;
+       struct uip_eth_addr smac;
+       u32 sip;
+       struct uip_eth_addr dmac;
+       u32 dip;
+} __attribute__((packed));
+
+struct uip_ip {
+       struct uip_eth eth;
+       u8 vhl;
+       u8 tos;
+       /*
+        * len = IP hdr +  IP payload
+        */
+       u16 len;
+       u16 id;
+       u16 flgfrag;
+       u8 ttl;
+       u8 proto;
+       u16 csum;
+       u32 sip;
+       u32 dip;
+} __attribute__((packed));
+
+struct uip_icmp {
+       struct uip_ip ip;
+       u8 type;
+       u8 code;
+       u16 csum;
+       u16 id;
+       u16 seq;
+} __attribute__((packed));
+
+struct uip_udp {
+       /*
+        * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+        */
+       struct uip_ip ip;
+       u16 sport;
+       u16 dport;
+       /*
+        * len = UDP hdr +  UDP payload
+        */
+       u16 len;
+       u16 csum;
+       u8 payload[0];
+} __attribute__((packed));
+
+struct uip_tcp {
+       /*
+        * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+        */
+       struct uip_ip ip;
+       u16 sport;
+       u16 dport;
+       u32 seq;
+       u32 ack;
+       u8  off;
+       u8  flg;
+       u16 win;
+       u16 csum;
+       u16 urgent;
+} __attribute__((packed));
+
+struct uip_pseudo_hdr {
+       u32 sip;
+       u32 dip;
+       u8 zero;
+       u8 proto;
+       u16 len;
+} __attribute__((packed));
+
+struct uip_dhcp {
+       struct uip_udp udp;
+       u8 msg_type;
+       u8 hardware_type;
+       u8 hardware_len;
+       u8 hops;
+       u32 id;
+       u16 time;
+       u16 flg;
+       u32 client_ip;
+       u32 your_ip;
+       u32 server_ip;
+       u32 agent_ip;
+       struct uip_eth_addr client_mac;
+       u8 pad[UIP_DHCP_MACPAD_LEN];
+       u8 server_hostname[UIP_DHCP_HOSTNAME_LEN];
+       u8 boot_filename[UIP_DHCP_FILENAME_LEN];
+       u32 magic_cookie;
+       u8 option[UIP_DHCP_OPTION_LEN];
+} __attribute__((packed));
+
+struct uip_info {
+       struct list_head udp_socket_head;
+       struct list_head tcp_socket_head;
+       pthread_mutex_t udp_socket_lock;
+       pthread_mutex_t tcp_socket_lock;
+       struct uip_eth_addr guest_mac;
+       struct uip_eth_addr host_mac;
+       pthread_cond_t buf_free_cond;
+       pthread_cond_t buf_used_cond;
+       struct list_head buf_head;
+       pthread_mutex_t buf_lock;
+       pthread_t udp_thread;
+       int udp_epollfd;
+       int buf_free_nr;
+       int buf_used_nr;
+       u32 guest_ip;
+       u32 guest_netmask;
+       u32 host_ip;
+       u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR];
+       char *domain_name;
+       u32 buf_nr;
+};
+
+struct uip_buf {
+       struct list_head list;
+       struct uip_info *info;
+       u32 payload;
+       int vnet_len;
+       int eth_len;
+       int status;
+       char *vnet;
+       char *eth;
+       int id;
+};
+
+struct uip_udp_socket {
+       struct sockaddr_in addr;
+       struct list_head list;
+       pthread_mutex_t *lock;
+       u32 dport, sport;
+       u32 dip, sip;
+       int fd;
+};
+
+struct uip_tcp_socket {
+       struct sockaddr_in addr;
+       struct list_head list;
+       struct uip_info *info;
+       pthread_mutex_t *lock;
+       pthread_t thread;
+       u32 dport, sport;
+       u32 guest_acked;
+       /*
+        * Initial Sequence Number
+        */
+       u32 isn_server;
+       u32 isn_guest;
+       u32 ack_server;
+       u32 seq_server;
+       int write_done;
+       int read_done;
+       u32 dip, sip;
+       u8 *payload;
+       int fd;
+};
+
+struct uip_tx_arg {
+       struct virtio_net_hdr *vnet;
+       struct uip_info *info;
+       struct uip_eth *eth;
+       int vnet_len;
+       int eth_len;
+};
+
+/* IP header length in bytes: the low nibble of 'vhl' multiplied by 4. */
+static inline u16 uip_ip_hdrlen(struct uip_ip *ip)
+{
+       u8 hdr_words = ip->vhl & 0x0f;
+
+       return hdr_words * 4;
+}
+
+/*
+ * Value of the IP 'len' field (IP hdr + IP payload) converted to host byte
+ * order.
+ */
+static inline u16 uip_ip_len(struct uip_ip *ip)
+{
+       /*
+        * Reading a wire field is a network-to-host conversion, so use
+        * ntohs() as uip_udp_len() does, rather than htons().
+        */
+       return ntohs(ip->len);
+}
+
+/* UDP header length in bytes; always 8, the 'udp' argument is not inspected. */
+static inline u16 uip_udp_hdrlen(struct uip_udp *udp)
+{
+       return 8;
+}
+
+/* Value of the UDP 'len' field (UDP hdr + UDP payload) after ntohs() conversion. */
+static inline u16 uip_udp_len(struct uip_udp *udp)
+{
+       return ntohs(udp->len);
+}
+
+/* TCP header length in bytes: the high nibble of 'off' multiplied by 4. */
+static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp)
+{
+       u8 hdr_words = tcp->off >> 4;
+
+       return hdr_words * 4;
+}
+
+/* Length of the TCP segment (header + payload): IP total length minus IP header length. */
+static inline u16 uip_tcp_len(struct uip_tcp *tcp)
+{
+       return uip_ip_len(&tcp->ip) - uip_ip_hdrlen(&tcp->ip);
+}
+
+/* TCP payload length: segment length minus TCP header length (not range-checked). */
+static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp)
+{
+       u16 segment_len = uip_tcp_len(tcp);
+       u16 header_len  = uip_tcp_hdrlen(tcp);
+
+       return segment_len - header_len;
+}
+
+/* Pointer to the first payload byte: the TCP header starts at 'sport', payload follows it. */
+static inline u8 *uip_tcp_payload(struct uip_tcp *tcp)
+{
+       u8 *tcp_hdr_start = (u8 *)&tcp->sport;
+
+       return tcp_hdr_start + uip_tcp_hdrlen(tcp);
+}
+
+/* True when the SYN bit is set in the TCP flags byte. */
+static inline bool uip_tcp_is_syn(struct uip_tcp *tcp)
+{
+       return !!(tcp->flg & UIP_TCP_FLAG_SYN);
+}
+
+/* True when the FIN bit is set in the TCP flags byte. */
+static inline bool uip_tcp_is_fin(struct uip_tcp *tcp)
+{
+       return !!(tcp->flg & UIP_TCP_FLAG_FIN);
+}
+
+/* Sequence number field of 'tcp' after ntohl() conversion. */
+static inline u32 uip_tcp_isn(struct uip_tcp *tcp)
+{
+       return ntohl(tcp->seq);
+}
+
+/* Currently returns the same constant on every call (see FIXME below). */
+static inline u32 uip_tcp_isn_alloc(void)
+{
+       /*
+        * FIXME: should increase every 4ms
+        */
+       return 10000000;
+}
+
+/* Ethernet header length in bytes, i.e. the size of the packed struct uip_eth. */
+static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
+{
+       return sizeof(struct uip_eth);
+}
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
+int uip_init(struct uip_info *info);
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4(struct uip_tx_arg *arg);
+int uip_tx_do_arp(struct uip_tx_arg *arg);
+
+u16 uip_csum_icmp(struct uip_icmp *icmp);
+u16 uip_csum_udp(struct uip_udp *udp);
+u16 uip_csum_tcp(struct uip_tcp *tcp);
+u16 uip_csum_ip(struct uip_ip *ip);
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_get_used(struct uip_info *info);
+struct uip_buf *uip_buf_get_free(struct uip_info *info);
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg);
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len);
+bool uip_udp_is_dhcp(struct uip_udp *udp);
+
+int uip_dhcp_get_dns(struct uip_info *info);
+#endif /* KVM__UIP_H */
diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
new file mode 100644 (file)
index 0000000..dc2e0b9
--- /dev/null
@@ -0,0 +1,78 @@
+#include <linux/stringify.h>
+
+#ifndef KVM__UTIL_H
+#define KVM__UTIL_H
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Some bits are stolen from perf tool :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+extern bool do_debug_print;
+
+#define PROT_RW (PROT_READ|PROT_WRITE)
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern void die_perror(const char *s) NORETURN;
+extern int pr_error(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+#define pr_debug(fmt, ...)                                             \
+       do {                                                            \
+               if (do_debug_print)                                     \
+                       pr_info("(%s) %s:%d: " fmt, __FILE__,           \
+                               __func__, __LINE__, ##__VA_ARGS__);     \
+       } while (0)
+
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+/*
+ * Call die() when 'cnd' is true, reporting the source location and the
+ * stringified condition. The file name and condition text are passed as
+ * arguments rather than pasted into the format string, so a '%' in either
+ * (e.g. DIE_IF(a % b)) is printed literally instead of being parsed as a
+ * conversion specifier.
+ */
+#define DIE_IF(cnd)                                            \
+do {                                                           \
+       if (cnd)                                                \
+               die(" at (%s:%d): %s\n", __FILE__, __LINE__,    \
+                       __stringify(cnd));                      \
+} while (0)
+
+extern size_t strlcat(char *dest, const char *src, size_t count);
+
+/* some inline functions */
+
+/*
+ * If 'str' begins with 'prefix', return a pointer to the first character
+ * after the prefix; otherwise return NULL.
+ */
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+       size_t prefix_len = strlen(prefix);
+
+       if (strncmp(str, prefix, prefix_len) != 0)
+               return NULL;
+
+       return str + prefix_len;
+}
+
+#define MSECS_TO_USECS(s) ((s) * 1000)
+
+/* Sleep for 'msecs' milliseconds by passing the microsecond equivalent to usleep(). */
+static inline void msleep(unsigned int msecs)
+{
+       unsigned int usecs = MSECS_TO_USECS(msecs);
+
+       usleep(usecs);
+}
+#endif /* KVM__UTIL_H */
diff --git a/tools/kvm/include/kvm/vesa.h b/tools/kvm/include/kvm/vesa.h
new file mode 100644 (file)
index 0000000..ac041d9
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef KVM__VESA_H
+#define KVM__VESA_H
+
+#include <linux/types.h>
+
+#define VESA_WIDTH     640
+#define VESA_HEIGHT    480
+
+#define VESA_MEM_ADDR  0xd0000000
+#define VESA_MEM_SIZE  (4*VESA_WIDTH*VESA_HEIGHT)
+#define VESA_BPP       32
+
+struct kvm;
+struct biosregs;
+
+struct framebuffer *vesa__init(struct kvm *self);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-9p.h b/tools/kvm/include/kvm/virtio-9p.h
new file mode 100644 (file)
index 0000000..07084c3
--- /dev/null
@@ -0,0 +1,75 @@
+#ifndef KVM__VIRTIO_9P_H
+#define KVM__VIRTIO_9P_H
+#include "kvm/virtio.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/virtio-pci.h"
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <linux/list.h>
+
+#define NUM_VIRT_QUEUES                1
+#define VIRTQUEUE_NUM          128
+#define        VIRTIO_P9_DEFAULT_TAG   "kvm_9p"
+#define VIRTIO_P9_HDR_LEN      (sizeof(u32)+sizeof(u8)+sizeof(u16))
+#define VIRTIO_P9_MAX_FID      256
+#define VIRTIO_P9_VERSION_DOTL "9P2000.L"
+#define MAX_TAG_LEN            32
+
+struct p9_msg {
+       u32                     size;
+       u8                      cmd;
+       u16                     tag;
+       u8                      msg[0];
+} __attribute__((packed));
+
+struct p9_fid {
+       u32                     fid;
+       u32                     uid;
+       u8                      is_dir;
+       char                    abs_path[PATH_MAX];
+       char                    *path;
+       DIR                     *dir;
+       int                     fd;
+};
+
+struct p9_dev_job {
+       struct virt_queue       *vq;
+       struct p9_dev           *p9dev;
+       struct thread_pool__job job_id;
+};
+
+struct p9_dev {
+       struct list_head        list;
+       struct virtio_pci       vpci;
+
+       struct virtio_9p_config *config;
+       int                     compat_id;
+       u32                     features;
+
+       /* virtio queue */
+       struct virt_queue       vqs[NUM_VIRT_QUEUES];
+       struct p9_dev_job       jobs[NUM_VIRT_QUEUES];
+       struct p9_fid           fids[VIRTIO_P9_MAX_FID];
+       char                    root_dir[PATH_MAX];
+};
+
+struct p9_pdu {
+       u32                     queue_head;
+       size_t                  read_offset;
+       size_t                  write_offset;
+       u16                     out_iov_cnt;
+       u16                     in_iov_cnt;
+       struct iovec            in_iov[VIRTQUEUE_NUM];
+       struct iovec            out_iov[VIRTQUEUE_NUM];
+};
+
+struct kvm;
+
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name);
+int virtio_9p__init(struct kvm *kvm);
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...);
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-balloon.h b/tools/kvm/include/kvm/virtio-balloon.h
new file mode 100644 (file)
index 0000000..eb49fd4
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef KVM__BLN_VIRTIO_H
+#define KVM__BLN_VIRTIO_H
+
+struct kvm;
+
+void virtio_bln__init(struct kvm *kvm);
+
+#endif /* KVM__BLN_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-blk.h b/tools/kvm/include/kvm/virtio-blk.h
new file mode 100644 (file)
index 0000000..8c4fb91
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef KVM__BLK_VIRTIO_H
+#define KVM__BLK_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+void virtio_blk__init(struct kvm *kvm, struct disk_image *disk);
+void virtio_blk__init_all(struct kvm *kvm);
+void virtio_blk__delete_all(struct kvm *kvm);
+
+#endif /* KVM__BLK_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-console.h b/tools/kvm/include/kvm/virtio-console.h
new file mode 100644 (file)
index 0000000..50d8653
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__CONSOLE_VIRTIO_H
+#define KVM__CONSOLE_VIRTIO_H
+
+struct kvm;
+
+void virtio_console__init(struct kvm *kvm);
+void virtio_console__inject_interrupt(struct kvm *kvm);
+
+#endif /* KVM__CONSOLE_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-net.h b/tools/kvm/include/kvm/virtio-net.h
new file mode 100644 (file)
index 0000000..c30deb8
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef KVM__VIRTIO_NET_H
+#define KVM__VIRTIO_NET_H
+
+struct kvm;
+
+struct virtio_net_parameters {
+       const char *guest_ip;
+       const char *host_ip;
+       const char *script;
+       char guest_mac[6];
+       char host_mac[6];
+       struct kvm *kvm;
+       int mode;
+};
+
+void virtio_net__init(const struct virtio_net_parameters *params);
+
+#define NET_MODE_USER  0
+#define NET_MODE_TAP   1
+
+#endif /* KVM__VIRTIO_NET_H */
diff --git a/tools/kvm/include/kvm/virtio-pci-dev.h b/tools/kvm/include/kvm/virtio-pci-dev.h
new file mode 100644 (file)
index 0000000..bfcb076
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef VIRTIO_PCI_DEV_H_
+#define VIRTIO_PCI_DEV_H_
+
+#include <linux/virtio_ids.h>
+
+/*
+ * Virtio PCI device constants and resources
+ * they do use (such as irqs and pins).
+ */
+
+#define PCI_DEVICE_ID_VIRTIO_NET               0x1000
+#define PCI_DEVICE_ID_VIRTIO_BLK               0x1001
+#define PCI_DEVICE_ID_VIRTIO_CONSOLE           0x1003
+#define PCI_DEVICE_ID_VIRTIO_RNG               0x1004
+#define PCI_DEVICE_ID_VIRTIO_BLN               0x1005
+#define PCI_DEVICE_ID_VIRTIO_P9                        0x1009
+#define PCI_DEVICE_ID_VESA                     0x2000
+#define PCI_DEVICE_ID_PCI_SHMEM                        0x0001
+
+#define PCI_VENDOR_ID_REDHAT_QUMRANET          0x1af4
+#define PCI_VENDOR_ID_PCI_SHMEM                        0x0001
+#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET        0x1af4
+
+#define PCI_SUBSYSTEM_ID_VESA                  0x0004
+#define PCI_SUBSYSTEM_ID_PCI_SHMEM             0x0001
+
+#endif /* VIRTIO_PCI_DEV_H_ */
diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h
new file mode 100644 (file)
index 0000000..ce44e84
--- /dev/null
@@ -0,0 +1,59 @@
+#ifndef KVM__VIRTIO_PCI_H
+#define KVM__VIRTIO_PCI_H
+
+#include "kvm/pci.h"
+
+#include <linux/types.h>
+
+#define VIRTIO_PCI_MAX_VQ 3
+
+struct kvm;
+
+struct virtio_pci_ops {
+       void (*set_config)(struct kvm *kvm, void *dev, u8 data, u32 offset);
+       u8 (*get_config)(struct kvm *kvm, void *dev, u32 offset);
+
+       u32 (*get_host_features)(struct kvm *kvm, void *dev);
+       void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features);
+
+       int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 pfn);
+       int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq);
+       int (*get_pfn_vq)(struct kvm *kvm, void *dev, u32 vq);
+       int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq);
+};
+
+struct virtio_pci_ioevent_param {
+       struct virtio_pci       *vpci;
+       u32                     vq;
+};
+
+struct virtio_pci {
+       struct pci_device_header pci_hdr;
+       struct virtio_pci_ops   ops;
+       void                    *dev;
+
+       u16                     base_addr;
+       u8                      status;
+       u8                      isr;
+
+       /* MSI-X */
+       u16                     config_vector;
+       u32                     config_gsi;
+       u32                     vq_vector[VIRTIO_PCI_MAX_VQ];
+       u32                     gsis[VIRTIO_PCI_MAX_VQ];
+       u32                     msix_io_block;
+       u32                     msix_pba_block;
+       u64                     msix_pba;
+       struct msix_table       msix_table[VIRTIO_PCI_MAX_VQ + 1];
+
+       /* virtio queue */
+       u16                     queue_selector;
+       struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ];
+};
+
+int virtio_pci__init(struct kvm *kvm, struct virtio_pci *vpci, void *dev,
+                       int device_id, int subsys_id);
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_pci *vpci, u32 vq);
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_pci *vpci);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-rng.h b/tools/kvm/include/kvm/virtio-rng.h
new file mode 100644 (file)
index 0000000..c0a413b
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef KVM__RNG_VIRTIO_H
+#define KVM__RNG_VIRTIO_H
+
+struct kvm;
+
+void virtio_rng__init(struct kvm *kvm);
+void virtio_rng__delete_all(struct kvm *kvm);
+
+#endif /* KVM__RNG_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h
new file mode 100644 (file)
index 0000000..3442338
--- /dev/null
@@ -0,0 +1,65 @@
+#ifndef KVM__VIRTIO_H
+#define KVM__VIRTIO_H
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/types.h>
+#include <sys/uio.h>
+
+#include "kvm/kvm.h"
+
+#define VIRTIO_IRQ_LOW         0
+#define VIRTIO_IRQ_HIGH                1
+
+#define VIRTIO_PCI_O_CONFIG    0
+#define VIRTIO_PCI_O_MSIX      1
+#define VIRTIO_PCI_O_FEATURES  2
+
+struct virt_queue {
+       struct vring    vring;
+       u32             pfn;
+       /* The last_avail_idx field is an index to ->ring of struct vring_avail.
+          It's where we assume the next request index is at.  */
+       u16             last_avail_idx;
+};
+
+static inline u16 virt_queue__pop(struct virt_queue *queue)
+{
+       return queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num];
+}
+
+static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx)
+{
+       return &queue->vring.desc[desc_ndx];
+}
+
+static inline bool virt_queue__available(struct virt_queue *vq)
+{
+       if (!vq->vring.avail)
+               return 0;
+       return vq->vring.avail->idx !=  vq->last_avail_idx;
+}
+
+/*
+ * Warning: on 32-bit hosts, shifting pfn left may cause a truncation of pfn values
+ * higher than 4GB - thus, pointing to the wrong area in guest virtual memory space
+ * and breaking the virt queue which owns this pfn.
+ */
+static inline void *guest_pfn_to_host(struct kvm *kvm, u32 pfn)
+{
+       return guest_flat_to_host(kvm, (unsigned long)pfn << VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+}
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len);
+
+u16 virt_queue__get_iov(struct virt_queue *queue, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm);
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+                             struct iovec in_iov[], struct iovec out_iov[],
+                             u16 *in, u16 *out);
+
+void virt_queue__trigger_irq(struct virt_queue *vq, int irq, u8 *isr, struct kvm *kvm);
+
+int virtio__get_dev_specific_field(int offset, bool msix, bool features_hi, u32 *config_off);
+
+#endif /* KVM__VIRTIO_H */
diff --git a/tools/kvm/include/kvm/vnc.h b/tools/kvm/include/kvm/vnc.h
new file mode 100644 (file)
index 0000000..da2f635
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef KVM__VNC_H
+#define KVM__VNC_H
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_VNCSERVER
+void vnc__init(struct framebuffer *fb);
+#else
+static inline void vnc__init(struct framebuffer *fb)
+{
+}
+#endif
+
+#endif /* KVM__VNC_H */
diff --git a/tools/kvm/include/linux/bitops.h b/tools/kvm/include/linux/bitops.h
new file mode 100644 (file)
index 0000000..56448b7
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef _KVM_LINUX_BITOPS_H_
+#define _KVM_LINUX_BITOPS_H_
+
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <asm/hweight.h>
+
+#define BITS_PER_LONG __WORDSIZE
+#define BITS_PER_BYTE           8
+#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+       addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+       addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
+static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+       return ((1UL << (nr % BITS_PER_LONG)) &
+               (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+       return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+}
+
+#endif
diff --git a/tools/kvm/include/linux/byteorder.h b/tools/kvm/include/linux/byteorder.h
new file mode 100644 (file)
index 0000000..c490de8
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef __BYTE_ORDER_H__
+#define __BYTE_ORDER_H__
+
+#include <asm/byteorder.h>
+#include <linux/byteorder/generic.h>
+
+#endif
diff --git a/tools/kvm/include/linux/kernel.h b/tools/kvm/include/linux/kernel.h
new file mode 100644 (file)
index 0000000..d2ec4a3
--- /dev/null
@@ -0,0 +1,39 @@
+
+#ifndef KVM__LINUX_KERNEL_H_
+#define KVM__LINUX_KERNEL_H_
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x,a)             __ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)   (((x)+(mask))&~(mask))
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:       the pointer to the member.
+ * @type:      the type of the container struct this is embedded in.
+ * @member:    the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({                     \
+       const typeof(((type *)0)->member) * __mptr = (ptr);     \
+       (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
+       _min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({                           \
+       typeof(x) _max1 = (x);                  \
+       typeof(y) _max2 = (y);                  \
+       (void) (&_max1 == &_max2);              \
+       _max1 > _max2 ? _max1 : _max2; })
+
+#endif
diff --git a/tools/kvm/include/linux/module.h b/tools/kvm/include/linux/module.h
new file mode 100644 (file)
index 0000000..0e4c6a3
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_MODULE_H
+#define KVM__LINUX_MODULE_H
+
+#define EXPORT_SYMBOL(name)
+
+#endif
diff --git a/tools/kvm/include/linux/prefetch.h b/tools/kvm/include/linux/prefetch.h
new file mode 100644 (file)
index 0000000..62f6788
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_PREFETCH_H
+#define KVM__LINUX_PREFETCH_H
+
+static inline void prefetch(void *a __attribute__((unused))) { }
+
+#endif
diff --git a/tools/kvm/include/linux/types.h b/tools/kvm/include/linux/types.h
new file mode 100644 (file)
index 0000000..357799c
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef LINUX_TYPES_H
+#define LINUX_TYPES_H
+
+#include <kvm/compiler.h>
+#include <asm/types.h>
+
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8  u8;
+typedef __s8  s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+struct hlist_head {
+       struct hlist_node *first;
+};
+
+struct hlist_node {
+       struct hlist_node *next, **pprev;
+};
+
+#endif /* LINUX_TYPES_H */
diff --git a/tools/kvm/interrupt.c b/tools/kvm/interrupt.c
new file mode 100644 (file)
index 0000000..e4636c4
--- /dev/null
@@ -0,0 +1,27 @@
+#include "kvm/interrupt.h"
+
+#include "kvm/util.h"
+
+#include <string.h>
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size)
+{
+       if (size < sizeof(itable->entries))
+               die("An attempt to overwrite host memory");
+
+       memcpy(dst, itable->entries, sizeof(itable->entries));
+}
+
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry)
+{
+       unsigned int i;
+
+       for (i = 0; i < REAL_INTR_VECTORS; i++)
+               itable->entries[i] = *entry;
+}
+
+void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num)
+{
+       if (num < REAL_INTR_VECTORS)
+               itable->entries[num] = *entry;
+}
diff --git a/tools/kvm/ioeventfd.c b/tools/kvm/ioeventfd.c
new file mode 100644 (file)
index 0000000..3a240e4
--- /dev/null
@@ -0,0 +1,128 @@
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+
+#include "kvm/ioeventfd.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#define IOEVENTFD_MAX_EVENTS   20
+
+static struct  epoll_event events[IOEVENTFD_MAX_EVENTS];
+static int     epoll_fd;
+static LIST_HEAD(used_ioevents);
+
+void ioeventfd__init(void)
+{
+       epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS);
+       if (epoll_fd < 0)
+               die("Failed creating epoll fd");
+}
+
+void ioeventfd__add_event(struct ioevent *ioevent)
+{
+       struct kvm_ioeventfd kvm_ioevent;
+       struct epoll_event epoll_event;
+       struct ioevent *new_ioevent;
+       int event;
+
+       new_ioevent = malloc(sizeof(*new_ioevent));
+       if (new_ioevent == NULL)
+               die("Failed allocating memory for new ioevent");
+
+       *new_ioevent = *ioevent;
+       event = new_ioevent->fd;
+
+       kvm_ioevent = (struct kvm_ioeventfd) {
+               .addr                   = ioevent->io_addr,
+               .len                    = ioevent->io_len,
+               .datamatch              = ioevent->datamatch,
+               .fd                     = event,
+               .flags                  = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH,
+       };
+
+       if (ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent) != 0)
+               die("Failed creating new ioeventfd");
+
+       epoll_event = (struct epoll_event) {
+               .events                 = EPOLLIN,
+               .data.ptr               = new_ioevent,
+       };
+
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event) != 0)
+               die("Failed assigning new event to the epoll fd");
+
+       list_add_tail(&new_ioevent->list, &used_ioevents);
+}
+
+/* Unregister the ioeventfd matching both 'addr' and 'datamatch'; no-op if absent. */
+void ioeventfd__del_event(u64 addr, u64 datamatch)
+{
+       struct kvm_ioeventfd kvm_ioevent;
+       struct ioevent *ioevent;
+       u8 found = 0;
+
+       list_for_each_entry(ioevent, &used_ioevents, list) {
+               if (ioevent->io_addr == addr && ioevent->datamatch == datamatch) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found == 0)
+               return;
+
+       kvm_ioevent = (struct kvm_ioeventfd) {
+               .addr                   = ioevent->io_addr,
+               .len                    = ioevent->io_len,
+               .datamatch              = ioevent->datamatch,
+               .fd                     = ioevent->fd,
+               .flags                  = KVM_IOEVENTFD_FLAG_PIO
+                                       | KVM_IOEVENTFD_FLAG_DEASSIGN
+                                       | KVM_IOEVENTFD_FLAG_DATAMATCH,
+       };
+
+       ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+       epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL);
+
+       list_del(&ioevent->list);
+       close(ioevent->fd);
+       free(ioevent);
+}
+
+static void *ioeventfd__thread(void *param)
+{
+       for (;;) {
+               int nfds, i;
+
+               nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1);
+               for (i = 0; i < nfds; i++) {
+                       u64 tmp;
+                       struct ioevent *ioevent;
+
+                       ioevent = events[i].data.ptr;
+
+                       if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0)
+                               die("Failed reading event");
+
+                       ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr);
+               }
+       }
+
+       return NULL;
+}
+
+void ioeventfd__start(void)
+{
+       pthread_t thread;
+
+       if (pthread_create(&thread, NULL, ioeventfd__thread, NULL) != 0)
+               die("Failed starting ioeventfd thread");
+}
diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c
new file mode 100644 (file)
index 0000000..7cbc44e
--- /dev/null
@@ -0,0 +1,192 @@
+#include "kvm/ioport.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "kvm/brlock.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/mutex.h"
+
+#include <linux/kvm.h> /* for KVM_EXIT_* */
+#include <linux/types.h>
+
+#include <stdbool.h>
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define ioport_node(n) rb_entry(n, struct ioport, node)
+
+DEFINE_MUTEX(ioport_mutex);
+
+static u16                     free_io_port_idx; /* protected by ioport_mutex */
+
+static struct rb_root          ioport_tree = RB_ROOT;
+bool                           ioport_debug;
+
+static u16 ioport__find_free_port(void)
+{
+       u16 free_port;
+
+       mutex_lock(&ioport_mutex);
+       free_port = IOPORT_START + free_io_port_idx * IOPORT_SIZE;
+       free_io_port_idx++;
+       mutex_unlock(&ioport_mutex);
+
+       return free_port;
+}
+
+static struct ioport *ioport_search(struct rb_root *root, u64 addr)
+{
+       struct rb_int_node *node;
+
+       node = rb_int_search_single(root, addr);
+       if (node == NULL)
+               return NULL;
+
+       return ioport_node(node);
+}
+
+static int ioport_insert(struct rb_root *root, struct ioport *data)
+{
+       return rb_int_insert(root, &data->node);
+}
+
+static bool debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       exit(EXIT_SUCCESS);
+}
+
+static struct ioport_operations debug_ops = {
+       .io_out         = debug_io_out,
+};
+
+static bool dummy_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static bool dummy_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       return true;
+}
+
+static struct ioport_operations dummy_read_write_ioport_ops = {
+       .io_in          = dummy_io_in,
+       .io_out         = dummy_io_out,
+};
+
+static struct ioport_operations dummy_write_only_ioport_ops = {
+       .io_out         = dummy_io_out,
+};
+
+/* Register 'ops' for 'count' ports starting at 'port' (IOPORT_EMPTY picks one); returns the port. */
+u16 ioport__register(u16 port, struct ioport_operations *ops, int count, void *param)
+{
+       struct ioport *entry;
+
+       br_write_lock();
+       if (port == IOPORT_EMPTY)
+               port = ioport__find_free_port();
+
+       entry = ioport_search(&ioport_tree, port);
+       if (entry) {
+               pr_warning("ioport re-registered: %x", port);
+               rb_int_erase(&ioport_tree, &entry->node);
+               free(entry);    /* tree entries are only ever malloc()ed below */
+       }
+
+       entry = malloc(sizeof(*entry));
+       if (entry == NULL)
+               die("Failed allocating new ioport entry");
+
+       *entry = (struct ioport) {
+               .node   = RB_INT_INIT(port, port + count),
+               .ops    = ops,
+               .priv   = param,
+       };
+       ioport_insert(&ioport_tree, entry);
+       br_write_unlock();
+
+       return port;
+}
+
+static const char *to_direction(int direction)
+{
+       if (direction == KVM_EXIT_IO_IN)
+               return "IN";
+       else
+               return "OUT";
+}
+
+static void ioport_error(u16 port, void *data, int direction, int size, u32 count)
+{
+       fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count);
+}
+
+/* Feed 'count' accesses of 'size' bytes on 'port' to its registered handler.
+ * Returns false only when the access fails while ioport_debug is set. */
+bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count)
+{
+       struct ioport_operations *ops;
+       bool ret = false;
+       struct ioport *entry;
+       void *ptr = data;
+       u32 left = count;       /* iterate on a copy so errors report the real count */
+
+       br_read_lock();
+       entry = ioport_search(&ioport_tree, port);
+       if (!entry)
+               goto error;
+
+       ops     = entry->ops;
+       while (left--) {
+               if (direction == KVM_EXIT_IO_IN) {
+                       if (ops->io_in)
+                               ret = ops->io_in(entry, kvm, port, ptr, size);
+               } else {
+                       if (ops->io_out)
+                               ret = ops->io_out(entry, kvm, port, ptr, size);
+               }
+
+               ptr += size;
+       }
+
+       if (ret) {
+               br_read_unlock();
+               return true;
+       }
+error:
+       /* Single unlock shared by the lookup-miss and handler-failure paths. */
+       br_read_unlock();
+
+       if (ioport_debug)
+               ioport_error(port, data, direction, size, count);
+
+       return !ioport_debug;
+}
+
+void ioport__setup_legacy(void)
+{
+       /* 0x0020 - 0x003F - 8259A PIC 1 */
+       ioport__register(0x0020, &dummy_read_write_ioport_ops, 2, NULL);
+
+       /* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */
+       ioport__register(0x0040, &dummy_read_write_ioport_ops, 4, NULL);
+
+       /* 0x00A0 - 0x00AF - 8259A PIC 2 */
+       ioport__register(0x00A0, &dummy_read_write_ioport_ops, 2, NULL);
+
+       /* PORT 00E0-00EF are 'motherboard specific' so we use them for our
+          internal debugging purposes.  */
+       ioport__register(IOPORT_DBG, &debug_ops, 1, NULL);
+
+       /* PORT 00ED - DUMMY PORT FOR DELAY??? */
+       ioport__register(0x00ED, &dummy_write_only_ioport_ops, 1, NULL);
+
+       /* 0x00F0 - 0x00FF - Math co-processor */
+       ioport__register(0x00F0, &dummy_write_only_ioport_ops, 2, NULL);
+
+       /* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */
+       ioport__register(0x03D4, &dummy_read_write_ioport_ops, 1, NULL);
+       ioport__register(0x03D5, &dummy_write_only_ioport_ops, 1, NULL);
+}
diff --git a/tools/kvm/irq.c b/tools/kvm/irq.c
new file mode 100644 (file)
index 0000000..e35bf18
--- /dev/null
@@ -0,0 +1,193 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#define IRQ_MAX_GSI                    64
+#define IRQCHIP_MASTER                 0
+#define IRQCHIP_SLAVE                  1
+#define IRQCHIP_IOAPIC                 2
+
+static u8              next_line       = 3;
+static u8              next_dev        = 1;
+static struct rb_root  pci_tree        = RB_ROOT;
+
+/* First 24 GSIs are routed between IRQCHIPs and IOAPICs */
+static u32 gsi = 24;
+
+struct kvm_irq_routing *irq_routing;
+
+static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin)
+{
+       if (gsi >= IRQ_MAX_GSI)
+               return -ENOSPC;
+
+       irq_routing->entries[irq_routing->nr++] =
+               (struct kvm_irq_routing_entry) {
+                       .gsi = gsi,
+                       .type = type,
+                       .u.irqchip.irqchip = irqchip,
+                       .u.irqchip.pin = pin,
+               };
+
+       return 0;
+}
+
+static struct pci_dev *search(struct rb_root *root, u32 id)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct pci_dev *data = container_of(node, struct pci_dev, node);
+               int result;
+
+               result = id - data->id;
+
+               if (result < 0)
+                       node = node->rb_left;
+               else if (result > 0)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static int insert(struct rb_root *root, struct pci_dev *data)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct pci_dev *this    = container_of(*new, struct pci_dev, node);
+               int result              = data->id - this->id;
+
+               parent = *new;
+               if (result < 0)
+                       new = &((*new)->rb_left);
+               else if (result > 0)
+                       new = &((*new)->rb_right);
+               else
+                       return 0;
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+
+       return 1;
+}
+
+int irq__register_device(u32 dev, u8 *num, u8 *pin, u8 *line)
+{
+       struct pci_dev *node;
+
+       node = search(&pci_tree, dev);
+
+       if (!node) {
+               /* We haven't found a node - First device of its kind */
+               node = malloc(sizeof(*node));
+               if (node == NULL)
+                       return -1;
+
+               *node = (struct pci_dev) {
+                       .id     = dev,
+                       /*
+                        * PCI supports only INTA#,B#,C#,D# per device.
+                        * A#,B#,C#,D# are allowed for multifunctional
+                        * devices so stick with A# for our single
+                        * function devices.
+                        */
+                       .pin    = 1,
+               };
+
+               INIT_LIST_HEAD(&node->lines);
+
+               if (insert(&pci_tree, node) != 1) {
+                       free(node);
+                       return -1;
+               }
+       }
+
+       if (node) {
+               /* This device already has a pin assigned, give out a new line and device id */
+               struct irq_line *new = malloc(sizeof(*new));
+               if (new == NULL)
+                       return -1;
+
+               new->line       = next_line++;
+               *line           = new->line;
+               *pin            = node->pin;
+               *num            = next_dev++;
+
+               list_add(&new->node, &node->lines);
+
+               return 0;
+       }
+
+       return -1;
+}
+
+void irq__init(struct kvm *kvm)
+{
+       int i, r;
+
+       irq_routing = malloc(sizeof(struct kvm_irq_routing) +
+                       IRQ_MAX_GSI * sizeof(struct kvm_irq_routing_entry));
+       if (irq_routing == NULL)
+               die("Failed allocating space for GSI table");
+
+       /* Hook first 8 GSIs to master IRQCHIP */
+       for (i = 0; i < 8; i++)
+               if (i != 2)
+                       irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i);
+
+       /* Hook next 8 GSIs to slave IRQCHIP */
+       for (i = 8; i < 16; i++)
+               irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8);
+
+       /* Last but not least, IOAPIC */
+       for (i = 0; i < 24; i++) {
+               if (i == 0)
+                       irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2);
+               else if (i != 2)
+                       irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i);
+       }
+
+       r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+       if (r)
+               die("Failed setting GSI routes");
+}
+
+/* Append an MSI route and reload KVM's table; returns the GSI used, -ENOSPC, or the ioctl error. */
+int irq__add_msix_route(struct kvm *kvm, u32 low, u32 high, u32 data)
+{
+       int r;
+
+       if (irq_routing->nr >= IRQ_MAX_GSI)     /* entries[] was allocated for IRQ_MAX_GSI */
+               return -ENOSPC;
+
+       irq_routing->entries[irq_routing->nr++] =
+               (struct kvm_irq_routing_entry) {
+                       .gsi = gsi,
+                       .type = KVM_IRQ_ROUTING_MSI,
+                       .u.msi.address_lo = low,
+                       .u.msi.address_hi = high,
+                       .u.msi.data = data,
+               };
+       r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+       return r ? r : (int)gsi++;
+}
+
+struct rb_node *irq__get_pci_tree(void)
+{
+       return rb_first(&pci_tree);
+}
diff --git a/tools/kvm/kvm-cmd.c b/tools/kvm/kvm-cmd.c
new file mode 100644 (file)
index 0000000..96108a8
--- /dev/null
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <assert.h>
+
+/* user defined header files */
+#include "kvm/builtin-debug.h"
+#include "kvm/builtin-pause.h"
+#include "kvm/builtin-resume.h"
+#include "kvm/builtin-balloon.h"
+#include "kvm/builtin-list.h"
+#include "kvm/builtin-version.h"
+#include "kvm/builtin-setup.h"
+#include "kvm/builtin-stop.h"
+#include "kvm/builtin-stat.h"
+#include "kvm/builtin-help.h"
+#include "kvm/kvm-cmd.h"
+#include "kvm/builtin-run.h"
+#include "kvm/util.h"
+
+struct cmd_struct kvm_commands[] = {
+       { "pause",      kvm_cmd_pause,          kvm_pause_help,         0 },
+       { "resume",     kvm_cmd_resume,         kvm_resume_help,        0 },
+       { "debug",      kvm_cmd_debug,          kvm_debug_help,         0 },
+       { "balloon",    kvm_cmd_balloon,        kvm_balloon_help,       0 },
+       { "list",       kvm_cmd_list,           kvm_list_help,          0 },
+       { "version",    kvm_cmd_version,        NULL,                   0 },
+       { "--version",  kvm_cmd_version,        NULL,                   0 },
+       { "stop",       kvm_cmd_stop,           kvm_stop_help,          0 },
+       { "stat",       kvm_cmd_stat,           kvm_stat_help,          0 },
+       { "help",       kvm_cmd_help,           NULL,                   0 },
+       { "setup",      kvm_cmd_setup,          kvm_setup_help,         0 },
+       { "run",        kvm_cmd_run,            kvm_run_help,           0 },
+       { NULL,         NULL,                   NULL,                   0 },
+};
+
+/*
+ * kvm_get_command: Searches the command in an array of the commands and
+ * returns a pointer to cmd_struct if a match is found.
+ *
+ * Input parameters:
+ * command: Array of possible commands. The last entry in the array must be
+ *          NULL.
+ * cmd: A string command to search in the array
+ *
+ * Return Value:
+ * NULL: If the cmd is not matched with any of the command in the command array
+ * p: Pointer to cmd_struct of the matching command
+ */
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+               const char *cmd)
+{
+       struct cmd_struct *p = command;
+
+       while (p->cmd) {
+               if (!strcmp(p->cmd, cmd))
+                       return p;
+               p++;
+       }
+       return NULL;
+}
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv)
+{
+       struct cmd_struct *p;
+       const char *prefix = NULL;
+       int ret = 0;
+
+       if (!argv || !*argv) {
+               p = kvm_get_command(command, "help");
+               assert(p);
+               return p->fn(argc, argv, prefix);
+       }
+
+       p = kvm_get_command(command, argv[0]);
+       if (!p) {
+               p = kvm_get_command(command, "help");
+               assert(p);
+               p->fn(0, NULL, prefix);
+               return EINVAL;
+       }
+
+       ret = p->fn(argc - 1, &argv[1], prefix);
+       if (ret < 0) {
+               if (errno == EPERM)
+                       die("Permission error - are you root?");
+       }
+
+       return ret;
+}
diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
new file mode 100644 (file)
index 0000000..fc0d6d4
--- /dev/null
@@ -0,0 +1,509 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <asm/msr-index.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
+
+extern struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];
+extern __thread struct kvm_cpu *current_kvm_cpu;
+
+/* Tells whether bit 0 of the CR0 value cached in vcpu->sregs is set. */
+static inline bool is_in_protected_mode(struct kvm_cpu *vcpu)
+{
+       return (vcpu->sregs.cr0 & 0x01) != 0;
+}
+
+/*
+ * Converts an instruction pointer into a flat guest address using the
+ * cached segment registers of @vcpu.
+ */
+static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip)
+{
+       /* Outside protected mode the address is CS * 16 + IP. */
+       if (!is_in_protected_mode(vcpu)) {
+               u64 segment = vcpu->sregs.cs.selector;
+
+               return ip + (segment << 4);
+       }
+
+       /*
+        * NOTE! We should take code segment base address into account here.
+        * Luckily it's usually zero because Linux uses flat memory model.
+        */
+       return ip;
+}
+
+/*
+ * Returns the segment base that belongs to a real-mode selector.
+ *
+ * KVM on Intel requires 'base' to be 'selector * 16' in real mode.
+ */
+static inline u32 selector_to_base(u16 selector)
+{
+       u32 base = selector;
+
+       return base << 4;
+}
+
+/*
+ * Allocates a zero-filled kvm_cpu bound to @kvm.
+ * Returns NULL when the allocation fails.
+ */
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+       struct kvm_cpu *cpu = calloc(1, sizeof(*cpu));
+
+       if (cpu != NULL)
+               cpu->kvm = kvm;
+
+       return cpu;
+}
+
+/*
+ * kvm_cpu__delete - release a virtual CPU object and its MSR buffer.
+ * @vcpu: object to free; NULL is accepted and ignored.
+ */
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+       if (!vcpu)
+               return;
+
+       /* free(NULL) is a no-op, so the MSR buffer needs no separate check. */
+       free(vcpu->msrs);
+       free(vcpu);
+}
+
+/*
+ * kvm_cpu__init - create a virtual CPU and map its shared run area.
+ * @kvm:    virtual machine the CPU belongs to.
+ * @cpu_id: identifier passed to KVM_CREATE_VCPU.
+ *
+ * Returns the new vcpu, or NULL when the object cannot be allocated.
+ * Failures of the KVM ioctls or of mmap() terminate the program.
+ */
+struct kvm_cpu *kvm_cpu__init(struct kvm *kvm, unsigned long cpu_id)
+{
+       struct kvm_cpu *vcpu;
+       int mmap_size;
+       int coalesced_offset;
+
+       vcpu            = kvm_cpu__new(kvm);
+       if (!vcpu)
+               return NULL;
+
+       vcpu->cpu_id    = cpu_id;
+
+       vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+       if (vcpu->vcpu_fd < 0)
+               die_perror("KVM_CREATE_VCPU ioctl");
+
+       mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+       if (mmap_size < 0)
+               die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+       vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+       if (vcpu->kvm_run == MAP_FAILED)
+               die("unable to mmap vcpu fd");
+
+       /*
+        * The result is used as a page offset into the run mapping. A failed
+        * ioctl returns a negative value, which must not be used as one, so
+        * only a strictly positive result enables the ring.
+        */
+       coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+       if (coalesced_offset > 0)
+               vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+       vcpu->is_running = true;
+
+       return vcpu;
+}
+
+/*
+ * Asks KVM to single-step the guest on @vcpu. A failing ioctl only
+ * produces a warning.
+ */
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu)
+{
+       struct kvm_guest_debug dbg = {
+               .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
+       };
+       int rc;
+
+       rc = ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
+       if (rc < 0)
+               pr_warning("KVM_SET_GUEST_DEBUG failed");
+}
+
+/*
+ * Allocates a zero-filled kvm_msrs header followed by room for @nmsrs
+ * entries. Terminates the program when memory is exhausted.
+ */
+static struct kvm_msrs *kvm_msrs__new(size_t nmsrs)
+{
+       size_t bytes = sizeof(struct kvm_msrs) + (sizeof(struct kvm_msr_entry) * nmsrs);
+       struct kvm_msrs *msrs = calloc(1, bytes);
+
+       if (msrs == NULL)
+               die("out of memory");
+
+       return msrs;
+}
+
+/* Builds a kvm_msr_entry value for MSR @_index holding @_data. */
+#define KVM_MSR_ENTRY(_index, _data)   \
+       (struct kvm_msr_entry) { .index = _index, .data = _data }
+
+/*
+ * kvm_cpu__setup_msrs - load the initial (all zero) MSR values into @vcpu.
+ *
+ * The entry buffer is allocated on first use and reused afterwards, so
+ * calling this function again does not leak the previous buffer.
+ * Terminates the program when KVM_SET_MSRS fails.
+ */
+static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu)
+{
+       unsigned long ndx = 0;
+
+       /* Room for 100 entries, far more than the handful set below. */
+       if (!vcpu->msrs)
+               vcpu->msrs = kvm_msrs__new(100);
+
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS,        0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP,       0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP,       0x0);
+#ifdef CONFIG_X86_64
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR,                    0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR,                   0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE,          0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK,            0x0);
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR,                   0x0);
+#endif
+       vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC,                0x0);
+
+       vcpu->msrs->nmsrs       = ndx;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0)
+               die_perror("KVM_SET_MSRS failed");
+}
+
+/*
+ * Resets the FPU state of @vcpu: everything is zero except the control
+ * word (fcw) and MXCSR. Terminates the program when KVM_SET_FPU fails.
+ */
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+       memset(&vcpu->fpu, 0, sizeof(vcpu->fpu));
+       vcpu->fpu.fcw   = 0x37f;
+       vcpu->fpu.mxcsr = 0x1f80;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0)
+               die_perror("KVM_SET_FPU failed");
+}
+
+/*
+ * Loads the boot-time general purpose registers into @vcpu: rip comes
+ * from the VM's boot_ip, rsp and rbp from its boot_sp, everything else is
+ * zero apart from rflags. Terminates the program when the boot ip does
+ * not fit into 16 bits or when KVM_SET_REGS fails.
+ */
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_regs initial = {
+               /* We start the guest in 16-bit real mode  */
+               .rflags = 0x0000000000000002ULL,
+               .rip    = kvm->boot_ip,
+               .rsp    = kvm->boot_sp,
+               .rbp    = kvm->boot_sp,
+       };
+
+       vcpu->regs = initial;
+
+       if (vcpu->regs.rip > USHRT_MAX)
+               die("ip 0x%llx is too high for real mode", (u64) vcpu->regs.rip);
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+               die_perror("KVM_SET_REGS failed");
+}
+
+/*
+ * Points cs, ss, ds, es, fs and gs of @vcpu at the VM's boot selector,
+ * leaving every other special register as KVM reported it.
+ * Terminates the program when reading or writing the registers fails.
+ */
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+       struct kvm_segment *segments[] = {
+               &vcpu->sregs.cs, &vcpu->sregs.ss, &vcpu->sregs.ds,
+               &vcpu->sregs.es, &vcpu->sregs.fs, &vcpu->sregs.gs,
+       };
+       unsigned int i;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+               die_perror("KVM_GET_SREGS failed");
+
+       for (i = 0; i < sizeof(segments) / sizeof(segments[0]); i++) {
+               segments[i]->selector   = vcpu->kvm->boot_selector;
+               segments[i]->base       = selector_to_base(vcpu->kvm->boot_selector);
+       }
+
+       if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0)
+               die_perror("KVM_SET_SREGS failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ * @vcpu: virtual CPU to reset
+ *
+ * Programs, in this order, the segment registers, the general purpose
+ * registers, the FPU state and the MSRs through the kvm_cpu__setup_*()
+ * helpers. Each helper terminates the program when its ioctl fails.
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+       kvm_cpu__setup_sregs(vcpu);
+       kvm_cpu__setup_regs(vcpu);
+       kvm_cpu__setup_fpu(vcpu);
+       kvm_cpu__setup_msrs(vcpu);
+}
+
+/* Prints one descriptor-table row: name, base and limit. */
+static void print_dtable(const char *name, struct kvm_dtable *dtable)
+{
+       u64 base  = dtable->base;
+       u16 limit = dtable->limit;
+
+       printf(" %s                 %016llx  %08hx\n", name, base, limit);
+}
+
+/*
+ * Prints one segment-register row: name, selector, base, limit, type and
+ * the present, dpl, db, s, l, g and avl attributes.
+ */
+static void print_segment(const char *name, struct kvm_segment *seg)
+{
+       u16 selector = seg->selector;
+       u64 base     = seg->base;
+       u32 limit    = seg->limit;
+       u8 type      = seg->type;
+
+       printf(" %s       %04hx      %016llx  %08x  %02hhx    %x %x   %x  %x %x %x %x\n",
+               name, selector, base, limit, type,
+               seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl);
+}
+
+/*
+ * kvm_cpu__show_registers - dump the register state of @vcpu to stdout.
+ *
+ * Fetches the general purpose and special registers from KVM and prints
+ * them together with the segment registers, descriptor tables, APIC
+ * related state and the interrupt bitmap. Terminates the program when
+ * either register set cannot be read.
+ *
+ * Values are printed as u64 with %llx, like the rest of this file, so
+ * that nothing is truncated when 'unsigned long' is narrower than 64 bits.
+ */
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+       struct kvm_sregs sregs;
+       struct kvm_regs regs;
+       int i;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+               die("KVM_GET_REGS failed");
+
+       printf("\n Registers:\n");
+       printf(  " ----------\n");
+       printf(" rip: %016llx   rsp: %016llx flags: %016llx\n",
+               (u64) regs.rip, (u64) regs.rsp, (u64) regs.rflags);
+       printf(" rax: %016llx   rbx: %016llx   rcx: %016llx\n",
+               (u64) regs.rax, (u64) regs.rbx, (u64) regs.rcx);
+       printf(" rdx: %016llx   rsi: %016llx   rdi: %016llx\n",
+               (u64) regs.rdx, (u64) regs.rsi, (u64) regs.rdi);
+       printf(" rbp: %016llx    r8: %016llx    r9: %016llx\n",
+               (u64) regs.rbp, (u64) regs.r8, (u64) regs.r9);
+       printf(" r10: %016llx   r11: %016llx   r12: %016llx\n",
+               (u64) regs.r10, (u64) regs.r11, (u64) regs.r12);
+       printf(" r13: %016llx   r14: %016llx   r15: %016llx\n",
+               (u64) regs.r13, (u64) regs.r14, (u64) regs.r15);
+
+       /* Report the ioctl that actually failed. */
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       printf(" cr0: %016llx   cr2: %016llx   cr3: %016llx\n",
+               (u64) sregs.cr0, (u64) sregs.cr2, (u64) sregs.cr3);
+       printf(" cr4: %016llx   cr8: %016llx\n",
+               (u64) sregs.cr4, (u64) sregs.cr8);
+       printf("\n Segment registers:\n");
+       printf(  " ------------------\n");
+       printf(" register  selector  base              limit     type  p dpl db s l g avl\n");
+       print_segment("cs ", &sregs.cs);
+       print_segment("ss ", &sregs.ss);
+       print_segment("ds ", &sregs.ds);
+       print_segment("es ", &sregs.es);
+       print_segment("fs ", &sregs.fs);
+       print_segment("gs ", &sregs.gs);
+       print_segment("tr ", &sregs.tr);
+       print_segment("ldt", &sregs.ldt);
+       print_dtable("gdt", &sregs.gdt);
+       print_dtable("idt", &sregs.idt);
+
+       printf("\n APIC:\n");
+       printf(  " -----\n");
+       printf(" efer: %016llx  apic base: %016llx  nmi: %s\n",
+               (u64) sregs.efer, (u64) sregs.apic_base,
+               (vcpu->kvm->nmi_disabled ? "disabled" : "enabled"));
+
+       printf("\n Interrupt bitmap:\n");
+       printf(  " -----------------\n");
+       /* One 64-bit word per 64 interrupt vectors, rounded up. */
+       for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++)
+               printf(" %016llx", (u64) sregs.interrupt_bitmap[i]);
+       printf("\n");
+}
+
+#define MAX_SYM_LEN            128
+
+/*
+ * kvm_cpu__show_code - dump the code around the guest's instruction
+ * pointer and the top of its stack.
+ *
+ * Refreshes the cached registers of @vcpu, prints the symbol found for
+ * rip, then up to 64 code bytes starting 43 bytes before rip (the byte at
+ * rip is bracketed) and finally 32 bytes of stack. The code dump stops at
+ * the first byte that lies outside guest RAM. Terminates the program when
+ * the registers cannot be read.
+ */
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+       unsigned int code_bytes = 64;
+       unsigned int code_prologue = code_bytes * 43 / 64;
+       char sym[MAX_SYM_LEN];
+       unsigned char c;
+       unsigned int i;
+       u8 *rip_host;
+       u8 *ip;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+               die("KVM_GET_REGS failed");
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       /*
+        * The registers are not modified below, so the host address of the
+        * current instruction is translated once instead of once per byte.
+        */
+       rip_host = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip));
+       ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue);
+
+       printf("\n Code:\n");
+       printf(  " -----\n");
+
+       symbol__lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN);
+
+       printf(" rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym);
+
+       for (i = 0; i < code_bytes; i++, ip++) {
+               if (!host_ptr_in_ram(vcpu->kvm, ip))
+                       break;
+
+               c = *ip;
+
+               if (ip == rip_host)
+                       printf(" <%02x>", c);
+               else
+                       printf(" %02x", c);
+       }
+
+       printf("\n");
+
+       printf("\n Stack:\n");
+       printf(  " ------\n");
+       kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32);
+}
+
+/*
+ * kvm_cpu__show_page_tables - print the page-table entries reached by
+ * following the first entry of each level, starting at CR3.
+ *
+ * The special registers are refreshed first so that the protected-mode
+ * test looks at current state. Nothing is printed outside protected mode
+ * or when a table pointer leads outside guest RAM. When bit 7 of the
+ * third entry is set, that entry is treated as final and no further level
+ * is followed. Terminates the program when the registers cannot be read.
+ */
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+       u64 *pte1;
+       u64 *pte2;
+       u64 *pte3;
+       u64 *pte4;
+
+       if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+               die("KVM_GET_SREGS failed");
+
+       if (!is_in_protected_mode(vcpu))
+               return;
+
+       pte4    = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3);
+       if (!host_ptr_in_ram(vcpu->kvm, pte4))
+               return;
+
+       pte3    = guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff));
+       if (!host_ptr_in_ram(vcpu->kvm, pte3))
+               return;
+
+       pte2    = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff));
+       if (!host_ptr_in_ram(vcpu->kvm, pte2))
+               return;
+
+       /* Bit 7 set: this entry is the last level, there is no pte1. */
+       if (*pte2 & (1 << 7)) {
+               printf("Page Tables:\n");
+               printf(" pte4: %016llx   pte3: %016llx"
+                       "   pte2: %016llx\n",
+                       *pte4, *pte3, *pte2);
+               return;
+       }
+
+       pte1    = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff));
+       if (!host_ptr_in_ram(vcpu->kvm, pte1))
+               return;
+
+       printf("Page Tables:\n");
+       printf(" pte4: %016llx  pte3: %016llx   pte2: %016"
+               "llx   pte1: %016llx\n",
+               *pte4, *pte3, *pte2, *pte1);
+}
+
+/*
+ * Enters the guest on @vcpu through KVM_RUN. An ioctl failure other than
+ * EINTR or EAGAIN terminates the program.
+ */
+void kvm_cpu__run(struct kvm_cpu *vcpu)
+{
+       if (ioctl(vcpu->vcpu_fd, KVM_RUN, 0) == 0)
+               return;
+
+       if (errno == EINTR || errno == EAGAIN)
+               return;
+
+       die_perror("KVM_RUN failed");
+}
+
+/*
+ * Signal handler shared by SIGKVMEXIT and SIGKVMPAUSE.
+ *
+ * SIGKVMEXIT clears is_running on the calling thread's vcpu and raises
+ * the signal once more on the same thread. SIGKVMPAUSE sets the vcpu's
+ * paused flag. Both are ignored on a thread that has no current vcpu.
+ */
+static void kvm_cpu_signal_handler(int signum)
+{
+       /* Nothing to act on when this thread does not run a vcpu. */
+       if (!current_kvm_cpu)
+               return;
+
+       if (signum == SIGKVMEXIT) {
+               if (current_kvm_cpu->is_running) {
+                       current_kvm_cpu->is_running = false;
+                       pthread_kill(pthread_self(), SIGKVMEXIT);
+               }
+       } else if (signum == SIGKVMPAUSE) {
+               current_kvm_cpu->paused = 1;
+       }
+}
+
+/*
+ * Drains the coalesced MMIO ring of @cpu, if it has one: every pending
+ * entry is replayed as an MMIO write and the ring's 'first' index is
+ * advanced, wrapping at KVM_COALESCED_MMIO_MAX.
+ */
+static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu)
+{
+       if (!cpu->ring)
+               return;
+
+       while (cpu->ring->first != cpu->ring->last) {
+               struct kvm_coalesced_mmio *entry;
+
+               entry = &cpu->ring->coalesced_mmio[cpu->ring->first];
+               kvm__emulate_mmio(cpu->kvm, entry->phys_addr, entry->data, entry->len, 1);
+               cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX;
+       }
+}
+
+/* Sends SIGKVMEXIT to the thread of every vcpu present in kvm_cpus[]. */
+void kvm_cpu__reboot(void)
+{
+       int slot;
+
+       for (slot = 0; slot < KVM_NR_CPUS; slot++) {
+               struct kvm_cpu *vcpu = kvm_cpus[slot];
+
+               if (!vcpu)
+                       continue;
+
+               pthread_kill(vcpu->thread, SIGKVMEXIT);
+       }
+}
+
+/*
+ * kvm_cpu__start - run loop of one virtual CPU.
+ *
+ * Blocks SIGALRM in the calling thread, installs the SIGKVMEXIT and
+ * SIGKVMPAUSE handlers, sets up CPUID and the initial register state and
+ * then keeps entering the guest while cpu->is_running is set, handling
+ * each exit reason in turn.
+ *
+ * Returns 0 when the loop ends normally (shutdown exit, interrupted exit
+ * after is_running was cleared, or is_running found false) and 1 when I/O
+ * or MMIO emulation fails or the exit reason is not handled here.
+ */
+int kvm_cpu__start(struct kvm_cpu *cpu)
+{
+       sigset_t blocked;
+
+       sigemptyset(&blocked);
+       sigaddset(&blocked, SIGALRM);
+
+       pthread_sigmask(SIG_BLOCK, &blocked, NULL);
+
+       signal(SIGKVMEXIT, kvm_cpu_signal_handler);
+       signal(SIGKVMPAUSE, kvm_cpu_signal_handler);
+
+       kvm_cpu__setup_cpuid(cpu);
+       kvm_cpu__reset_vcpu(cpu);
+
+       if (cpu->kvm->single_step)
+               kvm_cpu__enable_singlestep(cpu);
+
+       while (cpu->is_running) {
+               bool handled;
+
+               /* Acknowledge a pending pause request before re-entering. */
+               if (cpu->paused) {
+                       kvm__notify_paused();
+                       cpu->paused = 0;
+               }
+
+               kvm_cpu__run(cpu);
+
+               switch (cpu->kvm_run->exit_reason) {
+               case KVM_EXIT_UNKNOWN:
+                       break;
+               case KVM_EXIT_DEBUG:
+                       kvm_cpu__show_registers(cpu);
+                       kvm_cpu__show_code(cpu);
+                       break;
+               case KVM_EXIT_IO:
+                       /* The I/O data lives inside the shared run area. */
+                       handled = kvm__emulate_io(cpu->kvm,
+                                       cpu->kvm_run->io.port,
+                                       (u8 *)cpu->kvm_run +
+                                       cpu->kvm_run->io.data_offset,
+                                       cpu->kvm_run->io.direction,
+                                       cpu->kvm_run->io.size,
+                                       cpu->kvm_run->io.count);
+                       if (!handled)
+                               return 1;
+                       break;
+               case KVM_EXIT_MMIO:
+                       handled = kvm__emulate_mmio(cpu->kvm,
+                                       cpu->kvm_run->mmio.phys_addr,
+                                       cpu->kvm_run->mmio.data,
+                                       cpu->kvm_run->mmio.len,
+                                       cpu->kvm_run->mmio.is_write);
+                       if (!handled)
+                               return 1;
+                       break;
+               case KVM_EXIT_INTR:
+                       if (!cpu->is_running)
+                               return 0;
+                       break;
+               case KVM_EXIT_SHUTDOWN:
+                       return 0;
+               default:
+                       return 1;
+               }
+
+               kvm_cpu__handle_coalesced_mmio(cpu);
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
new file mode 100644 (file)
index 0000000..199c9b9
--- /dev/null
@@ -0,0 +1,749 @@
+#include "kvm/kvm.h"
+
+#include "kvm/boot-protocol.h"
+#include "kvm/cpufeature.h"
+#include "kvm/read-write.h"
+#include "kvm/interrupt.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm-cpu.h"
+
+#include <linux/kvm.h>
+
+#include <asm/bootparam.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/eventfd.h>
+#include <asm/unistd.h>
+#include <dirent.h>
+
+#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
+
+const char *kvm_exit_reasons[] = {     /* indexed by KVM_EXIT_* value; each entry is the constant's name as a string */
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
+       DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
+};
+
+#define DEFINE_KVM_EXT(ext)            \
+       .name = #ext,                   \
+       .code = ext
+
+struct {       /* KVM capabilities walked by kvm__check_extensions(); name is the constant spelled out, code its value */
+       const char *name;
+       int code;
+} kvm_req_ext[] = {
+       { DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
+       { DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
+       { DEFINE_KVM_EXT(KVM_CAP_PIT2) },
+       { DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
+       { DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
+       { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+       { DEFINE_KVM_EXT(KVM_CAP_HLT) },
+       { DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
+       { DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
+};
+
+extern struct kvm *kvm;
+extern struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];
+static int pause_event;
+static DEFINE_MUTEX(pause_lock);
+
+/*
+ * Asks KVM whether @extension is available. A failed ioctl counts as
+ * "not supported".
+ */
+static bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
+{
+       int answer = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
+
+       return answer > 0;
+}
+
+/*
+ * kvm__check_extensions - verify that every entry of kvm_req_ext[] is
+ * supported.
+ *
+ * Returns 0 when all required extensions are present. Otherwise the first
+ * missing one is logged and -1 is returned. (Returning the negated table
+ * index would report success when the very first entry is missing, since
+ * -0 == 0.)
+ */
+static int kvm__check_extensions(struct kvm *kvm)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(kvm_req_ext); i++) {
+               if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
+                       pr_error("Unsupported KVM extension detected: %s",
+                               kvm_req_ext[i].name);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Allocates a zero-filled struct kvm. Terminates the program when memory
+ * is exhausted, so the result is never NULL.
+ */
+static struct kvm *kvm__new(void)
+{
+       struct kvm *instance;
+
+       instance = calloc(1, sizeof(*instance));
+       if (instance == NULL)
+               die("out of memory");
+
+       return instance;
+}
+
+/*
+ * kvm__create_pidfile - record our PID in
+ * HOME_DIR/KVM_PID_FILE_PATH/<name>.pid.
+ *
+ * Does nothing for an unnamed instance. The directory is created when it
+ * is missing. An existing file is truncated so that no stale digits
+ * survive. Terminates the program when a path does not fit into PATH_MAX
+ * or when the file cannot be created or written completely.
+ */
+static void kvm__create_pidfile(struct kvm *kvm)
+{
+       char full_name[PATH_MAX], pid[32];
+       int fd, len;
+
+       if (!kvm->name)
+               return;
+
+       len = snprintf(full_name, sizeof(full_name), "%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
+       if (len < 0 || (size_t)len >= sizeof(full_name))
+               die("Failed creating PID file");
+
+       /* The directory may already exist, so a failure here is not fatal. */
+       mkdir(full_name, 0777);
+
+       len = snprintf(full_name, sizeof(full_name), "%s/%s/%s.pid",
+                       HOME_DIR, KVM_PID_FILE_PATH, kvm->name);
+       if (len < 0 || (size_t)len >= sizeof(full_name))
+               die("Failed creating PID file");
+
+       fd = open(full_name, O_CREAT | O_WRONLY | O_TRUNC, 0666);
+       if (fd < 0)
+               die("Failed creating PID file");
+
+       len = snprintf(pid, sizeof(pid), "%ld\n", (long)getpid());
+       if (len < 0 || write(fd, pid, len) != len)
+               die("Failed creating PID file");
+
+       close(fd);
+}
+
+/*
+ * kvm__remove_pidfile - delete the PID file of instance @name.
+ *
+ * A NULL @name (an instance that never had a PID file) and a path that
+ * does not fit into PATH_MAX are ignored. The result of unlink() is not
+ * checked.
+ */
+void kvm__remove_pidfile(const char *name)
+{
+       char full_name[PATH_MAX];
+       int len;
+
+       if (!name)
+               return;
+
+       len = snprintf(full_name, sizeof(full_name), "%s/%s/%s.pid",
+                       HOME_DIR, KVM_PID_FILE_PATH, name);
+       if (len < 0 || (size_t)len >= sizeof(full_name))
+               return;
+
+       unlink(full_name);
+}
+
+/*
+ * kvm__get_pid_by_instance - read the PID stored for instance @name.
+ *
+ * Returns the number parsed from the instance's PID file, or -1 when the
+ * path is too long, the file cannot be opened or read, is empty, or holds
+ * a negative number. The descriptor is closed on every path.
+ */
+pid_t kvm__get_pid_by_instance(const char *name)
+{
+       char pid_str[16], pid_file[PATH_MAX];
+       ssize_t nr;
+       pid_t pid;
+       int len;
+       int fd;
+
+       len = snprintf(pid_file, sizeof(pid_file), "%s/%s/%s.pid",
+                       HOME_DIR, KVM_PID_FILE_PATH, name);
+       if (len < 0 || (size_t)len >= sizeof(pid_file))
+               return -1;
+
+       fd = open(pid_file, O_RDONLY);
+       if (fd < 0)
+               return -1;
+
+       /* Leave room for the terminator that atoi() relies on. */
+       nr = read(fd, pid_str, sizeof(pid_str) - 1);
+       close(fd);
+       if (nr <= 0)
+               return -1;
+       pid_str[nr] = '\0';
+
+       pid = atoi(pid_str);
+       if (pid < 0)
+               return -1;
+
+       return pid;
+}
+
+/*
+ * kvm__enumerate_instances - call @callback for every instance that has a
+ * PID file.
+ *
+ * Each regular file named "<name>.pid" in the PID directory yields one
+ * call with <name> and the PID read from the file (-1 when it cannot be
+ * read). Enumeration stops as soon as the callback returns a negative
+ * value.
+ *
+ * Returns the last callback result, or 0 when the directory cannot be
+ * opened or holds no PID files.
+ */
+int kvm__enumerate_instances(int (*callback)(const char *name, int pid))
+{
+       char full_name[PATH_MAX];
+       struct dirent entry, *result;
+       size_t len;
+       DIR *dir;
+       int pid;
+       int ret = 0;
+
+       snprintf(full_name, sizeof(full_name), "%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
+       dir = opendir(full_name);
+       if (dir == NULL)
+               return 0;
+
+       for (;;) {
+               if (readdir_r(dir, &entry, &result) != 0 || result == NULL)
+                       break;
+               if (entry.d_type != DT_REG)
+                       continue;
+
+               /* Only "<name>.pid" with a non-empty <name> is an instance. */
+               len = strlen(entry.d_name);
+               if (len <= 4 || strcmp(entry.d_name + len - 4, ".pid") != 0)
+                       continue;
+
+               entry.d_name[len - 4] = 0;
+               pid = kvm__get_pid_by_instance(entry.d_name);
+               ret = callback(entry.d_name, pid);
+               if (ret < 0)
+                       break;
+       }
+
+       closedir(dir);
+
+       return ret;
+}
+
+/*
+ * kvm__delete - tear down a VM object: stop its timer, unmap guest RAM,
+ * remove the PID file of a named instance and free the object.
+ */
+void kvm__delete(struct kvm *kvm)
+{
+       u64 mapped_size = kvm->ram_size;
+
+       kvm__stop_timer(kvm);
+
+       /*
+        * kvm__init() maps KVM_32BIT_GAP_SIZE extra bytes once the RAM size
+        * reaches the gap start, so the same amount must be unmapped here.
+        */
+       if (kvm->ram_size >= KVM_32BIT_GAP_START)
+               mapped_size += KVM_32BIT_GAP_SIZE;
+
+       munmap(kvm->ram_start, mapped_size);
+
+       /* Unnamed instances never created a PID file. */
+       if (kvm->name)
+               kvm__remove_pidfile(kvm->name);
+
+       free(kvm);
+}
+
+/*
+ * Checks through CPUID whether the host CPU advertises hardware
+ * virtualization: the VMX feature bit on Intel, the SVM feature bit on
+ * AMD. Any other vendor, or a CPU whose highest CPUID leaf is below the
+ * one that carries the bit, yields false.
+ */
+static bool kvm__cpu_supports_vm(void)
+{
+       struct cpuid_regs regs = { .eax = 0x00 };
+       u32 leaf_base;
+       int feature_bit;
+
+       /* Leaf 0: EBX identifies the vendor. */
+       host_cpuid(&regs);
+
+       if (regs.ebx == CPUID_VENDOR_INTEL_1) {
+               leaf_base       = 0x00;
+               feature_bit     = KVM__X86_FEATURE_VMX;
+       } else if (regs.ebx == CPUID_VENDOR_AMD_1) {
+               leaf_base       = 0x80000000;
+               feature_bit     = KVM__X86_FEATURE_SVM;
+       } else {
+               return false;
+       }
+
+       /* The base leaf reports in EAX the highest leaf available. */
+       regs = (struct cpuid_regs) { .eax = leaf_base };
+       host_cpuid(&regs);
+       if (regs.eax < leaf_base + 0x01)
+               return false;
+
+       regs = (struct cpuid_regs) { .eax = leaf_base + 0x01 };
+       host_cpuid(&regs);
+
+       return (regs.ecx & (1 << feature_bit)) != 0;
+}
+
+/*
+ * kvm__register_mem - hand a range of host memory to KVM as guest memory.
+ * @guest_phys:     guest physical address where the range starts.
+ * @size:           length of the range in bytes.
+ * @userspace_addr: host address backing the range.
+ *
+ * Every call consumes the next memory slot of @kvm. Terminates the
+ * program when the ioctl fails.
+ *
+ * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping
+ * memory regions to it. Therefore, be careful if you use this function for
+ * registering memory regions for emulating hardware.
+ */
+void kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr)
+{
+       struct kvm_userspace_memory_region region = {
+               .slot                   = kvm->mem_slots++,
+               .guest_phys_addr        = guest_phys,
+               .memory_size            = size,
+               .userspace_addr         = (unsigned long)userspace_addr,
+       };
+
+       if (ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
+               die_perror("KVM_SET_USER_MEMORY_REGION ioctl");
+}
+
+/*
+ * kvm__init_ram - register the guest RAM of @kvm with KVM.
+ *
+ * Allocating RAM size bigger than 4GB requires us to leave a gap
+ * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
+ * devices (see documentation of e820_setup_gap() for details).
+ *
+ * RAM smaller than KVM_32BIT_GAP_START is registered as a single range
+ * starting at guest physical address 0. Larger RAM is split in two: the
+ * part below the gap, and the remainder starting at 4GB, backed by host
+ * memory at the same offset from ram_start.
+ */
+
+void kvm__init_ram(struct kvm *kvm)
+{
+       const u64 high_start = 0x100000000ULL;
+       u64 low_size;
+
+       if (kvm->ram_size < KVM_32BIT_GAP_START) {
+               /* Everything fits below the gap: one block is enough. */
+               kvm__register_mem(kvm, 0, kvm->ram_size, kvm->ram_start);
+               return;
+       }
+
+       /* First RAM range from zero to the PCI gap: */
+       low_size = KVM_32BIT_GAP_START;
+       kvm__register_mem(kvm, 0, low_size, kvm->ram_start);
+
+       /* Second RAM range from 4GB to the end of RAM: */
+       kvm__register_mem(kvm, high_start, kvm->ram_size - low_size,
+                       kvm->ram_start + high_start);
+}
+
+/*
+ * Returns the value KVM reports for KVM_CAP_NR_VCPUS. Terminates the
+ * program when the query fails or reports zero.
+ */
+int kvm__recommended_cpus(struct kvm *kvm)
+{
+       int nr_cpus = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
+
+       if (nr_cpus <= 0)
+               die_perror("KVM_CAP_NR_VCPUS");
+
+       return nr_cpus;
+}
+
+/*
+ * The following hack should be removed once 'x86: Raise the hard
+ * VCPU count limit' makes its way into the mainline.
+ */
+#ifndef KVM_CAP_MAX_VCPUS
+#define KVM_CAP_MAX_VCPUS 66
+#endif
+
+/*
+ * Returns the value KVM reports for KVM_CAP_MAX_VCPUS, falling back to
+ * kvm__recommended_cpus() when that query fails or reports zero.
+ */
+int kvm__max_cpus(struct kvm *kvm)
+{
+       int max_cpus = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
+
+       if (max_cpus > 0)
+               return max_cpus;
+
+       return kvm__recommended_cpus(kvm);
+}
+
+/*
+ * kvm__init - open the KVM device and create a virtual machine.
+ * @kvm_dev:  path of the KVM device node.
+ * @ram_size: amount of guest RAM in bytes.
+ * @name:     instance name used for the PID file, or NULL for none.
+ *
+ * Checks for hardware virtualization support, the KVM API version and
+ * the required extensions, sets the TSS address, creates the PIT and the
+ * in-kernel irqchip, and maps the memory that backs guest RAM. When the
+ * RAM size reaches KVM_32BIT_GAP_START the mapping is KVM_32BIT_GAP_SIZE
+ * bytes larger and the gap itself is made inaccessible.
+ *
+ * Returns the new VM object. Every failure terminates the program.
+ */
+struct kvm *kvm__init(const char *kvm_dev, u64 ram_size, const char *name)
+{
+       struct kvm_pit_config pit_config = { .flags = 0, };
+       struct kvm *kvm;
+       u64 map_size;
+       int ret;
+
+       if (!kvm__cpu_supports_vm())
+               die("Your CPU does not support hardware virtualization");
+
+       kvm = kvm__new();
+
+       kvm->sys_fd = open(kvm_dev, O_RDWR);
+       if (kvm->sys_fd < 0) {
+               if (errno == ENOENT)
+                       die("'%s' not found. Please make sure your kernel has CONFIG_KVM enabled and that the KVM modules are loaded.", kvm_dev);
+               if (errno == ENODEV)
+                       die("'%s' KVM driver not available.\n  # (If the KVM module is loaded then 'dmesg' may offer further clues about the failure.)", kvm_dev);
+
+               fprintf(stderr, "  Fatal, could not open %s: ", kvm_dev);
+               perror(NULL);
+               exit(1);
+       }
+
+       /*
+        * Only a failed ioctl leaves a meaningful errno. A version mismatch
+        * is reported with the two version numbers instead.
+        */
+       ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
+       if (ret < 0)
+               die_perror("KVM_API_VERSION ioctl");
+       if (ret != KVM_API_VERSION)
+               die("KVM API version mismatch: kernel reports %d, expected %d", ret, KVM_API_VERSION);
+
+       kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, 0);
+       if (kvm->vm_fd < 0)
+               die_perror("KVM_CREATE_VM ioctl");
+
+       if (kvm__check_extensions(kvm))
+               die("A required KVM extension is not supported by OS");
+
+       ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
+       if (ret < 0)
+               die_perror("KVM_SET_TSS_ADDR ioctl");
+
+       ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
+       if (ret < 0)
+               die_perror("KVM_CREATE_PIT2 ioctl");
+
+       kvm->ram_size           = ram_size;
+
+       /* Large guests need room for the 32-bit gap inside the mapping. */
+       map_size = ram_size;
+       if (kvm->ram_size >= KVM_32BIT_GAP_START)
+               map_size += KVM_32BIT_GAP_SIZE;
+
+       kvm->ram_start = mmap(NULL, map_size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+       if (kvm->ram_start == MAP_FAILED)
+               die("out of memory");
+
+       if (kvm->ram_size >= KVM_32BIT_GAP_START) {
+               /*
+                * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that
+                * if we accidentally write to it, we will know.
+                */
+               mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
+       }
+
+       /* Advise over the whole mapping, not only its first ram_size bytes. */
+       madvise(kvm->ram_start, map_size, MADV_MERGEABLE);
+
+       ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+       if (ret < 0)
+               die_perror("KVM_CREATE_IRQCHIP ioctl");
+
+       kvm->name = name;
+
+       kvm__create_pidfile(kvm);
+
+       return kvm;
+}
+
+#define BOOT_LOADER_SELECTOR   0x1000
+#define BOOT_LOADER_IP         0x0000
+#define BOOT_LOADER_SP         0x8000
+#define BOOT_CMDLINE_OFFSET    0x20000
+
+#define BOOT_PROTOCOL_REQUIRED 0x206
+#define LOAD_HIGH              0x01
+
+/*
+ * Copies the whole of @fd into guest memory at
+ * BOOT_LOADER_SELECTOR:BOOT_LOADER_IP and records the boot loader
+ * defaults as the entry point of @kvm. Always returns true; a failing
+ * lseek() terminates the program.
+ */
+static int load_flat_binary(struct kvm *kvm, int fd)
+{
+       void *dst;
+       int nr;
+
+       if (lseek(fd, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       dst = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+       /* Read in chunks until end of file or the first read error. */
+       for (;;) {
+               nr = read(fd, dst, 65536);
+               if (nr <= 0)
+                       break;
+               dst += nr;
+       }
+
+       kvm->boot_selector      = BOOT_LOADER_SELECTOR;
+       kvm->boot_ip            = BOOT_LOADER_IP;
+       kvm->boot_sp            = BOOT_LOADER_SP;
+
+       return true;
+}
+
+/* Signature found in the header field of a bzImage setup header. */
+static const char *BZIMAGE_MAGIC       = "HdrS";
+
+/*
+ * load_bzimage - load a bzImage kernel, and optionally an initrd, into
+ * guest memory.
+ * @fd_kernel:      descriptor of the kernel image.
+ * @fd_initrd:      descriptor of the initrd, or a negative value for none.
+ * @kernel_cmdline: command line to pass to the kernel, or NULL.
+ * @vidmode:        value stored in the header's vid_mode field.
+ *
+ * Returns false when the file is too short or lacks the bzImage
+ * signature, true once the image is loaded and the boot registers of
+ * @kvm are set. A boot protocol that is too old, I/O errors and an initrd
+ * that does not fit into guest RAM terminate the program.
+ */
+static bool load_bzimage(struct kvm *kvm, int fd_kernel,
+                       int fd_initrd, const char *kernel_cmdline, u16 vidmode)
+{
+       struct boot_params *kern_boot;
+       unsigned long setup_sects;
+       struct boot_params boot;
+       size_t cmdline_size;
+       ssize_t setup_size;
+       void *p;
+       int nr;
+
+       /*
+        * See Documentation/x86/boot.txt for details on bzImage on-disk and
+        * memory layout.
+        */
+
+       if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
+               return false;
+
+       if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
+               return false;
+
+       if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
+               die("Too old kernel");
+
+       if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       /* The setup size is setup_sects plus one sectors of 512 bytes. */
+       if (!boot.hdr.setup_sects)
+               boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
+       setup_sects = boot.hdr.setup_sects + 1;
+
+       setup_size = setup_sects << 9;
+       p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+       /* copy setup.bin to mem */
+       if (read(fd_kernel, p, setup_size) != setup_size)
+               die_perror("read");
+
+       /* copy vmlinux.bin to BZ_KERNEL_START */
+       p = guest_flat_to_host(kvm, BZ_KERNEL_START);
+
+       while ((nr = read(fd_kernel, p, 65536)) > 0)
+               p += nr;
+
+       p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
+       if (kernel_cmdline) {
+               cmdline_size = strlen(kernel_cmdline) + 1;
+               if (cmdline_size > boot.hdr.cmdline_size)
+                       cmdline_size = boot.hdr.cmdline_size;
+
+               memset(p, 0, boot.hdr.cmdline_size);
+               /*
+                * At most cmdline_size - 1 bytes are copied so that the zeroed
+                * buffer keeps its terminator. A zero size must not reach
+                * memcpy(), where "size - 1" would wrap around.
+                */
+               if (cmdline_size > 0)
+                       memcpy(p, kernel_cmdline, cmdline_size - 1);
+       }
+
+       kern_boot       = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);
+
+       kern_boot->hdr.cmd_line_ptr     = BOOT_CMDLINE_OFFSET;
+       kern_boot->hdr.type_of_loader   = 0xff;
+       kern_boot->hdr.heap_end_ptr     = 0xfe00;
+       kern_boot->hdr.loadflags        |= CAN_USE_HEAP;
+       kern_boot->hdr.vid_mode         = vidmode;
+
+       /*
+        * Read initrd image into guest memory
+        */
+       if (fd_initrd >= 0) {
+               struct stat initrd_stat;
+               unsigned long addr;
+
+               if (fstat(fd_initrd, &initrd_stat))
+                       die_perror("fstat");
+
+               /*
+                * Without this check the unsigned subtraction below wraps for
+                * an initrd at least as large as guest RAM and the placement
+                * loop accepts an address outside of it.
+                */
+               if ((u64)initrd_stat.st_size >= kvm->ram_size)
+                       die("Not enough memory for initrd");
+
+               /*
+                * Start at the highest 1MB-aligned address the kernel allows
+                * and step down 1MB at a time until the image fits below the
+                * end of RAM.
+                */
+               addr = boot.hdr.initrd_addr_max & ~0xfffff;
+               for (;;) {
+                       if (addr < BZ_KERNEL_START)
+                               die("Not enough memory for initrd");
+                       else if (addr < (kvm->ram_size - initrd_stat.st_size))
+                               break;
+                       addr -= 0x100000;
+               }
+
+               p = guest_flat_to_host(kvm, addr);
+               nr = read(fd_initrd, p, initrd_stat.st_size);
+               if (nr != initrd_stat.st_size)
+                       die("Failed to read initrd");
+
+               kern_boot->hdr.ramdisk_image    = addr;
+               kern_boot->hdr.ramdisk_size     = initrd_stat.st_size;
+       }
+
+       kvm->boot_selector      = BOOT_LOADER_SELECTOR;
+       /*
+        * The real-mode setup code starts at offset 0x200 of a bzImage. See
+        * Documentation/x86/boot.txt for details.
+        */
+       kvm->boot_ip            = BOOT_LOADER_IP + 0x200;
+       kvm->boot_sp            = BOOT_LOADER_SP;
+
+       return true;
+}
+
+/* RFC 1952 */
+#define GZIP_ID1               0x1f
+#define GZIP_ID2               0x8b
+
+/*
+ * initrd_check - check that @fd starts with the gzip magic bytes
+ *
+ * Reads the first two bytes of @fd, rewinds @fd back to offset 0 and
+ * reports whether the bytes equal GZIP_ID1/GZIP_ID2 (RFC 1952).
+ *
+ * Returns false when read_in_full() reports an error or when the bytes
+ * do not match. Dies if the rewind fails.
+ */
+static bool initrd_check(int fd)
+{
+       /*
+        * Zero-initialized so that a short read (empty or one-byte file)
+        * can never be compared against stale stack contents: a zero byte
+        * matches neither magic value.
+        */
+       unsigned char id[2] = { 0, 0 };
+
+       if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)
+               return false;
+
+       if (lseek(fd, 0, SEEK_SET) < 0)
+               die_perror("lseek");
+
+       return id[0] == GZIP_ID1 && id[1] == GZIP_ID2;
+}
+
+/*
+ * kvm__load_kernel - load a kernel image (and optional initrd) into the guest
+ *
+ * Opens @kernel_filename and, when given, @initrd_filename (which must pass
+ * initrd_check()). The image is first tried as a bzImage; if that loader
+ * rejects it, a warning is printed and it is tried as a flat binary.
+ *
+ * Dies if a file cannot be opened, if the initrd check fails, or if neither
+ * loader accepts the kernel image. Otherwise returns the loader's result.
+ */
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+               const char *initrd_filename, const char *kernel_cmdline, u16 vidmode)
+{
+       int fd_kernel, fd_initrd = -1;
+       bool loaded;
+
+       fd_kernel = open(kernel_filename, O_RDONLY);
+       if (fd_kernel < 0)
+               die("Unable to open kernel %s", kernel_filename);
+
+       if (initrd_filename) {
+               fd_initrd = open(initrd_filename, O_RDONLY);
+               if (fd_initrd < 0)
+                       die("Unable to open initrd %s", initrd_filename);
+
+               if (!initrd_check(fd_initrd))
+                       die("%s is not an initrd", initrd_filename);
+       }
+
+       loaded = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline, vidmode);
+
+       /* The initrd is only consumed by the bzImage loader. */
+       if (initrd_filename)
+               close(fd_initrd);
+
+       if (!loaded) {
+               pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename);
+
+               loaded = load_flat_binary(kvm, fd_kernel);
+       }
+
+       close(fd_kernel);
+
+       if (!loaded)
+               die("%s is not a valid bzImage or flat binary", kernel_filename);
+
+       return loaded;
+}
+
+/**
+ * kvm__setup_bios - inject BIOS into guest system memory
+ * @kvm - guest system descriptor
+ *
+ * This function is the main routine where we poke guest memory
+ * and install the BIOS there. It runs setup_bios() first and then
+ * builds the MP table for kvm->nrcpus processors.
+ */
+void kvm__setup_bios(struct kvm *kvm)
+{
+       /* standard minimal configuration */
+       setup_bios(kvm);
+
+       /* FIXME: SMP, ACPI and friends here */
+
+       /* MP table */
+       mptable_setup(kvm, kvm->nrcpus);
+}
+
+#define TIMER_INTERVAL_NS 1000000      /* 1 msec */
+
+/*
+ * Create and arm a periodic CLOCK_REALTIME timer, stored in kvm->timerid,
+ * that raises SIGALRM in the calling thread every TIMER_INTERVAL_NS. The
+ * userspace hypervisor uses it to inject interrupts into the guest at
+ * regular intervals. Note that the clock interrupt, for example, is not
+ * handled here.
+ *
+ * Dies if the timer cannot be created or armed.
+ */
+void kvm__start_timer(struct kvm *kvm)
+{
+       const struct timespec period = {
+               .tv_sec         = TIMER_INTERVAL_NS / 1000000000,
+               .tv_nsec        = TIMER_INTERVAL_NS % 1000000000,
+       };
+       struct itimerspec its;
+       struct sigevent sev;
+
+       /* Deliver the signal to this very thread, not to the process. */
+       memset(&sev, 0, sizeof(struct sigevent));
+       sev._sigev_un._tid              = syscall(__NR_gettid);
+       sev.sigev_signo                 = SIGALRM;
+       sev.sigev_notify                = SIGEV_THREAD_ID;
+       sev.sigev_value.sival_int       = 0;
+
+       if (timer_create(CLOCK_REALTIME, &sev, &kvm->timerid) < 0)
+               die("timer_create()");
+
+       /* First expiry and reload interval are the same period. */
+       its.it_value                    = period;
+       its.it_interval                 = period;
+
+       if (timer_settime(kvm->timerid, 0, &its, NULL) < 0)
+               die("timer_settime()");
+}
+
+/*
+ * Delete the timer recorded in kvm->timerid, if one is set, and clear the
+ * field. Dies if timer_delete() fails.
+ */
+void kvm__stop_timer(struct kvm *kvm)
+{
+       if (kvm->timerid && timer_delete(kvm->timerid) < 0)
+               die("timer_delete()");
+
+       kvm->timerid = 0;
+}
+
+/*
+ * Set guest interrupt line @irq to @level through the KVM_IRQ_LINE ioctl
+ * on the VM file descriptor. Dies if the ioctl fails.
+ */
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+       struct kvm_irq_level irq_level = {
+               {
+                       .irq            = irq,
+               },
+               .level          = level,
+       };
+
+       if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+               die_perror("KVM_IRQ_LINE failed");
+}
+
+/*
+ * Pulse guest interrupt line @irq: raise it to 1 and immediately lower it
+ * back to 0.
+ */
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+       kvm__irq_line(kvm, irq, 1);
+       kvm__irq_line(kvm, irq, 0);
+}
+
+/*
+ * Hex-dump guest memory starting at guest-physical @addr, eight bytes per
+ * output line. @size is rounded down to a multiple of eight; nothing is
+ * printed if that leaves zero. The dump stops early at the first row whose
+ * host address is outside guest RAM.
+ */
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size)
+{
+       unsigned char *base, *row;
+       unsigned long off;
+
+       size &= ~7; /* whole 8-byte rows only */
+       if (size == 0)
+               return;
+
+       base = guest_flat_to_host(kvm, addr);
+
+       for (off = 0; off < size; off += 8) {
+               row = base + off;
+
+               if (!host_ptr_in_ram(kvm, row))
+                       break;
+
+               printf("  0x%08lx: %02x %02x %02x %02x  %02x %02x %02x %02x\n",
+                       addr + off, row[0], row[1], row[2], row[3],
+                                   row[4], row[5], row[6], row[7]);
+       }
+}
+
+/*
+ * Pause all VCPU threads and wait until each of them has acknowledged.
+ *
+ * Does nothing if the first VCPU has not been created or has no thread.
+ * Otherwise takes pause_lock, creates the pause_event eventfd, sends
+ * SIGKVMPAUSE to every VCPU thread and then reads the eventfd until the
+ * accumulated count reaches kvm->nrcpus.
+ *
+ * pause_lock is still held when this function returns; it is released by
+ * kvm__continue().
+ */
+void kvm__pause(void)
+{
+       int i, paused_vcpus = 0;
+
+       /* Check if the guest is running */
+       if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
+               return;
+
+       mutex_lock(&pause_lock);
+
+       pause_event = eventfd(0, 0);
+       if (pause_event < 0)
+               die("Failed creating pause notification event");
+       for (i = 0; i < kvm->nrcpus; i++)
+               pthread_kill(kvm_cpus[i]->thread, SIGKVMPAUSE);
+
+       /*
+        * kvm__notify_paused() writes 1 to the eventfd per VCPU; one read
+        * may return several acknowledgements summed together.
+        */
+       while (paused_vcpus < kvm->nrcpus) {
+               u64 cur_read;
+
+               if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
+                       die("Failed reading pause event");
+               paused_vcpus += cur_read;
+       }
+       close(pause_event);
+}
+
+/*
+ * Resume the VCPUs paused by kvm__pause() by releasing pause_lock, which
+ * the VCPU threads block on inside kvm__notify_paused().
+ *
+ * Does nothing if the first VCPU has not been created or has no thread
+ * (the same condition under which kvm__pause() never took the lock).
+ */
+void kvm__continue(void)
+{
+       /* Check if the guest is running */
+       if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
+               return;
+
+       mutex_unlock(&pause_lock);
+}
+
+/*
+ * Acknowledge a pause request and block until the guest is resumed.
+ *
+ * Adds 1 to the pause_event eventfd, which kvm__pause() is reading, then
+ * waits on pause_lock. The lock is released again right away: it is used
+ * only as a gate that stays closed until kvm__continue() unlocks it.
+ *
+ * NOTE(review): presumably called from the SIGKVMPAUSE handling path of
+ * each VCPU thread -- confirm against the caller.
+ */
+void kvm__notify_paused(void)
+{
+       u64 p = 1;
+
+       if (write(pause_event, &p, sizeof(p)) < 0)
+               die("Failed notifying of paused VCPU.");
+
+       mutex_lock(&pause_lock);
+       mutex_unlock(&pause_lock);
+}
diff --git a/tools/kvm/main.c b/tools/kvm/main.c
new file mode 100644 (file)
index 0000000..2138e7b
--- /dev/null
@@ -0,0 +1,14 @@
+#include <stdio.h>
+
+/* user defined header files */
+#include <kvm/kvm-cmd.h>
+
+/* Dispatch @argv through the kvm_commands table and return its result. */
+static int handle_kvm_command(int argc, char **argv)
+{
+       const char **args = (const char **)argv;
+
+       return handle_command(kvm_commands, argc, args);
+}
+
+/*
+ * Program entry point: drop argv[0] (the program name) and hand the
+ * remaining arguments to the command dispatcher.
+ */
+int main(int argc, char *argv[])
+{
+       return handle_kvm_command(argc - 1, &argv[1]);
+}
diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c
new file mode 100644 (file)
index 0000000..de7320f
--- /dev/null
@@ -0,0 +1,132 @@
+#include "kvm/kvm.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/brlock.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
+
+/* One registered MMIO region together with the callback that emulates it. */
+struct mmio_mapping {
+       /* Interval-tree node; initialized with [phys_addr, phys_addr + len) bounds. */
+       struct rb_int_node      node;
+       /* Handler invoked for guest accesses that hit this region. */
+       void                    (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
+       /* Opaque cookie passed back to mmio_fn as its last argument. */
+       void                    *ptr;
+};
+
+static struct rb_root mmio_tree = RB_ROOT;
+
+/*
+ * Look up the mapping that rb_int_search_range() reports for the range
+ * from @addr to @addr + @len. Returns NULL when there is none.
+ */
+static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len)
+{
+       struct rb_int_node *hit = rb_int_search_range(root, addr, addr + len);
+
+       return hit ? mmio_node(hit) : NULL;
+}
+
+/*
+ * Look up the mapping that rb_int_search_single() reports for the single
+ * address @addr. Returns NULL when there is none.
+ */
+static struct mmio_mapping *mmio_search_single(struct rb_root *root, u64 addr)
+{
+       struct rb_int_node *hit = rb_int_search_single(root, addr);
+
+       return hit ? mmio_node(hit) : NULL;
+}
+
+/* Insert @data's interval node into @root; returns rb_int_insert()'s result. */
+static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
+{
+       return rb_int_insert(root, &data->node);
+}
+
+/* Human-readable name of an access direction, for log messages. */
+static const char *to_direction(u8 is_write)
+{
+       return is_write ? "write" : "read";
+}
+
+/*
+ * kvm__register_mmio - register an emulation callback for an MMIO region
+ * @kvm:           guest descriptor (its vm_fd is used for the ioctl)
+ * @phys_addr:     guest-physical start of the region
+ * @phys_addr_len: length of the region in bytes
+ * @mmio_fn:       handler called by kvm__emulate_mmio() for this region
+ * @ptr:           opaque cookie handed back to @mmio_fn
+ *
+ * Registers the region as a coalesced MMIO zone with KVM and then inserts
+ * it into mmio_tree under the write lock.
+ *
+ * Returns false if the allocation or the ioctl fails; otherwise returns
+ * the result of the tree insertion converted to bool.
+ *
+ * NOTE(review): if the tree insertion reports failure, the mapping is not
+ * freed and the coalesced zone stays registered -- confirm what
+ * rb_int_insert() returns on failure and whether cleanup is needed here.
+ */
+bool kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr), void *ptr)
+{
+       struct mmio_mapping *mmio;
+       struct kvm_coalesced_mmio_zone zone;
+       int ret;
+
+       mmio = malloc(sizeof(*mmio));
+       if (mmio == NULL)
+               return false;
+
+       *mmio = (struct mmio_mapping) {
+               .node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+               .mmio_fn = mmio_fn,
+               .ptr    = ptr,
+       };
+
+       zone = (struct kvm_coalesced_mmio_zone) {
+               .addr   = phys_addr,
+               .size   = phys_addr_len,
+       };
+       ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+       if (ret < 0) {
+               free(mmio);
+               return false;
+       }
+
+       /* Writers are excluded from concurrent kvm__emulate_mmio() readers. */
+       br_write_lock();
+       ret = mmio_insert(&mmio_tree, mmio);
+       br_write_unlock();
+
+       return ret;
+}
+
+/*
+ * kvm__deregister_mmio - remove the MMIO mapping that covers @phys_addr
+ *
+ * Looks the mapping up under the write lock, asks KVM to unregister the
+ * coalesced zone, erases the node from mmio_tree and frees the mapping.
+ *
+ * Returns false if no mapping covers @phys_addr, true otherwise. The
+ * result of the unregister ioctl is not checked.
+ *
+ * NOTE(review): the zone passed to KVM_UNREGISTER_COALESCED_MMIO has
+ * size 1, whereas kvm__register_mmio() registered it with the full region
+ * length -- verify that the kernel actually drops the zone for such a
+ * request, or pass the registered length instead.
+ */
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
+{
+       struct mmio_mapping *mmio;
+       struct kvm_coalesced_mmio_zone zone;
+
+       br_write_lock();
+       mmio = mmio_search_single(&mmio_tree, phys_addr);
+       if (mmio == NULL) {
+               br_write_unlock();
+               return false;
+       }
+
+       zone = (struct kvm_coalesced_mmio_zone) {
+               .addr   = phys_addr,
+               .size   = 1,
+       };
+       ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+       rb_int_erase(&mmio_tree, &mmio->node);
+       br_write_unlock();
+
+       /* Freed outside the lock; the node is no longer reachable. */
+       free(mmio);
+       return true;
+}
+
+/*
+ * kvm__emulate_mmio - dispatch a guest MMIO access to its registered handler
+ *
+ * Searches mmio_tree under the read lock for a mapping matching the
+ * accessed range and calls its callback. Accesses that hit no mapping are
+ * reported on stderr and otherwise ignored. Always returns true.
+ */
+bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+       struct mmio_mapping *handler;
+
+       br_read_lock();
+
+       handler = mmio_search(&mmio_tree, phys_addr, len);
+       if (!handler)
+               fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
+                       to_direction(is_write), phys_addr, len);
+       else
+               handler->mmio_fn(phys_addr, data, len, is_write, handler->ptr);
+
+       br_read_unlock();
+
+       return true;
+}
diff --git a/tools/kvm/mptable.c b/tools/kvm/mptable.c
new file mode 100644 (file)
index 0000000..cfc7d79
--- /dev/null
@@ -0,0 +1,284 @@
+#include "kvm/kvm.h"
+#include "kvm/bios.h"
+#include "kvm/apic.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+
+#include <linux/kernel.h>
+#include <string.h>
+
+/*
+ * If kernel is not configured yet this macro
+ * might not be defined, fix it by own definition
+ */
+#ifndef NR_CPUS
+#define NR_CPUS KVM_NR_CPUS
+#endif
+
+#include <asm/mpspec_def.h>
+#include <linux/types.h>
+
+/*
+ * FIXME: please make sure the addresses borrowed
+ * for apic/ioapic never overlap! We need a global
+ * tracker of system resources (including io, mmio,
+ * and friends).
+ */
+
+/*
+ * Sum the @len bytes starting at @mp and return the low eight bits of the
+ * total. Callers negate the result to obtain a checksum byte that makes
+ * the table sum to zero.
+ */
+static unsigned int mpf_checksum(unsigned char *mp, int len)
+{
+       unsigned int total = 0;
+
+       for (; len != 0; len--, mp++)
+               total += *mp;
+
+       return total & 0xFF;
+}
+
+/*
+ * Build the cpuflag value for processor @cpu out of @ncpu: CPU_ENABLED
+ * when @cpu is below @ncpu, plus CPU_BOOTPROCESSOR for processor 0.
+ */
+static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu)
+{
+       unsigned int flags = 0;
+
+       if (cpu < ncpu)
+               flags |= CPU_ENABLED;
+       if (cpu == 0)
+               flags |= CPU_BOOTPROCESSOR;
+
+       return flags;
+}
+
+#define MPTABLE_SIG_FLOATING   "_MP_"
+#define MPTABLE_OEM            "KVMCPU00"
+#define MPTABLE_PRODUCTID      "0.1         "
+#define MPTABLE_PCIBUSTYPE     "PCI   "
+#define MPTABLE_ISABUSTYPE     "ISA   "
+
+#define MPTABLE_STRNCPY(d, s)  memcpy(d, s, sizeof(d))
+
+/* It should be more than enough */
+#define MPTABLE_MAX_SIZE       (32 << 20)
+
+/*
+ * Too many cpus will require x2apic mode
+ * and rather ACPI support so we limit it
+ * here for a while.
+ */
+#define MPTABLE_MAX_CPUS       255
+
+/*
+ * Fill *@mpc_intsrc as an MP_INTSRC entry of type mp_INT with default
+ * polarity/trigger flags, routing @srcbusirq on bus @srcbusid to input
+ * @dstirq of APIC @dstapic. The whole entry is overwritten.
+ */
+static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc,
+                               u16 srcbusid,   u16 srcbusirq,
+                               u16 dstapic,    u16 dstirq)
+{
+       *mpc_intsrc = (struct mpc_intsrc) {
+               .type           = MP_INTSRC,
+               .irqtype        = mp_INT,
+               .irqflag        = MP_IRQDIR_DEFAULT,
+               .srcbus         = srcbusid,
+               .srcbusirq      = srcbusirq,
+               .dstapic        = dstapic,
+               .dstirq         = dstirq
+       };
+}
+
+/**
+ * mptable_setup - create mptable and fill guest memory with it
+ * @kvm:   guest descriptor whose memory receives the table
+ * @ncpus: number of processors to describe (capped at MPTABLE_MAX_CPUS)
+ *
+ * The table is assembled in a temporary host buffer in this order:
+ * fixed header, one entry per CPU, the PCI and ISA bus entries, one
+ * IO-APIC entry, one interrupt source per PCI device IRQ line, the two
+ * local interrupt entries (LINT0, LINT1) and finally the floating
+ * pointer structure. It is then copied to the 16-byte aligned guest
+ * address right after the BIOS ROM image.
+ *
+ * Dies on allocation failure or if the finished table is too big.
+ */
+void mptable_setup(struct kvm *kvm, unsigned int ncpus)
+{
+       unsigned long real_mpc_table, real_mpf_intel, size;
+       struct mpf_intel *mpf_intel;
+       struct mpc_table *mpc_table;
+       struct mpc_cpu *mpc_cpu;
+       struct mpc_bus *mpc_bus;
+       struct mpc_ioapic *mpc_ioapic;
+       struct mpc_intsrc *mpc_intsrc;
+       struct rb_node *pci_tree;
+
+       const int pcibusid = 0;
+       const int isabusid = 1;
+
+       unsigned int i, nentries = 0;
+       unsigned int ioapicid;
+       /* Always points just past the last entry written so far. */
+       void *last_addr;
+
+       /* That is where MP table will be in guest memory */
+       real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16);
+
+       if (ncpus > MPTABLE_MAX_CPUS) {
+               pr_warning("Too many cpus: %d limited to %d",
+                       ncpus, MPTABLE_MAX_CPUS);
+               ncpus = MPTABLE_MAX_CPUS;
+       }
+
+       mpc_table = calloc(1, MPTABLE_MAX_SIZE);
+       if (!mpc_table)
+               die("Out of memory");
+
+       MPTABLE_STRNCPY(mpc_table->signature,   MPC_SIGNATURE);
+       MPTABLE_STRNCPY(mpc_table->oem,         MPTABLE_OEM);
+       MPTABLE_STRNCPY(mpc_table->productid,   MPTABLE_PRODUCTID);
+
+       mpc_table->spec         = 4;
+       mpc_table->lapic        = APIC_ADDR(0);
+       mpc_table->oemcount     = ncpus; /* will be updated again at end */
+
+       /*
+        * CPUs enumeration. Technically speaking we should
+        * ask either host or HV for apic version supported
+        * but for a while we simply put some random value
+        * here.
+        */
+       mpc_cpu = (void *)&mpc_table[1];
+       for (i = 0; i < ncpus; i++) {
+               mpc_cpu->type           = MP_PROCESSOR;
+               mpc_cpu->apicid         = i;
+               mpc_cpu->apicver        = KVM_APIC_VERSION;
+               mpc_cpu->cpuflag        = gen_cpu_flag(i, ncpus);
+               mpc_cpu->cpufeature     = 0x600; /* some default value */
+               mpc_cpu->featureflag    = 0x201; /* some default value */
+               mpc_cpu++;
+       }
+
+       last_addr = (void *)mpc_cpu;
+       nentries += ncpus;
+
+       /*
+        * PCI buses.
+        * FIXME: Some callback here to obtain real number
+        * of PCI buses present in system.
+        */
+       mpc_bus         = last_addr;
+       mpc_bus->type   = MP_BUS;
+       mpc_bus->busid  = pcibusid;
+       MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE);
+
+       last_addr = (void *)&mpc_bus[1];
+       nentries++;
+
+       /*
+        * ISA bus.
+        * FIXME: Same issue as for PCI bus.
+        */
+       mpc_bus         = last_addr;
+       mpc_bus->type   = MP_BUS;
+       mpc_bus->busid  = isabusid;
+       MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE);
+
+       last_addr = (void *)&mpc_bus[1];
+       nentries++;
+
+       /*
+        * IO-APIC chip. Its id is placed just above the CPU APIC ids
+        * (0 .. ncpus - 1) handed out in the loop above.
+        */
+       ioapicid                = ncpus + 1;
+       mpc_ioapic              = last_addr;
+       mpc_ioapic->type        = MP_IOAPIC;
+       mpc_ioapic->apicid      = ioapicid;
+       mpc_ioapic->apicver     = KVM_APIC_VERSION;
+       mpc_ioapic->flags       = MPC_APIC_USABLE;
+       mpc_ioapic->apicaddr    = IOAPIC_ADDR(0);
+
+       last_addr = (void *)&mpc_ioapic[1];
+       nentries++;
+
+       /*
+        * IRQ sources.
+        *
+        * FIXME: Same issue as with buses. We definitely
+        * need a kind of collector routine which enumerates
+        * the resources used first and passes them here.
+        * At the moment we know we have only a virtio block device
+        * and virtio console but this is g00berfish.
+        *
+        * Also note we use PCI irqs here, none for the ISA bus yet.
+        */
+
+       for (pci_tree = irq__get_pci_tree(); pci_tree; pci_tree = rb_next(pci_tree)) {
+               struct pci_dev *dev = rb_entry(pci_tree, struct pci_dev, node);
+               struct irq_line *irq_line;
+
+               list_for_each_entry(irq_line, &dev->lines, node) {
+                       unsigned char srcbusirq;
+
+                       /* Device id in the upper bits, zero-based pin in bits 0-1. */
+                       srcbusirq = (dev->id << 2) | (dev->pin - 1);
+
+                       mpc_intsrc = last_addr;
+
+                       mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, irq_line->line);
+                       last_addr = (void *)&mpc_intsrc[1];
+                       nentries++;
+               }
+       }
+
+       /*
+        * Local IRQs assignment (LINT0, LINT1)
+        *
+        * NOTE(review): irqtype is assigned twice in a row for LINT0, so
+        * the mp_ExtINT store is dead and the entry ends up as mp_INT.
+        * Confirm which of the two values is intended.
+        */
+       mpc_intsrc              = last_addr;
+       mpc_intsrc->type        = MP_LINTSRC;
+       mpc_intsrc->irqtype     = mp_ExtINT;
+       mpc_intsrc->irqtype     = mp_INT;
+       mpc_intsrc->irqflag     = MP_IRQDIR_DEFAULT;
+       mpc_intsrc->srcbus      = isabusid;
+       mpc_intsrc->srcbusirq   = 0;
+       mpc_intsrc->dstapic     = 0; /* FIXME: BSP apic */
+       mpc_intsrc->dstirq      = 0; /* LINT0 */
+
+       last_addr = (void *)&mpc_intsrc[1];
+       nentries++;
+
+       mpc_intsrc              = last_addr;
+       mpc_intsrc->type        = MP_LINTSRC;
+       mpc_intsrc->irqtype     = mp_NMI;
+       mpc_intsrc->irqflag     = MP_IRQDIR_DEFAULT;
+       mpc_intsrc->srcbus      = isabusid;
+       mpc_intsrc->srcbusirq   = 0;
+       mpc_intsrc->dstapic     = 0; /* FIXME: BSP apic */
+       mpc_intsrc->dstirq      = 1; /* LINT1 */
+
+       last_addr = (void *)&mpc_intsrc[1];
+       nentries++;
+
+       /*
+        * Floating MP table finally. It is placed in the same buffer,
+        * at the next 16-byte aligned offset after the last entry.
+        */
+       real_mpf_intel  = ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16);
+       mpf_intel       = (void *)((unsigned long)mpc_table + real_mpf_intel);
+
+       MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING);
+       mpf_intel->length       = 1;
+       mpf_intel->specification= 4;
+       mpf_intel->physptr      = (unsigned int)real_mpc_table;
+       mpf_intel->checksum     = -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel));
+
+       /*
+        * No last_addr increment here please, we need the last
+        * active position here to compute the table size.
+        */
+
+       /*
+        * Don't forget to update the header in the fixed table.
+        * The checksum is computed last, over the final header contents.
+        */
+       mpc_table->oemcount     = nentries;
+       mpc_table->length       = last_addr - (void *)mpc_table;
+       mpc_table->checksum     = -mpf_checksum((unsigned char *)mpc_table, mpc_table->length);
+
+
+       /*
+        * We will copy the whole table, no need to separate
+        * the floating structure and the table itself.
+        */
+       size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table;
+
+       /*
+        * The final check -- never get out of the system bios
+        * area. Let's also check for an allocated memory overrun;
+        * in reality it's late but still useful.
+        */
+
+       if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) ||
+           size > MPTABLE_MAX_SIZE)
+               die("MP table is too big");
+
+       /*
+        * OK, it is time to move it to guest memory.
+        */
+       memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size);
+
+       free(mpc_table);
+}
diff --git a/tools/kvm/net/uip/arp.c b/tools/kvm/net/uip/arp.c
new file mode 100644 (file)
index 0000000..98423da
--- /dev/null
@@ -0,0 +1,30 @@
+#include "kvm/uip.h"
+
+/*
+ * uip_tx_do_arp - answer a guest ARP packet that asks for the host address
+ * @arg: the transmitted frame and its uip context
+ *
+ * When the ARP target IP equals info->host_ip, a reply (opcode 2) is built
+ * in a buffer cloned from the request and queued for delivery to the
+ * guest. Packets for any other target are ignored.
+ *
+ * Always returns 0.
+ */
+int uip_tx_do_arp(struct uip_tx_arg *arg)
+{
+       struct uip_arp *arp, *arp2;
+       struct uip_info *info;
+       struct uip_buf *buf;
+
+       info = arg->info;
+       arp  = (struct uip_arp *)(arg->eth);
+
+       /*
+        * Decide before taking a buffer: a buffer obtained from
+        * uip_buf_clone() is marked in-use and would never return to the
+        * pool if we took it and then did not queue it.
+        */
+       if (arp->dip != htonl(info->host_ip))
+               return 0;
+
+       buf  = uip_buf_clone(arg);
+       arp2 = (struct uip_arp *)(buf->eth);
+
+       /*
+        * ARP reply code: 2
+        */
+       arp2->op   = htons(0x2);
+       arp2->dmac = arp->smac;
+       arp2->dip  = arp->sip;
+       arp2->smac = info->host_mac;
+       arp2->sip  = htonl(info->host_ip);
+
+       uip_buf_set_used(info, buf);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/buf.c b/tools/kvm/net/uip/buf.c
new file mode 100644 (file)
index 0000000..5e564a9
--- /dev/null
@@ -0,0 +1,114 @@
+#include "kvm/uip.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+/*
+ * Take one buffer in state UIP_BUF_STATUS_USED out of the pool, sleeping
+ * on buf_used_cond until info->buf_used_nr is positive. The buffer is
+ * switched to UIP_BUF_STATUS_INUSE before the lock is dropped so nobody
+ * else can pick it. Returns NULL if the list scan finds no such buffer.
+ */
+struct uip_buf *uip_buf_get_used(struct uip_info *info)
+{
+       struct uip_buf *cur, *picked = NULL;
+
+       mutex_lock(&info->buf_lock);
+
+       while (info->buf_used_nr <= 0)
+               pthread_cond_wait(&info->buf_used_cond, &info->buf_lock);
+
+       list_for_each_entry(cur, &info->buf_head, list) {
+               if (cur->status != UIP_BUF_STATUS_USED)
+                       continue;
+
+               /* Claim it right away, while still holding the lock. */
+               cur->status = UIP_BUF_STATUS_INUSE;
+               info->buf_used_nr--;
+               picked = cur;
+               break;
+       }
+
+       mutex_unlock(&info->buf_lock);
+
+       return picked;
+}
+
+/*
+ * Take one buffer in state UIP_BUF_STATUS_FREE out of the pool, sleeping
+ * on buf_free_cond until info->buf_free_nr is positive. The buffer is
+ * switched to UIP_BUF_STATUS_INUSE before the lock is dropped so nobody
+ * else can pick it. Returns NULL if the list scan finds no such buffer.
+ */
+struct uip_buf *uip_buf_get_free(struct uip_info *info)
+{
+       struct uip_buf *cur, *picked = NULL;
+
+       mutex_lock(&info->buf_lock);
+
+       while (info->buf_free_nr <= 0)
+               pthread_cond_wait(&info->buf_free_cond, &info->buf_lock);
+
+       list_for_each_entry(cur, &info->buf_head, list) {
+               if (cur->status != UIP_BUF_STATUS_FREE)
+                       continue;
+
+               /* Claim it right away, while still holding the lock. */
+               cur->status = UIP_BUF_STATUS_INUSE;
+               info->buf_free_nr--;
+               picked = cur;
+               break;
+       }
+
+       mutex_unlock(&info->buf_lock);
+
+       return picked;
+}
+
+/*
+ * Mark @buf as UIP_BUF_STATUS_USED, bump info->buf_used_nr and wake one
+ * waiter in uip_buf_get_used(). Returns @buf.
+ */
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf)
+{
+       mutex_lock(&info->buf_lock);
+
+       buf->status = UIP_BUF_STATUS_USED;
+       info->buf_used_nr++;
+       pthread_cond_signal(&info->buf_used_cond);
+
+       mutex_unlock(&info->buf_lock);
+
+       return buf;
+}
+
+/*
+ * Mark @buf as UIP_BUF_STATUS_FREE, bump info->buf_free_nr and wake one
+ * waiter in uip_buf_get_free(). Returns @buf.
+ */
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf)
+{
+       mutex_lock(&info->buf_lock);
+
+       buf->status = UIP_BUF_STATUS_FREE;
+       info->buf_free_nr++;
+       pthread_cond_signal(&info->buf_free_cond);
+
+       mutex_unlock(&info->buf_lock);
+
+       return buf;
+}
+
+/*
+ * uip_buf_clone - copy a transmitted frame into a fresh pool buffer
+ *
+ * Takes a free buffer (blocking until one is available), copies the vnet
+ * header and ethernet frame of @arg into it and swaps the addressing for
+ * a reply: source becomes the host MAC, destination the sender's MAC.
+ *
+ * The returned buffer is in the in-use state; the caller must hand it to
+ * uip_buf_set_used() or uip_buf_set_free().
+ *
+ * NOTE(review): the copies are sized by arg->vnet_len / arg->eth_len with
+ * no check against the capacity the pool buffers were allocated with --
+ * confirm the callers bound these lengths.
+ */
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg)
+{
+       struct uip_buf *buf;
+       struct uip_eth *eth2;
+       struct uip_info *info;
+
+       info = arg->info;
+
+       /*
+        * Get buffer from device to guest
+        */
+       buf = uip_buf_get_free(info);
+
+       /*
+        * Clone buffer
+        */
+       memcpy(buf->vnet, arg->vnet, arg->vnet_len);
+       memcpy(buf->eth, arg->eth, arg->eth_len);
+       buf->vnet_len   = arg->vnet_len;
+       buf->eth_len    = arg->eth_len;
+
+       eth2            = (struct uip_eth *)buf->eth;
+       eth2->src       = info->host_mac;
+       eth2->dst       = arg->eth->src;
+
+       return buf;
+}
diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
new file mode 100644 (file)
index 0000000..2e7603c
--- /dev/null
@@ -0,0 +1,190 @@
+#include "kvm/mutex.h"
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+/*
+ * uip_tx - process one frame sent by the guest
+ * @iov:  iov[0] holds the virtio-net header, iov[1..out-1] the frame
+ * @out:  number of entries in @iov
+ * @info: uip context
+ *
+ * A frame spread over several iov entries is first gathered into one
+ * temporary linear buffer. ARP and IPv4 frames are passed to their
+ * handlers; every other ethernet type is dropped silently.
+ *
+ * Returns the number of bytes consumed (header plus frame), or -1 if the
+ * temporary buffer cannot be allocated.
+ */
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
+{
+       struct virtio_net_hdr *vnet;
+       struct uip_tx_arg arg;
+       int eth_len, vnet_len;
+       struct uip_eth *eth;
+       u8 *linear = NULL;
+       int i;
+
+       /*
+        * Buffer from guest to device
+        */
+       vnet_len = iov[0].iov_len;
+       vnet     = iov[0].iov_base;
+
+       eth_len  = iov[1].iov_len;
+       eth      = iov[1].iov_base;
+
+       /*
+        * The ethernet frame spans more than one iov entry:
+        * gather the pieces into one linear buffer.
+        */
+       if (out > 2) {
+               u8 *cursor;
+
+               eth_len = 0;
+               for (i = 1; i < out; i++)
+                       eth_len += iov[i].iov_len;
+
+               linear = malloc(eth_len);
+               if (!linear)
+                       return -1;
+
+               cursor = linear;
+               for (i = 1; i < out; i++) {
+                       memcpy(cursor, iov[i].iov_base, iov[i].iov_len);
+                       cursor += iov[i].iov_len;
+               }
+
+               eth = (struct uip_eth *)linear;
+       }
+
+       memset(&arg, 0, sizeof(arg));
+
+       arg.info        = info;
+       arg.vnet        = vnet;
+       arg.vnet_len    = vnet_len;
+       arg.eth         = eth;
+       arg.eth_len     = eth_len;
+
+       /*
+        * Dispatch on the ethernet type
+        */
+       switch (ntohs(eth->type)) {
+       case UIP_ETH_P_ARP:
+               uip_tx_do_arp(&arg);
+               break;
+       case UIP_ETH_P_IP:
+               uip_tx_do_ipv4(&arg);
+               break;
+       default:
+               break;
+       }
+
+       /* NULL unless the frame had to be linearized above. */
+       free(linear);
+
+       return vnet_len + eth_len;
+}
+
+/*
+ * uip_rx - deliver one queued frame to the guest
+ * @iov:  iov[0] receives the virtio-net header, iov[1..in-1] the frame
+ * @in:   number of entries in @iov
+ * @info: uip context
+ *
+ * Sleeps until a buffer is queued for the guest, scatters it into @iov and
+ * returns the buffer to the free pool.
+ *
+ * Returns the number of bytes delivered (header plus frame), or -1 if the
+ * header does not fit into iov[0] or the frame does not fit into the
+ * remaining entries.
+ */
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
+{
+       struct uip_buf *buf;
+       int remaining;
+       int vnet_len;
+       int ret = -1;
+       char *src;
+       int i;
+
+       /*
+        * Sleep until there is a buffer for guest
+        */
+       buf = uip_buf_get_used(info);
+
+       /*
+        * Fill device to guest buffer, vnet hdr first
+        */
+       vnet_len = iov[0].iov_len;
+       if (buf->vnet_len > vnet_len)
+               goto out;
+       memcpy(iov[0].iov_base, buf->vnet, buf->vnet_len);
+
+       /*
+        * Then the frame itself, filling each guest entry in turn until
+        * nothing is left to copy.
+        */
+       remaining = buf->eth_len;
+       src = buf->eth;
+       for (i = 1; i < in; i++) {
+               int room = iov[i].iov_len;
+               int chunk = remaining > room ? room : remaining;
+
+               memcpy(iov[i].iov_base, src, chunk);
+               remaining -= chunk;
+               src += chunk;
+
+               if (!remaining)
+                       break;
+       }
+
+       /* Leftover bytes mean the guest did not provide enough room. */
+       if (remaining)
+               pr_warning("uip_rx error");
+       else
+               ret = buf->vnet_len + buf->eth_len;
+
+out:
+       uip_buf_set_free(info, buf);
+       return ret;
+}
+
+/*
+ * uip_init - initialize the uip context and its buffer pool
+ * @info: context to set up; info->buf_nr gives the number of buffers
+ *
+ * Initializes the socket lists, the locks and the condition variables,
+ * allocates info->buf_nr zeroed buffers (each with room for a virtio-net
+ * header and a 64 KiB frame plus pseudo header), marks them all free and
+ * finally reads the host DNS configuration.
+ *
+ * Returns 0 on success. Returns -1 if an allocation fails; in that case
+ * every buffer allocated so far has been released again.
+ */
+int uip_init(struct uip_info *info)
+{
+       struct list_head *buf_head = &info->buf_head;
+       struct uip_buf *buf, *next;
+       int buf_nr = info->buf_nr;
+       int i;
+
+       INIT_LIST_HEAD(&info->udp_socket_head);
+       INIT_LIST_HEAD(&info->tcp_socket_head);
+       INIT_LIST_HEAD(buf_head);
+
+       pthread_mutex_init(&info->udp_socket_lock, NULL);
+       pthread_mutex_init(&info->tcp_socket_lock, NULL);
+       pthread_mutex_init(&info->buf_lock, NULL);
+
+       pthread_cond_init(&info->buf_used_cond, NULL);
+       pthread_cond_init(&info->buf_free_cond, NULL);
+
+       for (i = 0; i < buf_nr; i++) {
+               buf = calloc(1, sizeof(*buf));
+               if (!buf)
+                       goto out_free;
+
+               buf->status     = UIP_BUF_STATUS_FREE;
+               buf->info       = info;
+               buf->id         = i;
+               /* Link it first so that out_free releases this one too. */
+               list_add_tail(&buf->list, buf_head);
+
+               buf->vnet_len   = sizeof(struct virtio_net_hdr);
+               buf->vnet       = calloc(1, buf->vnet_len);
+               buf->eth_len    = 1024*64 + sizeof(struct uip_pseudo_hdr);
+               buf->eth        = calloc(1, buf->eth_len);
+               if (!buf->vnet || !buf->eth)
+                       goto out_free;
+       }
+
+       info->buf_free_nr = buf_nr;
+       info->buf_used_nr = 0;
+
+       /* Best effort: a missing resolver configuration is not fatal. */
+       uip_dhcp_get_dns(info);
+
+       return 0;
+
+out_free:
+       list_for_each_entry_safe(buf, next, buf_head, list) {
+               list_del(&buf->list);
+               free(buf->eth);
+               free(buf->vnet);
+               free(buf);
+       }
+
+       return -1;
+}
diff --git a/tools/kvm/net/uip/csum.c b/tools/kvm/net/uip/csum.c
new file mode 100644 (file)
index 0000000..7ca8bad
--- /dev/null
@@ -0,0 +1,92 @@
+#include "kvm/uip.h"
+
+/*
+ * One's-complement checksum over @count bytes at @addr, seeded with @csum.
+ * The data is summed as 16-bit words in host memory order, a trailing odd
+ * byte is added on its own, the carries are folded back into the low 16
+ * bits and the complement of the result is returned.
+ */
+static u16 uip_csum(u16 csum, u8 *addr, u16 count)
+{
+       long sum = csum;
+
+       for (; count > 1; count -= 2, addr += 2)
+               sum += *(u16 *)addr;
+
+       if (count)
+               sum += *(unsigned char *)addr;
+
+       /* Fold the carries until the sum fits in 16 bits. */
+       while (sum >> 16)
+               sum = (sum >> 16) + (sum & 0xffff);
+
+       return ~sum;
+}
+
+/* Checksum of the IP header: uip_ip_hdrlen(ip) bytes starting at ip->vhl. */
+u16 uip_csum_ip(struct uip_ip *ip)
+{
+       return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip));
+}
+
+/*
+ * Compute the ICMP checksum starting at icmp->type, store it in
+ * icmp->csum and return it.
+ *
+ * NOTE(review): the summed length is the IP total length minus the IP
+ * header length minus 8. Whether the extra 8 bytes are right depends on
+ * what uip_ip_hdrlen() covers -- confirm. htons() is used here to convert
+ * ip->len from network order; it performs the same swap as ntohs().
+ */
+u16 uip_csum_icmp(struct uip_icmp *icmp)
+{
+       struct uip_ip *ip;
+
+       ip = &icmp->ip;
+       return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */
+}
+
+/*
+ * Checksum of a UDP datagram including its pseudo header.
+ *
+ * The pseudo header is written into the packet buffer right behind the
+ * datagram (after one zero pad byte when the length is odd) so that a
+ * single pass over contiguous memory covers both. The buffer must have
+ * room for those extra bytes.
+ */
+u16 uip_csum_udp(struct uip_udp *udp)
+{
+       struct uip_ip *ip = &udp->ip;
+       u8 *start = (u8 *)&udp->sport;
+       struct uip_pseudo_hdr hdr;
+       int total;
+
+       hdr.sip   = ip->sip;
+       hdr.dip   = ip->dip;
+       hdr.zero  = 0;
+       hdr.proto = ip->proto;
+       hdr.len   = udp->len;
+
+       total = uip_udp_len(udp);
+
+       /* Pad to an even length so the pseudo header starts word-aligned. */
+       if (total % 2)
+               start[total++] = 0;
+
+       memcpy(start + total, &hdr, sizeof(hdr));
+
+       return uip_csum(0, start, total + sizeof(hdr));
+}
+
+/*
+ * Checksum of a TCP segment including its pseudo header.
+ *
+ * The segment length is the IP total length minus the IP header length.
+ * The pseudo header is written into the packet buffer right behind the
+ * segment (after one zero pad byte when the length is odd) so that a
+ * single pass over contiguous memory covers both. A warning is printed
+ * when the segment exceeds UIP_MAX_TCP_PAYLOAD + 20 bytes.
+ */
+u16 uip_csum_tcp(struct uip_tcp *tcp)
+{
+       struct uip_ip *ip = &tcp->ip;
+       u8 *start = (u8 *)&tcp->sport;
+       struct uip_pseudo_hdr hdr;
+       u16 tcp_len;
+       int total;
+
+       tcp_len   = ntohs(ip->len) - uip_ip_hdrlen(ip);
+
+       hdr.sip   = ip->sip;
+       hdr.dip   = ip->dip;
+       hdr.zero  = 0;
+       hdr.proto = ip->proto;
+       hdr.len   = htons(tcp_len);
+
+       if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20)
+               pr_warning("tcp_len(%d) is too large", tcp_len);
+
+       /* Pad to an even length so the pseudo header starts word-aligned. */
+       total = tcp_len;
+       if (total % 2)
+               start[total++] = 0;
+
+       memcpy(start + total, &hdr, sizeof(hdr));
+
+       return uip_csum(0, start, total + sizeof(hdr));
+}
diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c
new file mode 100644 (file)
index 0000000..bd3c53b
--- /dev/null
@@ -0,0 +1,195 @@
+#include "kvm/uip.h"
+
+#include <arpa/inet.h>
+
+/*
+ * True when the first DHCP option is a message-type option carrying
+ * UIP_DHCP_DISCOVER.  Only the first three option bytes are inspected.
+ */
+static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp)
+{
+       const u8 *opt = dhcp->option;
+
+       if (opt[0] != UIP_DHCP_TAG_MSG_TYPE)
+               return false;
+       if (opt[1] != UIP_DHCP_TAG_MSG_TYPE_LEN)
+               return false;
+
+       return opt[2] == UIP_DHCP_DISCOVER;
+}
+
+/*
+ * True when the first DHCP option is a message-type option carrying
+ * UIP_DHCP_REQUEST.  Only the first three option bytes are inspected.
+ */
+static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp)
+{
+       const u8 *opt = dhcp->option;
+
+       if (opt[0] != UIP_DHCP_TAG_MSG_TYPE)
+               return false;
+       if (opt[1] != UIP_DHCP_TAG_MSG_TYPE_LEN)
+               return false;
+
+       return opt[2] == UIP_DHCP_REQUEST;
+}
+
+/*
+ * Decide whether a UDP frame from the guest is a DHCP client message:
+ * it must go from the DHCP client port to the DHCP server port and carry
+ * the DHCP magic cookie.
+ */
+bool uip_udp_is_dhcp(struct uip_udp *udp)
+{
+       struct uip_dhcp *dhcp = (struct uip_dhcp *)udp;
+
+       if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT)
+               return false;
+
+       if (ntohs(udp->dport) != UIP_DHCP_PORT_SERVER)
+               return false;
+
+       return ntohl(dhcp->magic_cookie) == UIP_DHCP_MAGIC_COOKIE;
+}
+
+/*
+ * Read the host's /etc/resolv.conf and record its "domain" and "nameserver"
+ * entries in @info.
+ *
+ * The file is consumed as whitespace-separated key/value token pairs.  Up to
+ * UIP_DHCP_MAX_DNS_SERVER_NR name servers are stored (host byte order) in
+ * info->dns_ip[]; the domain name is duplicated into info->domain_name.
+ *
+ * Returns 0 if at least one nameserver entry was parsed, -1 otherwise
+ * (including when the file cannot be opened).
+ */
+int uip_dhcp_get_dns(struct uip_info *info)
+{
+       char key[256], val[256];
+       struct in_addr addr;
+       int ret = -1;
+       int n = 0;
+       FILE *fp;
+       u32 ip;
+
+       fp = fopen("/etc/resolv.conf", "r");
+       if (!fp)
+               return ret;     /* nothing to close: fclose(NULL) is not allowed */
+
+       /*
+        * Field widths keep tokens inside the 256-byte buffers.  Looping on
+        * the conversion count (instead of feof()) also terminates on a read
+        * error, where feof() would stay false forever.
+        */
+       while (fscanf(fp, "%255s %255s\n", key, val) == 2) {
+               if (strncmp("domain", key, 6) == 0)
+                       info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN);
+               else if (strncmp("nameserver", key, 10) == 0) {
+                       if (!inet_aton(val, &addr))
+                               continue;
+                       ip = ntohl(addr.s_addr);
+                       if (n < UIP_DHCP_MAX_DNS_SERVER_NR)
+                               info->dns_ip[n++] = ip;
+                       ret = 0;
+               }
+       }
+
+       fclose(fp);
+       return ret;
+}
+
+/*
+ * Append the domain-name option (if info->domain_name is set) and one DNS
+ * server option per non-zero info->dns_ip[] entry to the option buffer.
+ *
+ * @opt: start of the DHCP option area
+ * @i:   index of the next free byte in @opt
+ *
+ * Returns the index of the next free byte after the appended options.
+ */
+static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i)
+{
+       u8 domain_name_len;
+       u32 addr;
+       int n;
+
+       if (info->domain_name) {
+               domain_name_len = strlen(info->domain_name);
+               opt[i++]        = UIP_DHCP_TAG_DOMAIN_NAME;
+               opt[i++]        = domain_name_len;
+               memcpy(&opt[i], info->domain_name, domain_name_len);
+               i               += domain_name_len;
+       }
+
+       for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) {
+               if (info->dns_ip[n] == 0)
+                       continue;
+               opt[i++]        = UIP_DHCP_TAG_DNS_SERVER;
+               opt[i++]        = UIP_DHCP_TAG_DNS_SERVER_LEN;
+               /*
+                * The value lands at an arbitrary byte offset (it follows a
+                * variable-length domain name), so copy it bytewise rather
+                * than storing through a casted u32 pointer.
+                */
+               addr            = htonl(info->dns_ip[n]);
+               memcpy(&opt[i], &addr, sizeof(addr));
+               i               += UIP_DHCP_TAG_DNS_SERVER_LEN;
+       }
+
+       return i;
+}
+/*
+ * Write one option carrying a 32-bit value: tag byte, length byte, then
+ * @val in network byte order.  The value is copied bytewise because option
+ * payloads are not 4-byte aligned.  Returns the next free index.
+ */
+static int uip_dhcp_opt_put_u32(u8 *opt, int i, u8 tag, u8 len, u32 val)
+{
+       u32 be = htonl(val);
+
+       opt[i++]        = tag;
+       opt[i++]        = len;
+       memcpy(&opt[i], &be, sizeof(be));
+
+       return i + len;
+}
+
+/*
+ * Fill the option area of a DHCP reply: message type, server id, lease
+ * time, subnet mask, router, optional domain name / DNS servers, and the
+ * end tag.  Always returns 0.
+ */
+static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type)
+{
+       int i = 0;
+       u8 *opt;
+
+       opt             = dhcp->option;
+
+       opt[i++]        = UIP_DHCP_TAG_MSG_TYPE;
+       opt[i++]        = UIP_DHCP_TAG_MSG_TYPE_LEN;
+       opt[i++]        = reply_msg_type;
+
+       i = uip_dhcp_opt_put_u32(opt, i, UIP_DHCP_TAG_SERVER_ID,
+                                UIP_DHCP_TAG_SERVER_ID_LEN, info->host_ip);
+
+       i = uip_dhcp_opt_put_u32(opt, i, UIP_DHCP_TAG_LEASE_TIME,
+                                UIP_DHCP_TAG_LEASE_TIME_LEN, UIP_DHCP_LEASE_TIME);
+
+       i = uip_dhcp_opt_put_u32(opt, i, UIP_DHCP_TAG_SUBMASK,
+                                UIP_DHCP_TAG_SUBMASK_LEN, info->guest_netmask);
+
+       /* The host acts as the guest's router */
+       i = uip_dhcp_opt_put_u32(opt, i, UIP_DHCP_TAG_ROUTER,
+                                UIP_DHCP_TAG_ROUTER_LEN, info->host_ip);
+
+       i               = uip_dhcp_fill_option_name_and_server(info, opt, i);
+
+       opt[i++]        = UIP_DHCP_TAG_END;
+
+       return 0;
+}
+
+/*
+ * Turn the cloned request in @buf into a DHCP reply and fill @sk with the
+ * address/port tuple used to build the surrounding UDP packet.
+ *
+ * The reply offers info->guest_ip to the guest and names info->host_ip as
+ * the server.  Always returns 0.
+ */
+static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type)
+{
+       struct uip_dhcp *reply = (struct uip_dhcp *)buf->eth;
+       u32 guest_addr = htonl(info->guest_ip);
+       u32 host_addr = htonl(info->host_ip);
+
+       /* presumably the BOOTP "reply" op code -- TODO confirm */
+       reply->msg_type         = 2;
+       reply->client_ip        = 0;
+       reply->your_ip          = guest_addr;
+       reply->server_ip        = host_addr;
+       reply->agent_ip         = 0;
+
+       uip_dhcp_fill_option(info, reply, reply_msg_type);
+
+       /* Described from the guest's point of view: guest:client -> host:server */
+       sk->sip                 = guest_addr;
+       sk->dip                 = host_addr;
+       sk->sport               = htons(UIP_DHCP_PORT_CLIENT);
+       sk->dport               = htons(UIP_DHCP_PORT_SERVER);
+
+       return 0;
+}
+
+/*
+ * Answer a DHCP DISCOVER with an OFFER or a DHCP REQUEST with an ACK.
+ *
+ * The guest's frame is cloned, rewritten into the reply and handed back to
+ * the guest.  Returns 0 on success, -1 for any other DHCP message type.
+ */
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg)
+{
+       struct uip_dhcp *request = (struct uip_dhcp *)arg->eth;
+       struct uip_udp_socket sk;
+       struct uip_info *info;
+       struct uip_buf *reply;
+       u8 reply_msg_type;
+
+       if (uip_dhcp_is_discovery(request)) {
+               reply_msg_type = UIP_DHCP_OFFER;
+       } else if (uip_dhcp_is_request(request)) {
+               reply_msg_type = UIP_DHCP_ACK;
+       } else {
+               return -1;
+       }
+
+       reply   = uip_buf_clone(arg);
+       info    = arg->info;
+
+       /* DHCP layer first, then the UDP/IP/Ethernet headers around it */
+       uip_dhcp_make_pkg(info, &sk, reply, reply_msg_type);
+       uip_udp_make_pkg(info, &sk, reply, NULL, UIP_DHCP_MAX_PAYLOAD_LEN);
+
+       /* Queue the finished reply for delivery to the guest */
+       uip_buf_set_used(info, reply);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/icmp.c b/tools/kvm/net/uip/icmp.c
new file mode 100644 (file)
index 0000000..233297c
--- /dev/null
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+/*
+ * Answer an ICMP packet from the guest: clone the frame, swap the IP
+ * addresses, set the ICMP type to 0 (reply), recompute both checksums and
+ * queue the result for the guest.  Always returns 0.
+ */
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg)
+{
+       struct uip_buf *reply = uip_buf_clone(arg);
+       struct uip_icmp *icmp_out = (struct uip_icmp *)(reply->eth);
+       struct uip_ip *ip_out = (struct uip_ip *)(reply->eth);
+       struct uip_ip *ip_in = (struct uip_ip *)(arg->eth);
+
+       ip_out->sip     = ip_in->dip;
+       ip_out->dip     = ip_in->sip;
+       ip_out->csum    = 0;
+
+       /* ICMP reply: 0 */
+       icmp_out->type  = 0;
+       icmp_out->csum  = 0;
+
+       /* Checksum fields were zeroed above before being recomputed */
+       ip_out->csum    = uip_csum_ip(ip_out);
+       icmp_out->csum  = uip_csum_icmp(icmp_out);
+
+       uip_buf_set_used(arg->info, reply);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/ipv4.c b/tools/kvm/net/uip/ipv4.c
new file mode 100644 (file)
index 0000000..4def129
--- /dev/null
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+/*
+ * Dispatch an IPv4 frame sent by the guest to the ICMP, TCP or UDP handler.
+ *
+ * Frames whose IP header is not exactly 20 bytes (i.e. carrying IP options)
+ * are rejected with -1.  Unknown protocols are silently ignored.  The
+ * per-protocol handler's return value is not propagated; 0 is returned.
+ */
+int uip_tx_do_ipv4(struct uip_tx_arg *arg)
+{
+       struct uip_ip *iph = (struct uip_ip *)(arg->eth);
+
+       if (uip_ip_hdrlen(iph) != 20) {
+               pr_warning("IP header length is not 20 bytes");
+               return -1;
+       }
+
+       if (iph->proto == 0x01)         /* ICMP */
+               uip_tx_do_ipv4_icmp(arg);
+       else if (iph->proto == 0x06)    /* TCP */
+               uip_tx_do_ipv4_tcp(arg);
+       else if (iph->proto == 0x11)    /* UDP */
+               uip_tx_do_ipv4_udp(arg);
+
+       return 0;
+}
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
new file mode 100644 (file)
index 0000000..586a45c
--- /dev/null
@@ -0,0 +1,317 @@
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+/*
+ * Shut down one direction (@how) of the host-side socket.  Once both
+ * sk->write_done and sk->read_done are set, the socket is fully closed,
+ * unlinked from the socket list and freed -- @sk must not be used after
+ * that.  Always returns 0.
+ */
+static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how)
+{
+       shutdown(sk->fd, how);
+
+       /* One half is still open: keep the socket around */
+       if (!sk->write_done || !sk->read_done)
+               return 0;
+
+       shutdown(sk->fd, SHUT_RDWR);
+       close(sk->fd);
+
+       mutex_lock(sk->lock);
+       list_del(&sk->list);
+       mutex_unlock(sk->lock);
+
+       free(sk);
+
+       return 0;
+}
+
+/*
+ * Look up the tracked TCP socket matching the given address/port 4-tuple.
+ * The list is walked under tcp_socket_lock; the lock is dropped before
+ * returning.  Returns the socket, or NULL if none matches.
+ */
+static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+       struct list_head *head = &arg->info->tcp_socket_head;
+       pthread_mutex_t *lock = &arg->info->tcp_socket_lock;
+       struct uip_tcp_socket *match = NULL;
+       struct uip_tcp_socket *sk;
+
+       mutex_lock(lock);
+       list_for_each_entry(sk, head, list) {
+               if (sk->sip != sip || sk->dip != dip)
+                       continue;
+               if (sk->sport != sport || sk->dport != dport)
+                       continue;
+
+               match = sk;
+               break;
+       }
+       mutex_unlock(lock);
+
+       return match;
+}
+
+/*
+ * Create the host-side socket for a new guest TCP connection.
+ *
+ * A stream socket is opened and connected to @dip:@dport (both used as-is,
+ * i.e. expected in network byte order), then the socket is keyed by the
+ * addresses/ports found in the guest's frame and appended to the socket
+ * list under tcp_socket_lock.
+ *
+ * Returns the new socket, or NULL if allocation, socket() or connect()
+ * fails; nothing is leaked on failure.
+ */
+static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+       struct list_head *sk_head;
+       struct uip_tcp_socket *sk;
+       pthread_mutex_t *sk_lock;
+       struct uip_tcp *tcp;
+       struct uip_ip *ip;
+       int ret;
+
+       tcp = (struct uip_tcp *)arg->eth;
+       ip = (struct uip_ip *)arg->eth;
+
+       sk_head = &arg->info->tcp_socket_head;
+       sk_lock = &arg->info->tcp_socket_lock;
+
+       sk = calloc(1, sizeof(*sk));
+       if (!sk)
+               return NULL;
+
+       sk->lock                        = sk_lock;
+       sk->info                        = arg->info;
+
+       sk->fd                          = socket(AF_INET, SOCK_STREAM, 0);
+       if (sk->fd < 0)
+               goto err_free;
+
+       sk->addr.sin_family             = AF_INET;
+       sk->addr.sin_addr.s_addr        = dip;
+       sk->addr.sin_port               = dport;
+
+       ret = connect(sk->fd, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+       if (ret)
+               goto err_close;
+
+       sk->sip         = ip->sip;
+       sk->dip         = ip->dip;
+       sk->sport       = tcp->sport;
+       sk->dport       = tcp->dport;
+
+       mutex_lock(sk_lock);
+       list_add_tail(&sk->list, sk_head);
+       mutex_unlock(sk_lock);
+
+       return sk;
+
+err_close:
+       /* Do not leak the descriptor when the remote end is unreachable */
+       close(sk->fd);
+err_free:
+       free(sk);
+       return NULL;
+}
+
+/*
+ * Build one TCP segment on behalf of the remote side of @sk and queue it
+ * for the guest.
+ *
+ * @flag is stored as the TCP flags byte; @payload_len bytes are copied from
+ * sk->payload behind the TCP header.  Source and destination are the
+ * socket's tuple reversed, sequence/ack numbers come from sk->seq_server
+ * and sk->ack_server.  sk->seq_server is advanced by @payload_len only;
+ * callers in this file add the extra 1 for SYN and FIN themselves.
+ *
+ * NOTE(review): the result of uip_buf_get_free() is used without a check --
+ * presumably it always yields a buffer; confirm.
+ *
+ * Always returns 0.
+ */
+static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len)
+{
+       struct uip_info *info;
+       struct uip_eth *eth2;
+       struct uip_tcp *tcp2;
+       struct uip_buf *buf;
+       struct uip_ip *ip2;
+
+       info            = sk->info;
+
+       /*
+        * Get free buffer to send data to guest
+        */
+       buf             = uip_buf_get_free(info);
+
+       /*
+        * Cook an Ethernet frame.  The three pointers below are different
+        * header views of the same buffer.
+        */
+       tcp2            = (struct uip_tcp *)buf->eth;
+       eth2            = (struct uip_eth *)buf->eth;
+       ip2             = (struct uip_ip *)buf->eth;
+
+       eth2->src       = info->host_mac;
+       eth2->dst       = info->guest_mac;
+       eth2->type      = htons(UIP_ETH_P_IP);
+
+       ip2->vhl        = UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+       ip2->tos        = 0;
+       ip2->id         = 0;
+       ip2->flgfrag    = 0;
+       ip2->ttl        = UIP_IP_TTL;
+       ip2->proto      = UIP_IP_P_TCP;
+       ip2->csum       = 0;
+       ip2->sip        = sk->dip;
+       ip2->dip        = sk->sip;
+
+       tcp2->sport     = sk->dport;
+       tcp2->dport     = sk->sport;
+       tcp2->seq       = htonl(sk->seq_server);
+       tcp2->ack       = htonl(sk->ack_server);
+       /*
+        * Disable TCP options, tcp hdr len equals 20 bytes
+        */
+       tcp2->off       = UIP_TCP_HDR_LEN;
+       tcp2->flg       = flag;
+       tcp2->win       = htons(UIP_TCP_WIN_SIZE);
+       tcp2->csum      = 0;
+       tcp2->urgent    = 0;
+
+       if (payload_len > 0)
+               memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len);
+
+       /*
+        * The total length must be in place before the checksums: the TCP
+        * checksum derives the segment length from ip2->len.
+        */
+       ip2->len        = htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2));
+       ip2->csum       = uip_csum_ip(ip2);
+       tcp2->csum      = uip_csum_tcp(tcp2);
+
+       /*
+        * virtio_net_hdr
+        */
+       buf->vnet_len   = sizeof(struct virtio_net_hdr);
+       memset(buf->vnet, 0, buf->vnet_len);
+
+       buf->eth_len    = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+       /*
+        * Increase server seq
+        */
+       sk->seq_server  += payload_len;
+
+       /*
+        * Send data received from socket to guest
+        */
+       uip_buf_set_used(info, buf);
+
+       return 0;
+}
+
+/*
+ * Per-connection receive thread: reads data from the host-side socket and
+ * forwards each chunk to the guest as an ACK segment.
+ *
+ * The loop ends when read() returns 0 or an error (or when the payload
+ * buffer could not be allocated).  The read side is then shut down, a
+ * FIN|ACK is sent to the guest and sk->read_done is set.
+ *
+ * NOTE(review): sk->read_done is still 0 when uip_tcp_socket_close() is
+ * called here, so that call never takes the free path; if the guest's FIN
+ * was already processed, nothing visible in this file releases @sk later --
+ * confirm whether this is intended.
+ */
+static void *uip_tcp_socket_thread(void *p)
+{
+       struct uip_tcp_socket *sk;
+       u8 *payload;
+       int ret;
+
+       sk = p;
+
+       payload = malloc(UIP_MAX_TCP_PAYLOAD);
+       sk->payload = payload;
+       if (!sk->payload)
+               goto out;
+
+       while (1) {
+
+               ret = read(sk->fd, payload, UIP_MAX_TCP_PAYLOAD);
+
+               /* 0 means the remote side closed; negative means error */
+               if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD)
+                       goto out;
+
+               uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, ret);
+
+       }
+
+out:
+       /*
+        * Close server to guest TCP connection
+        */
+       uip_tcp_socket_close(sk, SHUT_RD);
+
+       /* The FIN consumes one sequence number */
+       uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0);
+       sk->seq_server += 1;
+
+       sk->read_done = 1;
+
+       free(sk->payload);
+       pthread_exit(NULL);
+
+       return NULL;
+}
+
+/*
+ * Start the receive thread for @sk unless one has already been recorded in
+ * sk->thread.  Returns pthread_create()'s result, or 0 if nothing had to be
+ * started.
+ */
+static int uip_tcp_socket_receive(struct uip_tcp_socket *sk)
+{
+       if (sk->thread != 0)
+               return 0;
+
+       return pthread_create(&sk->thread, NULL, uip_tcp_socket_thread, (void *)sk);
+}
+
+/*
+ * Forward the payload of a guest TCP segment to the host-side socket.
+ *
+ * Returns the result of write() (bytes written, or a negative value on
+ * error); a short or failed write is only logged.  Returns 0 without
+ * writing once the guest has closed its sending side (sk->write_done).
+ */
+static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp)
+{
+       u8 *data;
+       int wanted;
+       int written;
+
+       if (sk->write_done)
+               return 0;
+
+       data    = uip_tcp_payload(tcp);
+       wanted  = uip_tcp_payloadlen(tcp);
+
+       written = write(sk->fd, data, wanted);
+       if (written != wanted)
+               pr_warning("tcp send error");
+
+       return written;
+}
+
+/*
+ * Handle one TCP segment sent by the guest.
+ *
+ *  - SYN: open a host-side connection to the destination, answer with a
+ *    faked SYN|ACK and start the receive thread.
+ *  - FIN: acknowledge it, mark the write side done and shut it down
+ *    (a repeated FIN is ignored).
+ *  - data: write the payload to the host-side socket and ACK the number of
+ *    bytes actually written.
+ *  - segments without payload (e.g. pure ACKs) only update guest_acked.
+ *
+ * Returns 0 on success, -1 if no socket could be created/found or the
+ * write to the host-side socket failed.
+ */
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg)
+{
+       struct uip_tcp_socket *sk;
+       struct uip_tcp *tcp;
+       struct uip_ip *ip;
+       int ret;
+
+       tcp = (struct uip_tcp *)arg->eth;
+       ip = (struct uip_ip *)arg->eth;
+
+       /*
+        * Guest is trying to start a TCP session, let's fake SYN-ACK to guest
+        */
+       if (uip_tcp_is_syn(tcp)) {
+               sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+               if (!sk)
+                       return -1;
+
+               /*
+                * Setup ISN number
+                */
+               sk->isn_guest  = uip_tcp_isn(tcp);
+               sk->isn_server = uip_tcp_isn_alloc();
+
+               /* The guest's SYN and our SYN|ACK each consume one sequence number */
+               sk->seq_server = sk->isn_server;
+               sk->ack_server = sk->isn_guest + 1;
+               uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0);
+               sk->seq_server += 1;
+
+               /*
+                * Start receive thread for data from remote to guest
+                */
+               uip_tcp_socket_receive(sk);
+
+               goto out;
+       }
+
+       /*
+        * Find socket we have allocated
+        */
+       sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+       if (!sk)
+               return -1;
+
+       sk->guest_acked = ntohl(tcp->ack);
+
+       if (uip_tcp_is_fin(tcp)) {
+               if (sk->write_done)
+                       goto out;
+
+               /* The guest's FIN consumes one sequence number */
+               sk->write_done = 1;
+               sk->ack_server += 1;
+               uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+               /*
+                * Close guest to server TCP connection
+                */
+               uip_tcp_socket_close(sk, SHUT_WR);
+
+               goto out;
+       }
+
+       /*
+        * Ignore guest to server frames with zero tcp payload
+        */
+       if (uip_tcp_payloadlen(tcp) == 0)
+               goto out;
+
+       /*
+        * Send out TCP data to remote host
+        */
+       ret = uip_tcp_socket_send(sk, tcp);
+       if (ret < 0)
+               return -1;
+       /*
+        * Send ACK to guest immediately; only the bytes actually written
+        * are acknowledged
+        */
+       sk->ack_server += ret;
+       uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+out:
+       return 0;
+}
diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
new file mode 100644 (file)
index 0000000..39c2b57
--- /dev/null
@@ -0,0 +1,236 @@
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+/* Size passed to epoll_create() and the maximum number of events fetched per epoll_wait() call */
+#define UIP_UDP_MAX_EVENTS 1000
+
+/*
+ * Return the UDP socket tracking the given address/port 4-tuple, creating
+ * it on first use.
+ *
+ * A new socket is made non-blocking, registered with the shared epoll
+ * instance (created lazily in info->udp_epollfd) with the socket structure
+ * as event data, and appended to the socket list under udp_socket_lock.
+ * @dip/@dport are stored as-is in the destination sockaddr, i.e. they are
+ * expected in network byte order.
+ *
+ * Returns the socket, or NULL if allocation or socket() fails.
+ */
+static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+       struct list_head *sk_head;
+       struct uip_udp_socket *sk;
+       pthread_mutex_t *sk_lock;
+       struct epoll_event ev;
+       int flags;
+       int ret;
+
+       sk_head = &arg->info->udp_socket_head;
+       sk_lock = &arg->info->udp_socket_lock;
+
+       /*
+        * Find existing sk
+        */
+       mutex_lock(sk_lock);
+       list_for_each_entry(sk, sk_head, list) {
+               if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+                       mutex_unlock(sk_lock);
+                       return sk;
+               }
+       }
+       mutex_unlock(sk_lock);
+
+       /*
+        * Allocate new one
+        */
+       sk = calloc(1, sizeof(*sk));
+       if (!sk)
+               return NULL;
+
+       sk->lock = sk_lock;
+
+       sk->fd = socket(AF_INET, SOCK_DGRAM, 0);
+       if (sk->fd < 0)
+               goto out;
+
+       /*
+        * Set non-blocking
+        */
+       flags = fcntl(sk->fd, F_GETFL, 0);
+       flags |= O_NONBLOCK;
+       fcntl(sk->fd, F_SETFL, flags);
+
+       /*
+        * Add sk->fd to epoll_wait.  epoll_event.data is a union, so only
+        * one member can be carried: the receive thread needs the socket
+        * pointer, and a separate store to data.fd would just be overwritten.
+        */
+       ev.events       = EPOLLIN;
+       ev.data.ptr     = sk;
+       if (arg->info->udp_epollfd <= 0)
+               arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS);
+       ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev);
+       if (ret == -1)
+               pr_warning("epoll_ctl error");
+
+       sk->addr.sin_family      = AF_INET;
+       sk->addr.sin_addr.s_addr = dip;
+       sk->addr.sin_port        = dport;
+
+       sk->sip                  = sip;
+       sk->dip                  = dip;
+       sk->sport                = sport;
+       sk->dport                = dport;
+
+       mutex_lock(sk_lock);
+       list_add_tail(&sk->list, sk_head);
+       mutex_unlock(sk_lock);
+
+       return sk;
+
+out:
+       free(sk);
+       return NULL;
+}
+
+/*
+ * Forward the payload of a guest UDP datagram to the remote address stored
+ * in @sk.  Returns 0 if the whole payload was sent, -1 otherwise.
+ */
+static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp)
+{
+       int wanted = ntohs(udp->len) - uip_udp_hdrlen(udp);
+       int sent;
+
+       sent = sendto(sk->fd, udp->payload, wanted, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+
+       return sent == wanted ? 0 : -1;
+}
+
+/*
+ * Build a complete Ethernet/IPv4/UDP frame in @buf addressed to the guest.
+ *
+ * @sk describes the flow from the guest's point of view, so its tuple is
+ * reversed for the reply.  If @payload is non-NULL, @payload_len bytes are
+ * copied in; with a NULL @payload the data already present in @buf is kept
+ * and only the headers are rewritten.  Also sets up the (zeroed) virtio-net
+ * header and buf->eth_len.  Always returns 0.
+ */
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8* payload, int payload_len)
+{
+       struct uip_eth *eth2;
+       struct uip_udp *udp2;
+       struct uip_ip *ip2;
+
+       /*
+        * Cook an Ethernet frame.  The three pointers are different header
+        * views of the same buffer.
+        */
+       udp2            = (struct uip_udp *)(buf->eth);
+       eth2            = (struct uip_eth *)buf->eth;
+       ip2             = (struct uip_ip *)(buf->eth);
+
+       eth2->src       = info->host_mac;
+       eth2->dst       = info->guest_mac;
+       eth2->type      = htons(UIP_ETH_P_IP);
+
+       ip2->vhl        = UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+       ip2->tos        = 0;
+       ip2->id         = 0;
+       ip2->flgfrag    = 0;
+       ip2->ttl        = UIP_IP_TTL;
+       ip2->proto      = UIP_IP_P_UDP;
+       ip2->csum       = 0;
+
+       ip2->sip        = sk->dip;
+       ip2->dip        = sk->sip;
+       udp2->sport     = sk->dport;
+       udp2->dport     = sk->sport;
+
+       udp2->len       = htons(payload_len + uip_udp_hdrlen(udp2));
+       udp2->csum      = 0;
+
+       if (payload)
+               memcpy(udp2->payload, payload, payload_len);
+
+       /*
+        * Add the lengths in host byte order and convert once.  Adding two
+        * network-order values drops the carry between the bytes on
+        * little-endian hosts (e.g. a 244-byte datagram plus the 20-byte
+        * header would yield 8 instead of 264).
+        */
+       ip2->len        = htons(ntohs(udp2->len) + uip_ip_hdrlen(ip2));
+       ip2->csum       = uip_csum_ip(ip2);
+       udp2->csum      = uip_csum_udp(udp2);
+
+       /*
+        * virtio_net_hdr
+        */
+       buf->vnet_len   = sizeof(struct virtio_net_hdr);
+       memset(buf->vnet, 0, buf->vnet_len);
+
+       buf->eth_len    = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+       return 0;
+}
+
+/*
+ * Receive thread shared by all UDP sockets: waits on the epoll instance in
+ * info->udp_epollfd and wraps every datagram received from a remote host
+ * into a UDP frame for the guest.  @p is the struct uip_info.  The loop
+ * never terminates.
+ */
+static void *uip_udp_socket_thread(void *p)
+{
+       struct epoll_event ready[UIP_UDP_MAX_EVENTS];
+       struct uip_info *info = p;
+       struct uip_udp_socket *sk;
+       struct uip_buf *buf;
+       u8 *data;
+       int count;
+       int len;
+       int idx;
+
+       /* Keep retrying until the receive buffer can be allocated */
+       while (!(data = malloc(UIP_MAX_UDP_PAYLOAD)))
+               ;
+
+       for (;;) {
+               count = epoll_wait(info->udp_epollfd, ready, UIP_UDP_MAX_EVENTS, -1);
+               if (count == -1)
+                       continue;
+
+               for (idx = 0; idx < count; idx++) {
+                       /* The event data carries the socket structure */
+                       sk = ready[idx].data.ptr;
+
+                       len = recvfrom(sk->fd, data, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL);
+                       if (len >= 0) {
+                               /* Wrap the datagram and hand it to the guest */
+                               buf = uip_buf_get_free(info);
+                               uip_udp_make_pkg(info, sk, buf, data, len);
+                               uip_buf_set_used(info, buf);
+                       }
+               }
+       }
+
+       /* Not reached: the loop above has no exit */
+       free(data);
+       pthread_exit(NULL);
+       return NULL;
+}
+
+/*
+ * Handle one UDP datagram sent by the guest.
+ *
+ * DHCP client messages are answered locally.  Everything else is forwarded
+ * through a per-flow host socket, and the shared receive thread is started
+ * after the first successful send.  Returns 0 on success, -1 if no socket
+ * could be obtained or the send failed.
+ */
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg)
+{
+       struct uip_udp *udp = (struct uip_udp *)(arg->eth);
+       struct uip_ip *iph = (struct uip_ip *)(arg->eth);
+       struct uip_info *info = arg->info;
+       struct uip_udp_socket *sk;
+
+       if (uip_udp_is_dhcp(udp)) {
+               uip_tx_do_ipv4_udp_dhcp(arg);
+               return 0;
+       }
+
+       /* Reuse the socket for this flow, or create it on first use */
+       sk = uip_udp_socket_find(arg, iph->sip, iph->dip, udp->sport, udp->dport);
+       if (!sk)
+               return -1;
+
+       /* Forward the datagram to the remote host */
+       if (uip_udp_socket_send(sk, udp))
+               return -1;
+
+       /* Replies are collected by a single thread, started lazily */
+       if (!info->udp_thread)
+               pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info);
+
+       return 0;
+}
diff --git a/tools/kvm/pci.c b/tools/kvm/pci.c
new file mode 100644 (file)
index 0000000..d1afc05
--- /dev/null
@@ -0,0 +1,169 @@
+#include "kvm/pci.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <assert.h>
+
+/* Number of slots in pci_devices[]; device numbers index this table */
+#define PCI_MAX_DEVICES                        256
+/* Byte offset of bar[b] within struct pci_device_header */
+#define PCI_BAR_OFFSET(b)              (offsetof(struct pci_device_header, bar[b]))
+
+/* Registered device headers, indexed by device number; NULL means no device */
+static struct pci_device_header                *pci_devices[PCI_MAX_DEVICES];
+
+/* Backing store for the configuration address ports; written and read by the address handlers */
+static struct pci_config_address       pci_config_address;
+
+/* This is within our PCI gap - in an unused area */
+/* Start of the next block handed out by pci_get_io_space_block() */
+static u32 io_space_blocks             = KVM_32BIT_GAP_START + 0x1000000;
+
+/*
+ * Reserve @size bytes from the I/O space area and return the start of the
+ * reserved block.  Blocks are handed out back to back and never reclaimed.
+ */
+u32 pci_get_io_space_block(u32 size)
+{
+       u32 start = io_space_blocks;
+
+       io_space_blocks = start + size;
+
+       return start;
+}
+
+/*
+ * Map an I/O port in the configuration address range to the matching byte
+ * inside pci_config_address (port minus PCI_CONFIG_ADDRESS is used as the
+ * byte offset).
+ */
+static void *pci_config_address_ptr(u16 port)
+{
+       unsigned long delta = port - PCI_CONFIG_ADDRESS;
+       void *start = &pci_config_address;
+
+       return start + delta;
+}
+
+/*
+ * Port-write handler for the configuration address register: copies @size
+ * bytes from @data into pci_config_address at the byte selected by @port.
+ * Always returns true.
+ */
+static bool pci_config_address_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       memcpy(pci_config_address_ptr(port), data, size);
+
+       return true;
+}
+
+/*
+ * Port-read handler for the configuration address register: copies @size
+ * bytes from pci_config_address, starting at the byte selected by @port,
+ * into @data.  Always returns true.
+ */
+static bool pci_config_address_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       memcpy(data, pci_config_address_ptr(port), size);
+
+       return true;
+}
+
+/* Port I/O callbacks for the PCI configuration address register */
+static struct ioport_operations pci_config_address_ops = {
+       .io_out         = pci_config_address_out,
+       .io_in          = pci_config_address_in,
+};
+
+/*
+ * True when the currently latched configuration address selects the given
+ * bus and function and a device is registered under @device_number.
+ */
+static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number)
+{
+       if (device_number >= PCI_MAX_DEVICES)
+               return false;
+
+       if (pci_config_address.bus_number != bus_number ||
+           pci_config_address.function_number != function_number)
+               return false;
+
+       return pci_devices[device_number] != NULL;
+}
+
+/*
+ * Port-write handler for the PCI configuration data register.
+ *
+ * The target byte inside the selected device's header is
+ * (register_number << 2) plus the port's distance from PCI_CONFIG_DATA.
+ * Writes are only honoured for bus 0 / function 0 devices, inside the
+ * header, and where the 32-bit value currently stored is non-zero.  Writing
+ * 0xFFFFFFFF to a BAR stores the BAR's size instead, so the guest reads the
+ * size back on its next access.  Always returns true.
+ */
+static bool pci_config_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       unsigned long start;
+       u8 dev_num;
+
+       /*
+        * If someone accesses PCI configuration space offsets that are not
+        * aligned to 4 bytes, it uses ioports to signify that.
+        */
+       start = port - PCI_CONFIG_DATA;
+
+       dev_num         = pci_config_address.device_number;
+
+       if (pci_device_exists(0, dev_num, 0)) {
+               unsigned long offset;
+
+               offset = start + (pci_config_address.register_number << 2);
+               if (offset < sizeof(struct pci_device_header)) {
+                       void *p = pci_devices[dev_num];
+                       /*
+                        * BAR index for this offset.  For offsets below the
+                        * first BAR the subtraction wraps; the result is only
+                        * used after the "bar < 6" check.
+                        */
+                       u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
+                       u32 sz = PCI_IO_SIZE;
+
+                       /* Prefer the device's own BAR size when it has one */
+                       if (bar < 6 && pci_devices[dev_num]->bar_size[bar])
+                               sz = pci_devices[dev_num]->bar_size[bar];
+
+                       /*
+                        * If the kernel masks the BAR it would expect to find the
+                        * size of the BAR there next time it reads from it.
+                        * When the kernel got the size it would write the address
+                        * back.
+                        */
+                       if (ioport__read32(p + offset)) {
+                               /*
+                                * See if kernel tries to mask one of the BARs.
+                                * NOTE(review): the upper bound is inclusive, so
+                                * the offset of bar[6] is treated as a BAR as
+                                * well -- confirm this is intended.
+                                */
+                               if ((offset >= PCI_BAR_OFFSET(0)) &&
+                                   (offset <= PCI_BAR_OFFSET(6)) &&
+                                   (ioport__read32(data)  == 0xFFFFFFFF))
+                                       memcpy(p + offset, &sz, sizeof(sz));
+                                   else
+                                       memcpy(p + offset, data, size);
+                       }
+               }
+       }
+
+       return true;
+}
+
+/*
+ * Port-read handler for the PCI configuration data register.
+ *
+ * Reads @size bytes from the selected device's header.  A non-existent
+ * device reads as all ones, and an offset beyond the header reads as
+ * zeroes.  Always returns true.
+ */
+static bool pci_config_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       unsigned long offset;
+       unsigned long start;
+       u8 dev_num;
+
+       /*
+        * Accesses that are not aligned to 4 bytes are signalled through the
+        * port number: its distance from PCI_CONFIG_DATA is the byte offset
+        * within the selected register.
+        */
+       start           = port - PCI_CONFIG_DATA;
+       dev_num         = pci_config_address.device_number;
+
+       if (!pci_device_exists(0, dev_num, 0)) {
+               memset(data, 0xff, size);
+               return true;
+       }
+
+       offset = start + (pci_config_address.register_number << 2);
+       if (offset >= sizeof(struct pci_device_header)) {
+               memset(data, 0x00, size);
+               return true;
+       }
+
+       memcpy(data, (void *)pci_devices[dev_num] + offset, size);
+
+       return true;
+}
+
+/* Port I/O callbacks for the PCI configuration data register */
+static struct ioport_operations pci_config_data_ops = {
+       .io_out         = pci_config_data_out,
+       .io_in          = pci_config_data_in,
+};
+
+/*
+ * Register @dev under device number @dev_num so that the configuration
+ * space handlers can find it.  An existing entry in that slot is replaced.
+ */
+void pci__register(struct pci_device_header *dev, u8 dev_num)
+{
+       assert(dev_num < PCI_MAX_DEVICES);
+
+       pci_devices[dev_num]    = dev;
+}
+
+/*
+ * Register the port handlers for the PCI configuration data and address
+ * registers, each covering 4 consecutive ports.
+ */
+void pci__init(void)
+{
+       ioport__register(PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL);
+       ioport__register(PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL);
+}
diff --git a/tools/kvm/read-write.c b/tools/kvm/read-write.c
new file mode 100644 (file)
index 0000000..737fb26
--- /dev/null
@@ -0,0 +1,318 @@
+#include "kvm/read-write.h"
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+/*
+ * read(2) wrapper that retries for as long as the call fails with EAGAIN or
+ * EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xread(int fd, void *buf, size_t count)
+{
+       ssize_t ret;
+
+       do {
+               ret = read(fd, buf, count);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * write(2) wrapper that retries for as long as the call fails with EAGAIN or
+ * EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t count)
+{
+       ssize_t ret;
+
+       do {
+               ret = write(fd, buf, count);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * Call xread() repeatedly until @count bytes have been stored in @buf.
+ *
+ * Returns the number of bytes read. That is less than @count only when
+ * xread() returned zero or an error after some data had already arrived.
+ * Returns -1 when xread() returns zero or an error before any byte was read.
+ */
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+       char *cursor = buf;
+       ssize_t done = 0;
+
+       while (count > 0) {
+               ssize_t got = xread(fd, cursor, count);
+
+               if (got <= 0)
+                       return done > 0 ? done : -1;
+
+               cursor  += got;
+               done    += got;
+               count   -= got;
+       }
+
+       return done;
+}
+
+/*
+ * Call xwrite() repeatedly until all @count bytes of @buf have been written.
+ *
+ * Returns the number of bytes written, or -1 on failure. A zero-length
+ * write from xwrite() is reported as failure with errno set to ENOSPC.
+ */
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+       const char *cursor = buf;
+       ssize_t done;
+
+       for (done = 0; count > 0; ) {
+               ssize_t put = xwrite(fd, cursor, count);
+
+               if (put < 0)
+                       return -1;
+
+               if (put == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+
+               cursor  += put;
+               done    += put;
+               count   -= put;
+       }
+
+       return done;
+}
+
+/*
+ * pread(2) wrapper that retries for as long as the call fails with EAGAIN or
+ * EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset)
+{
+       ssize_t ret;
+
+       do {
+               ret = pread(fd, buf, count, offset);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * pwrite(2) wrapper that retries for as long as the call fails with EAGAIN
+ * or EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+       ssize_t ret;
+
+       do {
+               ret = pwrite(fd, buf, count, offset);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * Call xpread() repeatedly, advancing @offset as data arrives, until @count
+ * bytes have been stored in @buf.
+ *
+ * Returns the number of bytes read. That is less than @count only when
+ * xpread() returned zero or an error after some data had already arrived.
+ * Returns -1 when xpread() returns zero or an error before any byte was read.
+ */
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
+{
+       char *cursor = buf;
+       ssize_t done = 0;
+
+       while (count > 0) {
+               ssize_t got = xpread(fd, cursor, count, offset);
+
+               if (got <= 0)
+                       return done > 0 ? done : -1;
+
+               cursor  += got;
+               offset  += got;
+               done    += got;
+               count   -= got;
+       }
+
+       return done;
+}
+
+/*
+ * Call xpwrite() repeatedly, advancing @offset as data is accepted, until all
+ * @count bytes of @buf have been written.
+ *
+ * Returns the number of bytes written, or -1 on failure. A zero-length
+ * write from xpwrite() is reported as failure with errno set to ENOSPC.
+ */
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset)
+{
+       const char *cursor = buf;
+       ssize_t done;
+
+       for (done = 0; count > 0; ) {
+               ssize_t put = xpwrite(fd, cursor, count, offset);
+
+               if (put < 0)
+                       return -1;
+
+               if (put == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+
+               cursor  += put;
+               offset  += put;
+               done    += put;
+               count   -= put;
+       }
+
+       return done;
+}
+
+/*
+ * readv(2) wrapper that retries for as long as the call fails with EAGAIN or
+ * EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt)
+{
+       ssize_t ret;
+
+       do {
+               ret = readv(fd, iov, iovcnt);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * writev(2) wrapper that retries for as long as the call fails with EAGAIN
+ * or EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt)
+{
+       ssize_t ret;
+
+       do {
+               ret = writev(fd, iov, iovcnt);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/* Add up the iov_len fields of the first @iovcnt entries of @iov. */
+static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt)
+{
+       size_t sum = 0;
+
+       for (; iovcnt != 0; iovcnt--, iov++)
+               sum += iov->iov_len;
+
+       return sum;
+}
+
+/*
+ * Account for @nr bytes transferred by a vectored I/O call.
+ *
+ * Each iovec that was transferred completely is dropped from the front of
+ * the array: *iov and *iovcnt step past it, and its length is added to
+ * *total (and to *offset when @offset is non-NULL) and subtracted from
+ * *count.
+ *
+ * The walk stops as soon as *iovcnt reaches zero, so the element one past
+ * the end of the array is never inspected.
+ *
+ * NOTE(review): bytes that only partially fill the next iovec are not
+ * accounted for; the array is const, so that entry cannot be trimmed here.
+ * The caller's next call therefore starts again at the beginning of that
+ * entry - confirm this is acceptable for non-positional descriptors.
+ */
+static inline void shift_iovec(const struct iovec **iov, int *iovcnt,
+                               size_t nr, ssize_t *total, size_t *count, off_t *offset)
+{
+       while (*iovcnt > 0 && nr >= (*iov)->iov_len) {
+               nr -= (*iov)->iov_len;
+               *total += (*iov)->iov_len;
+               *count -= (*iov)->iov_len;
+               if (offset)
+                       *offset += (*iov)->iov_len;
+               (*iovcnt)--;
+               (*iov)++;
+       }
+}
+
+/*
+ * Call xreadv() repeatedly until every iovec in @iov has been consumed.
+ *
+ * Returns the number of bytes accounted for by shift_iovec(), or -1 when
+ * xreadv() returns zero or an error before anything was accounted for.
+ */
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+       size_t remaining = get_iov_size(iov, iovcnt);
+       ssize_t done = 0;
+
+       while (remaining > 0) {
+               ssize_t got = xreadv(fd, iov, iovcnt);
+
+               if (got <= 0)
+                       return done > 0 ? done : -1;
+
+               shift_iovec(&iov, &iovcnt, got, &done, &remaining, NULL);
+       }
+
+       return done;
+}
+
+/*
+ * Call xwritev() repeatedly until every iovec in @iov has been consumed.
+ *
+ * Returns the number of bytes accounted for by shift_iovec(), or -1 on
+ * failure. A zero-length write is reported as failure with errno = ENOSPC.
+ */
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+       size_t remaining = get_iov_size(iov, iovcnt);
+       ssize_t done = 0;
+
+       while (remaining > 0) {
+               ssize_t put = xwritev(fd, iov, iovcnt);
+
+               if (put < 0)
+                       return -1;
+
+               if (put == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+
+               shift_iovec(&iov, &iovcnt, put, &done, &remaining, NULL);
+       }
+
+       return done;
+}
+
+/*
+ * preadv(2) wrapper that retries for as long as the call fails with EAGAIN
+ * or EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       ssize_t ret;
+
+       do {
+               ret = preadv(fd, iov, iovcnt, offset);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * pwritev(2) wrapper that retries for as long as the call fails with EAGAIN
+ * or EINTR. Any other result is handed back to the caller unchanged.
+ */
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       ssize_t ret;
+
+       do {
+               ret = pwritev(fd, iov, iovcnt, offset);
+       } while (ret < 0 && (errno == EAGAIN || errno == EINTR));
+
+       return ret;
+}
+
+/*
+ * Call xpreadv() repeatedly, letting shift_iovec() advance @offset, until
+ * every iovec in @iov has been consumed.
+ *
+ * Returns the number of bytes accounted for by shift_iovec(), or -1 when
+ * xpreadv() returns zero or an error before anything was accounted for.
+ */
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       size_t remaining = get_iov_size(iov, iovcnt);
+       ssize_t done = 0;
+
+       while (remaining > 0) {
+               ssize_t got = xpreadv(fd, iov, iovcnt, offset);
+
+               if (got <= 0)
+                       return done > 0 ? done : -1;
+
+               shift_iovec(&iov, &iovcnt, got, &done, &remaining, &offset);
+       }
+
+       return done;
+}
+
+/*
+ * Call xpwritev() repeatedly, letting shift_iovec() advance @offset, until
+ * every iovec in @iov has been consumed.
+ *
+ * Returns the number of bytes accounted for by shift_iovec(), or -1 on
+ * failure. A zero-length write is reported as failure with errno = ENOSPC.
+ */
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+       size_t remaining = get_iov_size(iov, iovcnt);
+       ssize_t done = 0;
+
+       while (remaining > 0) {
+               ssize_t put = xpwritev(fd, iov, iovcnt, offset);
+
+               if (put < 0)
+                       return -1;
+
+               if (put == 0) {
+                       errno = ENOSPC;
+                       return -1;
+               }
+
+               shift_iovec(&iov, &iovcnt, put, &done, &remaining, &offset);
+       }
+
+       return done;
+}
diff --git a/tools/kvm/symbol.c b/tools/kvm/symbol.c
new file mode 100644 (file)
index 0000000..56dd346
--- /dev/null
@@ -0,0 +1,98 @@
+#include "kvm/symbol.h"
+
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <bfd.h>
+
+static bfd             *abfd;
+
+/*
+ * Open @vmlinux with BFD and remember the handle in abfd for later
+ * symbol__lookup() calls. Does nothing when @vmlinux is NULL. The result of
+ * bfd_openr() is stored unchecked; symbol__lookup() tests abfd for NULL.
+ */
+void symbol__init(const char *vmlinux)
+{
+       if (!vmlinux)
+               return;
+
+       bfd_init();
+
+       abfd            = bfd_openr(vmlinux, NULL);
+}
+
+/*
+ * Linear search of @symbols (holding @nr_symbols entries) for the first
+ * symbol whose name equals @symbol_name. Returns NULL when none matches.
+ */
+static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name)
+{
+       int idx = 0;
+
+       while (idx < nr_symbols) {
+               asymbol *candidate = symbols[idx++];
+
+               if (strcmp(bfd_asymbol_name(candidate), symbol_name) == 0)
+                       return candidate;
+       }
+
+       return NULL;
+}
+
+/*
+ * Format a description of guest address @addr into @sym (capacity @size).
+ *
+ * On success the text has the form "func+offset (file:line)", using the
+ * object opened by symbol__init(). If no object is open, or any BFD step
+ * fails, the text is "<unknown>". The result is always NUL-terminated when
+ * @size is non-zero; nothing is written when @size is zero.
+ *
+ * Returns @sym. The temporary symbol table is freed on every path.
+ */
+char *symbol__lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+       const char *filename;
+       bfd_vma sym_offset;
+       bfd_vma sym_start;
+       asection *section;
+       unsigned int line;
+       const char *func;
+       long symtab_size;
+       asymbol *symbol;
+       asymbol **syms = NULL;
+       int nr_syms;
+
+       if (!abfd)
+               goto not_found;
+
+       if (!bfd_check_format(abfd, bfd_object))
+               goto not_found;
+
+       /* Zero means "no symbols"; a negative value reports an error. */
+       symtab_size     = bfd_get_symtab_upper_bound(abfd);
+       if (symtab_size <= 0)
+               goto not_found;
+
+       syms            = malloc(symtab_size);
+       if (!syms)
+               goto not_found;
+
+       nr_syms         = bfd_canonicalize_symtab(abfd, syms);
+       if (nr_syms < 0)
+               goto not_found;
+
+       section         = bfd_get_section_by_name(abfd, ".debug_aranges");
+       if (!section)
+               goto not_found;
+
+       if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line))
+               goto not_found;
+
+       if (!func)
+               goto not_found;
+
+       symbol          = lookup(syms, nr_syms, func);
+       if (!symbol)
+               goto not_found;
+
+       sym_start       = bfd_asymbol_value(symbol);
+
+       sym_offset      = addr - sym_start;
+
+       /* snprintf() always terminates the buffer when size is non-zero. */
+       snprintf(sym, size, "%s+%llx (%s:%u)", func, (long long) sym_offset,
+                filename ? filename : "<unknown>", line);
+
+       free(syms);
+
+       return sym;
+
+not_found:
+       /* free(NULL) is a no-op, so this covers the early exits as well. */
+       free(syms);
+
+       snprintf(sym, size, "%s", "<unknown>");
+
+       return sym;
+}
diff --git a/tools/kvm/term.c b/tools/kvm/term.c
new file mode 100644 (file)
index 0000000..fa4382d
--- /dev/null
@@ -0,0 +1,127 @@
+#include <poll.h>
+#include <stdbool.h>
+#include <termios.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <signal.h>
+
+#include "kvm/read-write.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+extern struct kvm *kvm;
+static struct termios  orig_term;
+
+int term_escape_char   = 0x01; /* ctrl-a is used for escape */
+bool term_got_escape   = false;
+
+int active_console;
+
+/*
+ * Read one character from standard input on behalf of console @who.
+ *
+ * Returns the character (0-255), or -1 when @who is not the active console,
+ * when the read fails, or when the character was the escape character that
+ * starts an escape sequence.
+ *
+ * Escape handling: after term_escape_char, 'x' calls kvm_cpu__reboot() and
+ * a second term_escape_char is returned literally.
+ */
+int term_getc(int who)
+{
+       unsigned char ch;
+       int c;
+
+       if (who != active_console)
+               return -1;
+
+       /* Read into a single byte so the result does not depend on endianness. */
+       if (read_in_full(STDIN_FILENO, &ch, 1) < 0)
+               return -1;
+
+       c = ch;
+
+       if (term_got_escape) {
+               term_got_escape = false;
+               if (c == 'x')
+                       kvm_cpu__reboot();
+               if (c == term_escape_char)
+                       return c;
+       }
+
+       if (c == term_escape_char) {
+               term_got_escape = true;
+               return -1;
+       }
+
+       return c;
+}
+
+/*
+ * Write @cnt characters starting at @addr to stdout on behalf of console
+ * @who, then flush stdout.
+ *
+ * Returns the number of characters written (0 when @cnt is not positive),
+ * or -1 when @who is not the active console.
+ */
+int term_putc(int who, char *addr, int cnt)
+{
+       int written;
+
+       if (who != active_console)
+               return -1;
+
+       for (written = 0; written < cnt; written++)
+               fprintf(stdout, "%c", addr[written]);
+
+       fflush(stdout);
+       return written;
+}
+
+/*
+ * Read one character for console @who into the first buffer of @iov.
+ *
+ * Returns sizeof(char) when a character was stored, or 0 when @who is not
+ * the active console, @iov has no room for one byte, or term_getc()
+ * produced no character.
+ */
+int term_getc_iov(int who, struct iovec *iov, int iovcnt)
+{
+       int c;
+
+       if (who != active_console)
+               return 0;
+
+       /* Check for room before consuming input so no character is lost. */
+       if (iovcnt < 1 || iov[0].iov_len < sizeof(char))
+               return 0;
+
+       c = term_getc(who);
+
+       if (c < 0)
+               return 0;
+
+       /* Store exactly the one byte that is reported to the caller. */
+       *((char *)iov[0].iov_base)      = (char)c;
+
+       return sizeof(char);
+}
+
+/*
+ * Write the buffers in @iov to standard output with a single writev() call.
+ * Returns writev()'s result, or 0 when @who is not the active console.
+ */
+int term_putc_iov(int who, struct iovec *iov, int iovcnt)
+{
+       if (who != active_console)
+               return 0;
+
+       return writev(STDOUT_FILENO, iov, iovcnt);
+}
+
+/*
+ * Report whether standard input has something to read for console @who.
+ * Always false when @who is not the active console.
+ */
+bool term_readable(int who)
+{
+       struct pollfd pfd = {
+               .fd             = STDIN_FILENO,
+               .events         = POLLIN,
+               .revents        = 0,
+       };
+       int ready;
+
+       if (who != active_console)
+               return false;
+
+       /* Zero timeout: sample the current state without blocking. */
+       ready = poll(&pfd, 1, 0);
+
+       return ready > 0;
+}
+
+/* Restore the standard input terminal settings saved in orig_term by term_init(). */
+static void term_cleanup(void)
+{
+       tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
+}
+
+/*
+ * Signal handler: restore the terminal, then reset @sig to its default
+ * disposition and re-raise it so the default action still takes place.
+ */
+static void term_sig_cleanup(int sig)
+{
+       term_cleanup();
+       signal(sig, SIG_DFL);
+       raise(sig);
+}
+
+/*
+ * Save the current standard input terminal settings in orig_term, then
+ * switch the terminal to a mode with ICANON, ECHO and ISIG cleared.
+ * Arranges for the saved settings to be restored on SIGTERM and at exit.
+ * Dies if the current settings cannot be read.
+ */
+void term_init(void)
+{
+       struct termios term;
+
+       if (tcgetattr(STDIN_FILENO, &orig_term) < 0)
+               die("unable to save initial standard input settings");
+
+       term = orig_term;
+       term.c_lflag &= ~(ICANON | ECHO | ISIG);
+       /* NOTE(review): the result of tcsetattr() is not checked. */
+       tcsetattr(STDIN_FILENO, TCSANOW, &term);
+
+       signal(SIGTERM, term_sig_cleanup);
+       atexit(term_cleanup);
+}
diff --git a/tools/kvm/tests/Makefile b/tools/kvm/tests/Makefile
new file mode 100644 (file)
index 0000000..cad14ec
--- /dev/null
@@ -0,0 +1,19 @@
+# Top-level test Makefile: delegates to the kernel, pit and boot
+# subdirectories. Every target is phony, so each run re-enters the sub-make.
+all: kernel pit boot
+
+kernel:
+       $(MAKE) -C kernel
+.PHONY: kernel
+
+pit:
+       $(MAKE) -C pit
+.PHONY: pit
+
+boot:
+       $(MAKE) -C boot
+.PHONY: boot
+
+# Clean every subdirectory in turn.
+clean:
+       $(MAKE) -C kernel clean
+       $(MAKE) -C pit clean
+       $(MAKE) -C boot clean
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/Makefile b/tools/kvm/tests/boot/Makefile
new file mode 100644 (file)
index 0000000..40cba68
--- /dev/null
@@ -0,0 +1,13 @@
+NAME   := init
+
+OBJ    := $(NAME).o
+
+# Build a statically linked init from $(NAME).c, place it in a fresh rootfs
+# directory and pack that directory into boot_test.iso.
+all: $(NAME).c
+       rm -rf rootfs
+       mkdir rootfs
+       gcc -static $< -o rootfs/init
+       mkisofs rootfs > boot_test.iso
+.PHONY: all
+
+clean:
+       rm -rf rootfs boot_test.iso
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/init.c b/tools/kvm/tests/boot/init.c
new file mode 100644 (file)
index 0000000..094f8ba
--- /dev/null
@@ -0,0 +1,11 @@
+#include <linux/reboot.h>
+#include <sys/reboot.h>
+#include <stdio.h>
+#include <unistd.h>
+
+/*
+ * Minimal init for the boot test: print a greeting, then ask the kernel to
+ * restart the machine.
+ */
+int main(int argc, char *argv[])
+{
+       puts("hello, KVM guest!\r");
+
+       /* reboot() does not run stdio cleanup, so push the greeting out first. */
+       fflush(stdout);
+
+       reboot(LINUX_REBOOT_CMD_RESTART);
+
+       return 0;
+}
diff --git a/tools/kvm/tests/kernel/.gitignore b/tools/kvm/tests/kernel/.gitignore
new file mode 100644 (file)
index 0000000..d0cd209
--- /dev/null
@@ -0,0 +1,2 @@
+kernel.bin
+kernel.elf
diff --git a/tools/kvm/tests/kernel/Makefile b/tools/kvm/tests/kernel/Makefile
new file mode 100644 (file)
index 0000000..c7dd8da
--- /dev/null
@@ -0,0 +1,20 @@
+# Builds $(NAME).bin from $(NAME).S: assemble, link with the text section at
+# address 0, then strip the ELF container with objcopy.
+NAME   := kernel
+
+BIN    := $(NAME).bin
+ELF    := $(NAME).elf
+OBJ    := $(NAME).o
+
+all: $(BIN)
+
+# Flat binary extracted from the linked ELF image.
+$(BIN): $(ELF)
+       objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+       ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+       gcc -nostdinc -c $< -o $@
+
+clean:
+       rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/kernel/README b/tools/kvm/tests/kernel/README
new file mode 100644 (file)
index 0000000..2923777
--- /dev/null
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with the i8086 instruction set:
+
+  $ objdump -d -m i8086 kernel.elf
diff --git a/tools/kvm/tests/kernel/kernel.S b/tools/kvm/tests/kernel/kernel.S
new file mode 100644 (file)
index 0000000..2824b64
--- /dev/null
@@ -0,0 +1,8 @@
+       .code16gcc
+       .text
+       .globl  _start
+       .type   _start, @function
+_start:
+       # "This is probably the largest possible kernel that is bug free." -- Avi Kivity
+       /* The entire program is a jump back to its own label: spin forever. */
+       1:
+       jmp 1b
diff --git a/tools/kvm/tests/pit/.gitignore b/tools/kvm/tests/pit/.gitignore
new file mode 100644 (file)
index 0000000..43f0aa8
--- /dev/null
@@ -0,0 +1,2 @@
+*.bin
+*.elf
diff --git a/tools/kvm/tests/pit/Makefile b/tools/kvm/tests/pit/Makefile
new file mode 100644 (file)
index 0000000..2fae9b2
--- /dev/null
@@ -0,0 +1,20 @@
+# Builds $(NAME).bin from $(NAME).S: assemble, link with the text section at
+# address 0, then strip the ELF container with objcopy.
+NAME   := tick
+
+BIN    := $(NAME).bin
+ELF    := $(NAME).elf
+OBJ    := $(NAME).o
+
+all: $(BIN)
+
+# Flat binary extracted from the linked ELF image.
+$(BIN): $(ELF)
+       objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+       ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+       gcc -nostdinc -c $< -o $@
+
+clean:
+       rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/pit/README b/tools/kvm/tests/pit/README
new file mode 100644 (file)
index 0000000..2923777
--- /dev/null
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with the i8086 instruction set:
+
+  $ objdump -d -m i8086 tick.elf
diff --git a/tools/kvm/tests/pit/tick.S b/tools/kvm/tests/pit/tick.S
new file mode 100644 (file)
index 0000000..b9e5a80
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Timer tick test.
+ *
+ * Installs handlers in the interrupt table at segment 0, programs the
+ * interrupt controller at IO_PIC and the timer at IO_PIT, then spins.
+ * timer_isr writes a '.' to port 0x3f8 on every interrupt and, once the
+ * counter at "count" reaches zero, jumps to test_ok, which writes msg2 to
+ * port 0x3f8 and then writes to DBG_PORT.
+ */
+#define IO_PIC         0x20
+#define IRQ_OFFSET     32
+#define IO_PIT         0x40
+#define TIMER_FREQ     1193182
+#define TIMER_DIV(x)   ((TIMER_FREQ+(x)/2)/(x))
+
+/*
+ * hpa noted:
+ *
+ * 0xe0..0xef are "motherboard specific", but 0xe9 is
+ * used for Bochs debugging and 0xed is the Phoenix-reserved
+ * delay port
+ */
+#define DBG_PORT       0xe0
+
+#define TEST_COUNT     0x0200
+
+       .code16gcc
+       .text
+       .globl  _start
+       .type   _start, @function
+_start:
+/*
+ * fill up noop handlers
+ */
+       xorw    %ax, %ax
+       xorw    %di, %di
+       movw    %ax, %es
+       movw    $256, %cx
+       /* 256 entries of 4 bytes each: handler offset, then code segment. */
+fill_noop_idt:
+       movw    $noop_handler, %es:(%di)
+       movw    %cs, %es:2(%di)
+       add     $4, %di
+       loop    fill_noop_idt
+
+       /* Point entry IRQ_OFFSET at the timer handler. */
+set_idt:
+       movw    $timer_isr, %es:(IRQ_OFFSET*4)
+       movw    %cs, %es:(IRQ_OFFSET*4+2)
+
+set_pic:
+       # ICW1
+       mov     $0x11, %al
+       mov     $(IO_PIC), %dx
+       out     %al,%dx
+       # ICW2
+       mov     $(IRQ_OFFSET), %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+       # ICW3
+       mov     $0x00, %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+       # ICW4
+       mov     $0x3, %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+
+set_pit:
+       # set 8254 mode
+       mov     $(IO_PIT+3), %dx
+       mov     $0x34, %al
+       outb    %al, %dx
+       # set 8254 freq 1KHz
+       /* The divisor is written low byte first, then high byte. */
+       mov     $(IO_PIT), %dx
+       movb    $(TIMER_DIV(1000) % 256), %al
+       outb    %al, %dx
+       movb    $(TIMER_DIV(1000) / 256), %al
+       outb    %al, %dx
+
+enable_irq0:
+       /* Mask value 0xfe: every bit set except bit 0. */
+       mov     $0xfe, %al
+       mov     $(IO_PIC+1), %dx
+       out     %al, %dx
+       sti
+       /* Spin; all further work happens in timer_isr. */
+loop:
+       1:
+       jmp     1b
+
+test_ok:
+       /* Write msg2 (msg2_end - msg2 bytes) to port 0x3f8. */
+       mov     $0x3f8,%dx
+       cs lea  msg2, %si
+       mov     $(msg2_end-msg2), %cx
+       cs rep/outsb
+
+       /* not a valid port to force exit */
+       outb    %al, $DBG_PORT
+       /*
+        * NOTE(review): nothing stops execution here; if the write above does
+        * not end the run, control falls through into timer_isr - confirm.
+        */
+
+timer_isr:
+       cli
+       pushaw
+       pushfw
+       mov     $0x3f8,%dx
+       mov     $0x2e, %al      # .
+       out     %al,%dx
+       /* Count down the remaining ticks; finish when the counter hits zero. */
+       decw    count
+       jz      test_ok
+       popfw
+       popaw
+       iretw
+
+noop_handler:
+       iretw
+
+count:
+       .word   TEST_COUNT
+
+msg2:
+       .asciz "\nTest OK\n"
+msg2_end:
diff --git a/tools/kvm/threadpool.c b/tools/kvm/threadpool.c
new file mode 100644 (file)
index 0000000..fdc5fa7
--- /dev/null
@@ -0,0 +1,146 @@
+#include "kvm/threadpool.h"
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+static pthread_mutex_t job_mutex       = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t thread_mutex    = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t  job_cond        = PTHREAD_COND_INITIALIZER;
+
+static LIST_HEAD(head);
+
+static pthread_t       *threads;
+static long            threadcount;
+
+/*
+ * Detach and return the job at the front of the queue, or NULL when the
+ * queue is empty. Takes no lock itself.
+ */
+static struct thread_pool__job *thread_pool__job_pop(void)
+{
+       struct thread_pool__job *first = NULL;
+
+       if (!list_empty(&head)) {
+               first = list_first_entry(&head, struct thread_pool__job, queue);
+               list_del(&first->queue);
+       }
+
+       return first;
+}
+
+/* Append @job to the tail of the job queue. Takes no lock itself. */
+static void thread_pool__job_push(struct thread_pool__job *job)
+{
+       list_add_tail(&job->queue, &head);
+}
+
+/* thread_pool__job_pop() performed while holding job_mutex. */
+static struct thread_pool__job *thread_pool__job_pop_locked(void)
+{
+       struct thread_pool__job *job;
+
+       mutex_lock(&job_mutex);
+       job = thread_pool__job_pop();
+       mutex_unlock(&job_mutex);
+       return job;
+}
+
+/* thread_pool__job_push() performed while holding job_mutex. */
+static void thread_pool__job_push_locked(struct thread_pool__job *job)
+{
+       mutex_lock(&job_mutex);
+       thread_pool__job_push(job);
+       mutex_unlock(&job_mutex);
+}
+
+/*
+ * Run @job's callback, then keep taking jobs from the queue and running
+ * them until the queue is empty.
+ *
+ * After each callback, job->signalcount is decremented under job->mutex;
+ * if it is still positive the job is put back on the queue. job_mutex is
+ * taken inside job->mutex here, the same order thread_pool__do_job() uses.
+ */
+static void thread_pool__handle_job(struct thread_pool__job *job)
+{
+       while (job) {
+               job->callback(job->kvm, job->data);
+
+               mutex_lock(&job->mutex);
+
+               if (--job->signalcount > 0)
+                       /* If the job was signaled again while we were working */
+                       thread_pool__job_push_locked(job);
+
+               mutex_unlock(&job->mutex);
+
+               job = thread_pool__job_pop_locked();
+       }
+}
+
+/*
+ * Cleanup handler installed by thread_pool__threadfunc(): releases job_mutex.
+ *
+ * NOTE(review): the unlock is unconditional. Presumably the handler is only
+ * expected to run while the thread holds job_mutex (e.g. when cancelled
+ * inside pthread_cond_wait()) - confirm.
+ */
+static void thread_pool__threadfunc_cleanup(void *param)
+{
+       mutex_unlock(&job_mutex);
+}
+
+/*
+ * Worker thread body: wait for a job to appear on the queue, run it (and
+ * whatever else is queued) via thread_pool__handle_job(), and repeat.
+ *
+ * The queue is checked before waiting and after every wakeup. A job that
+ * was queued and signalled before this thread started waiting is therefore
+ * still picked up, and a wakeup that finds the queue empty just waits again.
+ */
+static void *thread_pool__threadfunc(void *param)
+{
+       pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL);
+
+       for (;;) {
+               struct thread_pool__job *curjob;
+
+               mutex_lock(&job_mutex);
+               while ((curjob = thread_pool__job_pop()) == NULL)
+                       pthread_cond_wait(&job_cond, &job_mutex);
+               mutex_unlock(&job_mutex);
+
+               thread_pool__handle_job(curjob);
+       }
+
+       pthread_cleanup_pop(0);
+
+       return NULL;
+}
+
+/*
+ * Grow the threads array by one slot and start a new worker in it.
+ *
+ * Returns 0 on success and -1 on failure (allocation or thread creation).
+ * threadcount is incremented only when the thread was created.
+ */
+static int thread_pool__addthread(void)
+{
+       int res;
+       void *newthreads;
+
+       mutex_lock(&thread_mutex);
+       newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t));
+       if (newthreads == NULL) {
+               mutex_unlock(&thread_mutex);
+               return -1;
+       }
+
+       threads = newthreads;
+
+       res = pthread_create(threads + threadcount, NULL,
+                            thread_pool__threadfunc, NULL);
+
+       if (res == 0)
+               threadcount++;
+       mutex_unlock(&thread_mutex);
+
+       /*
+        * pthread_create() reports failure as a positive error number; map it
+        * to -1 so the caller's "< 0" check catches it.
+        */
+       return res == 0 ? 0 : -1;
+}
+
+/*
+ * Try to start @thread_count worker threads, stopping at the first
+ * thread_pool__addthread() call that returns a negative value.
+ * Returns the number of calls that did not.
+ */
+int thread_pool__init(unsigned long thread_count)
+{
+       unsigned long started = 0;
+
+       while (started < thread_count) {
+               if (thread_pool__addthread() < 0)
+                       break;
+
+               started++;
+       }
+
+       return started;
+}
+
+/*
+ * Request one run of @job. Does nothing when @job or its callback is NULL.
+ *
+ * job->signalcount is incremented under job->mutex. The job is queued only
+ * when the count was zero beforehand. One waiter on job_cond is then
+ * signalled in every case.
+ */
+void thread_pool__do_job(struct thread_pool__job *job)
+{
+       if (job == NULL)
+               return;
+
+       if (job->callback == NULL)
+               return;
+
+       mutex_lock(&job->mutex);
+       if (job->signalcount++ == 0)
+               thread_pool__job_push_locked(job);
+       mutex_unlock(&job->mutex);
+
+       mutex_lock(&job_mutex);
+       pthread_cond_signal(&job_cond);
+       mutex_unlock(&job_mutex);
+}
diff --git a/tools/kvm/ui/sdl.c b/tools/kvm/ui/sdl.c
new file mode 100644 (file)
index 0000000..a9e7cc7
--- /dev/null
@@ -0,0 +1,158 @@
+#include "kvm/sdl.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <SDL/SDL.h>
+#include <pthread.h>
+#include <signal.h>
+
+#define FRAME_RATE             25
+
+/*
+ * Translation table indexed by the scancode SDL reports for a key event;
+ * the value is the byte handed to kbd_queue(). A zero entry means the key
+ * has no mapping.
+ *
+ * The table has 256 entries so that every possible u8 index is in bounds.
+ *
+ * NOTE(review): the values look like PS/2 scan code set 2 make codes and the
+ * indices like X11 keycodes - confirm.
+ */
+static u8 keymap[256] = {
+       [10]            = 0x16,         /* 1 */
+       [11]            = 0x1e,         /* 2 */
+       [12]            = 0x26,         /* 3 */
+       [13]            = 0x25,         /* 4 */
+       [14]            = 0x2e,         /* 5 */
+       [15]            = 0x36,         /* 6 */
+       [16]            = 0x3d,         /* 7 */
+       [17]            = 0x3e,         /* 8 */
+       [18]            = 0x46,         /* 9 */
+       [19]            = 0x45,         /* 0 */
+       [20]            = 0x4e,         /* - */
+       [21]            = 0x55,         /* + */
+       [22]            = 0x66,         /* <backspace> */
+
+       [24]            = 0x15,         /* q */
+       [25]            = 0x1d,         /* w */
+       [26]            = 0x24,         /* e */
+       [27]            = 0x2d,         /* r */
+       [28]            = 0x2c,         /* t */
+       [29]            = 0x35,         /* y */
+       [30]            = 0x3c,         /* u */
+       [31]            = 0x43,         /* i */
+       [32]            = 0x44,         /* o */
+       [33]            = 0x4d,         /* p */
+
+       [36]            = 0x5a,         /* <enter> */
+
+       [38]            = 0x1c,         /* a */
+       [39]            = 0x1b,         /* s */
+       [40]            = 0x23,         /* d */
+       [41]            = 0x2b,         /* f */
+       [42]            = 0x34,         /* g */
+       [43]            = 0x33,         /* h */
+       [44]            = 0x3b,         /* j */
+       [45]            = 0x42,         /* k */
+       [46]            = 0x4b,         /* l */
+
+       [50]            = 0x12,         /* <left shift> */
+       [51]            = 0x5d,         /* | */
+
+
+       [52]            = 0x1a,         /* z */
+       [53]            = 0x22,         /* x */
+       [54]            = 0x21,         /* c */
+       [55]            = 0x2a,         /* v */
+       [56]            = 0x32,         /* b */
+       [57]            = 0x31,         /* n */
+       [58]            = 0x3a,         /* m */
+       [59]            = 0x41,         /* < */
+       [60]            = 0x49,         /* > */
+       [61]            = 0x4a,         /* / */
+       [62]            = 0x59,         /* <right shift> */
+       [65]            = 0x29,         /* <space> */
+};
+
+/* Look up @scancode in keymap; returns 0 when the key has no mapping. */
+static u8 to_code(u8 scancode)
+{
+       return keymap[scancode];
+}
+
+/*
+ * SDL display thread; @p is the struct framebuffer to show.
+ *
+ * Wraps the framebuffer memory in an SDL surface, opens a window of the same
+ * size and depth, then loops: blit the guest surface to the window, flip,
+ * forward key presses and releases to kbd_queue(), and sleep for one frame
+ * period. When SDL reports SDL_QUIT the loop ends and SIGKVMSTOP is sent
+ * with kill(0, ...). Dies if SDL cannot be initialised or set up.
+ */
+static void *sdl__thread(void *p)
+{
+       Uint32 rmask, gmask, bmask, amask;
+       struct framebuffer *fb = p;
+       SDL_Surface *guest_screen;
+       SDL_Surface *screen;
+       SDL_Event ev;
+       Uint32 flags;
+
+       if (SDL_Init(SDL_INIT_VIDEO) != 0)
+               die("Unable to initialize SDL");
+
+       rmask = 0x000000ff;
+       gmask = 0x0000ff00;
+       bmask = 0x00ff0000;
+       amask = 0x00000000;
+
+       /* The surface aliases fb->mem directly; the pitch is one row in bytes. */
+       guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height, fb->depth, fb->width * fb->depth / 8, rmask, gmask, bmask, amask);
+       if (!guest_screen)
+               die("Unable to create SDL RBG surface");
+
+       flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF;
+
+       SDL_WM_SetCaption("KVM tool", "KVM tool");
+
+       screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags);
+       if (!screen)
+               die("Unable to set SDL video mode");
+
+       SDL_EnableKeyRepeat(200, 50);
+
+       for (;;) {
+               SDL_BlitSurface(guest_screen, NULL, screen, NULL);
+               SDL_Flip(screen);
+
+               while (SDL_PollEvent(&ev)) {
+                       switch (ev.type) {
+                       case SDL_KEYDOWN: {
+                               u8 code = to_code(ev.key.keysym.scancode);
+                               if (code)
+                                       kbd_queue(code);
+                               else
+                                       pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode);
+                               break;
+                       }
+                       case SDL_KEYUP: {
+                               u8 code = to_code(ev.key.keysym.scancode);
+                               /* A release is queued as 0xf0 followed by the key's code. */
+                               if (code) {
+                                       kbd_queue(0xf0);
+                                       kbd_queue(code);
+                               }
+                               break;
+                       }
+                       case SDL_QUIT:
+                               goto exit;
+                       }
+               }
+
+               SDL_Delay(1000 / FRAME_RATE);
+       }
+exit:
+       kill(0, SIGKVMSTOP);
+
+       return NULL;
+}
+
+/*
+ * Start the SDL display thread for @fb. Returns 0 on success and -1 when
+ * the thread cannot be created. The thread handle is not kept.
+ */
+static int sdl__start(struct framebuffer *fb)
+{
+       pthread_t thread;
+
+       if (pthread_create(&thread, NULL, sdl__thread, fb) != 0)
+               return -1;
+
+       return 0;
+}
+
+/* Framebuffer target callbacks for the SDL front end; attached by sdl__init(). */
+static struct fb_target_operations sdl_ops = {
+       .start                  = sdl__start,
+};
+
+/* Attach the SDL target operations (sdl_ops) to @fb. */
+void sdl__init(struct framebuffer *fb)
+{
+       fb__attach(fb, &sdl_ops);
+}
diff --git a/tools/kvm/ui/vnc.c b/tools/kvm/ui/vnc.c
new file mode 100644 (file)
index 0000000..d760492
--- /dev/null
@@ -0,0 +1,218 @@
+#include "kvm/vnc.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+
+#include <linux/types.h>
+#include <rfb/keysym.h>
+#include <rfb/rfb.h>
+#include <pthread.h>
+
+#define VESA_QUEUE_SIZE                128
+#define VESA_IRQ               14
+
+/*
+ * This "6000" value is pretty much the result of experimentation.
+ * It seems that around this value, things update pretty smoothly.
+ */
+#define VESA_UPDATE_TIME       6000
+
+/*
+ * We can map the letters and numbers without a fuss,
+ * but the other characters not so much.
+ */
+static char letters[26] = {
+       0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */
+       0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */
+       0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */
+       0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */
+       0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */
+       0x1a,                         /* z */
+};
+
+static char num[10] = { /* scancodes for '0'..'9'; '4' and '5' match the shifted '$' and '%' entries */
+       0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, 0x3e, 0x46,
+};
+
+/*
+ * Key-event callback installed as server->kbdAddEvent.
+ * @down is non-zero for a press and zero for a release; @key is the
+ * keysym reported by the VNC client; @cl is unused.
+ * The keysym is translated to a PC keyboard scancode by table lookup
+ * (letters, digits) or by the switch below, and queued with
+ * kbd_queue(). Extended keys are preceded by 0xe0 and releases by
+ * 0xf0. Keysyms without a mapping are silently ignored.
+ */
+static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl)
+{
+       char tosend = 0;
+
+       if (key >= 0x41 && key <= 0x5a)
+               key += 0x20; /* convert to lowercase */
+
+       if (key >= 0x61 && key <= 0x7a) /* a-z */
+               tosend = letters[key - 0x61];
+
+       if (key >= 0x30 && key <= 0x39)
+               tosend = num[key - 0x30];
+
+       switch (key) {
+       case XK_Insert:         kbd_queue(0xe0);        tosend = 0x70;  break;
+       case XK_Delete:         kbd_queue(0xe0);        tosend = 0x71;  break;
+       case XK_Up:             kbd_queue(0xe0);        tosend = 0x75;  break;
+       case XK_Down:           kbd_queue(0xe0);        tosend = 0x72;  break;
+       case XK_Left:           kbd_queue(0xe0);        tosend = 0x6b;  break;
+       case XK_Right:          kbd_queue(0xe0);        tosend = 0x74;  break;
+       case XK_Page_Up:        kbd_queue(0xe0);        tosend = 0x7d;  break;
+       case XK_Page_Down:      kbd_queue(0xe0);        tosend = 0x7a;  break;
+       case XK_Home:           kbd_queue(0xe0);        tosend = 0x6c;  break;
+       case XK_BackSpace:      tosend = 0x66;          break;
+       case XK_Tab:            tosend = 0x0d;          break;
+       case XK_Return:         tosend = 0x5a;          break;
+       case XK_Escape:         tosend = 0x76;          break;
+       case XK_End:            kbd_queue(0xe0);        tosend = 0x69;  break;
+       case XK_Shift_L:        tosend = 0x12;          break;
+       case XK_Shift_R:        tosend = 0x59;          break;
+       case XK_Control_R:      kbd_queue(0xe0);        /* fallthrough */
+       case XK_Control_L:      tosend = 0x14;          break;
+       case XK_Alt_R:          kbd_queue(0xe0);        /* fallthrough */
+       case XK_Alt_L:          tosend = 0x11;          break;
+       case XK_quoteleft:      tosend = 0x0e;          break;
+       case XK_minus:          tosend = 0x4e;          break;
+       case XK_equal:          tosend = 0x55;          break;
+       case XK_bracketleft:    tosend = 0x54;          break;
+       case XK_bracketright:   tosend = 0x5b;          break;
+       case XK_backslash:      tosend = 0x5d;          break;
+       case XK_Caps_Lock:      tosend = 0x58;          break;
+       case XK_semicolon:      tosend = 0x4c;          break;
+       case XK_quoteright:     tosend = 0x52;          break;
+       case XK_comma:          tosend = 0x41;          break;
+       case XK_period:         tosend = 0x49;          break;
+       case XK_slash:          tosend = 0x4a;          break;
+       case XK_space:          tosend = 0x29;          break;
+
+       /*
+        * This is where I handle the shifted characters.
+        * They don't really map nicely the way A-Z maps to a-z,
+        * so I'm doing it manually
+        */
+       case XK_exclam:         tosend = 0x16;          break;
+       case XK_quotedbl:       tosend = 0x52;          break;
+       case XK_numbersign:     tosend = 0x26;          break;
+       case XK_dollar:         tosend = 0x25;          break;
+       case XK_percent:        tosend = 0x2e;          break;
+       case XK_ampersand:      tosend = 0x3d;          break;
+       case XK_parenleft:      tosend = 0x46;          break;
+       case XK_parenright:     tosend = 0x45;          break;
+       case XK_asterisk:       tosend = 0x3e;          break;
+       case XK_plus:           tosend = 0x55;          break;
+       case XK_colon:          tosend = 0x4c;          break;
+       case XK_less:           tosend = 0x41;          break;
+       case XK_greater:        tosend = 0x49;          break;
+       case XK_question:       tosend = 0x4a;          break;
+       case XK_at:             tosend = 0x1e;          break;
+       case XK_asciicircum:    tosend = 0x36;          break;
+       case XK_underscore:     tosend = 0x4e;          break;
+       case XK_braceleft:      tosend = 0x54;          break;
+       case XK_braceright:     tosend = 0x5b;          break;
+       case XK_bar:            tosend = 0x5d;          break;
+       case XK_asciitilde:     tosend = 0x0e;          break;
+       default:                break;
+       }
+
+       /*
+        * If this is a "key up" event (the user has released the key), we
+        * need to send 0xf0 first.
+        */
+       if (!down && tosend != 0x0)
+               kbd_queue(0xf0);
+
+       if (tosend)
+               kbd_queue(tosend);
+}
+
+/* The previous X and Y coordinates of the mouse; -1 until the first event */
+static int xlast = -1, ylast = -1;
+
+/*
+ * Pointer callback (server->ptrAddEvent): queue a 3-byte PS/2 packet for the move.
+ */
+static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl)
+{
+       int dx, dy;
+       char b1 = 0x8;
+
+       /* RFB bits 0/1/2 are left/middle/right; PS/2 bits 0/1/2 are left/right/middle */
+       b1 |= (buttonMask & 0x1) | ((buttonMask & 0x4) >> 1) | ((buttonMask & 0x2) << 1);
+
+       if (xlast >= 0 && ylast >= 0) {
+               /* The PS/2 mouse sends deltas, not absolutes */
+               dx = x - xlast;
+               dy = ylast - y;
+
+               /* Set overflow bits if a delta does not fit in 9-bit two's complement */
+               if (dy > 255 || dy < -256)
+                       b1 |= 0x80;
+               if (dx > 255 || dx < -256)
+                       b1 |= 0x40;
+
+               /* Set negative bits if needed */
+               if (dy < 0)
+                       b1 |= 0x20;
+               if (dx < 0)
+                       b1 |= 0x10;
+
+               mouse_queue(b1);
+               mouse_queue(dx);
+               mouse_queue(dy);
+       }
+
+       xlast = x;
+       ylast = y;
+       rfbDefaultPtrAddEvent(buttonMask, x, y, cl);
+}
+
+static void *vnc__thread(void *p)
+{
+       struct framebuffer *fb = p;
+       /*
+        * rfbGetScreen() wants an argc/argv pair; pass a well-formed one.
+        */
+       char *argv[] = { "kvm", NULL };
+       int argc = 1;
+       rfbScreenInfoPtr server;
+
+       server = rfbGetScreen(&argc, argv, fb->width, fb->height, 8, 3, 4);
+       if (!server)
+               return NULL;    /* no screen was created: nothing to serve */
+       server->frameBuffer             = fb->mem;
+       server->alwaysShared            = TRUE;
+       server->kbdAddEvent             = kbd_handle_key;
+       server->ptrAddEvent             = kbd_handle_ptr;
+       rfbInitServer(server);
+
+       while (rfbIsActive(server)) {
+               rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height);
+               rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME);
+       }
+       return NULL;
+}
+
+static int vnc__start(struct framebuffer *fb)
+{
+       pthread_t tid;
+       int err;
+
+       /* Run vnc__thread() on its own thread, with fb as its argument. */
+       err = pthread_create(&tid, NULL, vnc__thread, fb);
+       return err ? -1 : 0;
+}
+
+static struct fb_target_operations vnc_ops = {
+       .start                  = vnc__start,   /* creates the thread running vnc__thread() */
+};
+
+void vnc__init(struct framebuffer *fb)
+{
+       fb__attach(fb, &vnc_ops);       /* hand the VNC operations table to fb__attach() for fb */
+}
diff --git a/tools/kvm/util.c b/tools/kvm/util.c
new file mode 100644 (file)
index 0000000..4efbce9
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * Taken from perf, which in turn took it from Git
+ */
+
+#include "kvm/util.h"
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+       char text[1024];        /* vsnprintf() truncates anything longer */
+       vsnprintf(text, sizeof text, err, params);
+       fprintf(stderr, " %s%s\n", prefix, text);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+       report(" Fatal: ", err, params);        /* print the formatted message to stderr */
+       exit(128);                              /* fatal errors end the process with status 128 */
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+       report(" Error: ", err, params);        /* non-fatal: print to stderr and return */
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+       report(" Warning: ", warn, params);     /* print to stderr and return */
+}
+
+static void info_builtin(const char *info, va_list params)
+{
+       report(" Info: ", info, params);        /* print to stderr (not stdout) and return */
+}
+
+void die(const char *err, ...)
+{
+       va_list params;
+
+       va_start(params, err);
+       die_builtin(err, params);       /* declared NORETURN: prints the message and exits */
+       va_end(params);                 /* not reached */
+}
+
+int pr_error(const char *err, ...)
+{
+       va_list params;
+
+       va_start(params, err);
+       error_builtin(err, params);     /* printf-style message, written to stderr */
+       va_end(params);
+       return -1;                      /* always -1, so callers can write "return pr_error(...)" */
+}
+
+void pr_warning(const char *warn, ...)
+{
+       va_list params;
+
+       va_start(params, warn);
+       warn_builtin(warn, params);     /* printf-style message, written to stderr */
+       va_end(params);
+}
+
+void pr_info(const char *info, ...)
+{
+       va_list params;
+
+       va_start(params, info);
+       info_builtin(info, params);     /* printf-style message, written to stderr */
+       va_end(params);
+}
+
+void die_perror(const char *s)
+{
+       perror(s);      /* prints s followed by the message for the current errno */
+       exit(1);        /* note: status 1 here, whereas die() exits with 128 */
+}
+
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ */
+size_t strlcat(char *dest, const char *src, size_t count)
+{
+       size_t dsize = strlen(dest);
+       size_t len = strlen(src);
+       size_t res = dsize + len;
+       /* res, the return value, is the combined length before any truncation */
+       DIE_IF(dsize >= count);
+       /* step to dest's terminator; count becomes the room left, NUL included */
+       dest += dsize;
+       count -= dsize;
+       if (len >= count)
+               len = count - 1;
+       /* len now fits with one byte to spare for the terminator */
+       memcpy(dest, src, len);
+       dest[len] = 0;
+
+       return res;
+}
diff --git a/tools/kvm/util/KVMTOOLS-VERSION-GEN b/tools/kvm/util/KVMTOOLS-VERSION-GEN
new file mode 100755 (executable)
index 0000000..1af9d6c
--- /dev/null
@@ -0,0 +1,40 @@
+#!/bin/sh
+# Rewrite ${OUTPUT}KVMTOOLS-VERSION-FILE whenever the computed version changes.
+if [ $# -eq 1 ]  ; then
+       OUTPUT=$1
+fi
+# $1, when given, is prepended verbatim to the version file name.
+GVF=${OUTPUT}KVMTOOLS-VERSION-FILE
+# LF holds a single newline; it is used to reject multi-line describe output.
+LF='
+'
+
+# First check if there is a .git to get the version from git describe
+# otherwise try to get the version from the kernel makefile
+if test -d ../../.git -o -f ../../.git &&
+       VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+       case "$VN" in
+       *$LF*) (exit 1) ;;
+       v[0-9]*)
+               git update-index -q --refresh
+               test -z "$(git diff-index --name-only HEAD --)" ||
+               VN="$VN-dirty" ;;
+       esac
+then
+       VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+       VN=$(MAKEFLAGS= make -sC ../.. kernelversion)
+fi
+# Drop any leading "v" characters from the version string.
+VN=$(expr "$VN" : v*'\(.*\)')
+# VC is the version already recorded in $GVF, or "unset" if it is unreadable.
+if test -r $GVF
+then
+       VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF)
+else
+       VC=unset
+fi
+test "$VN" = "$VC" || {
+       echo >&2 "KVMTOOLS_VERSION = $VN"
+       echo "KVMTOOLS_VERSION = $VN" >$GVF
+}
diff --git a/tools/kvm/util/generate-cmdlist.sh b/tools/kvm/util/generate-cmdlist.sh
new file mode 100755 (executable)
index 0000000..fe0178e
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Print a C table (name, one-line help) of the commands marked "common" in command-list.txt.
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+# For each common command, turn the "kvm-<cmd> - <summary>" line of its NAME section into an initializer.
+sed -n 's/^kvm-\([^ \t]*\).*common/\1/p' command-list.txt |
+while read cmd
+do
+        # TODO following sed command should be fixed
+     sed -n '/^NAME/,/^kvm-'"$cmd"'/ {
+                /NAME/d
+                /--/d
+                s/.*kvm-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+            p
+        }' "Documentation/kvm-$cmd.txt"
+done
+echo "};"
diff --git a/tools/kvm/util/kvm-ifup-vbr0 b/tools/kvm/util/kvm-ifup-vbr0
new file mode 100755 (executable)
index 0000000..a91c37f
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+switch=vbr0                            # bridge that the interface named by $1 is added to
+/sbin/ifconfig $1 0.0.0.0 up           # bring the interface up with address 0.0.0.0
+/usr/sbin/brctl addif ${switch} $1     # add it to the bridge
+/usr/sbin/brctl setfd ${switch} 0      # set the bridge forward delay to 0
+/usr/sbin/brctl stp ${switch} off      # turn spanning tree off on the bridge
diff --git a/tools/kvm/util/parse-options.c b/tools/kvm/util/parse-options.c
new file mode 100644 (file)
index 0000000..c280379
--- /dev/null
@@ -0,0 +1,631 @@
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <stdbool.h>
+
+/* user defined includes */
+#include <linux/types.h>
+#include <kvm/util.h>
+#include <kvm/parse-options.h>
+#include <kvm/strbuf.h>
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+       if (flags & OPT_SHORT)  /* the short form wins when both flags are set */
+               return pr_error("switch `%c' %s", opt->short_name, reason);
+       return (flags & OPT_UNSET)
+               ? pr_error("option `no-%s' %s", opt->long_name, reason)
+               : pr_error("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+               int flags, const char **arg)
+{
+       if (p->opt) {   /* a value is already attached to the option being parsed */
+               *arg = p->opt;
+               p->opt = NULL;
+       } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 ||
+                               **(p->argv + 1) == '-')) {
+               *arg = (const char *)opt->defval;       /* no next word, or it starts with '-' */
+       } else if (p->argc > 1) {
+               p->argc--;      /* consume the next argv word as the value */
+               *arg = *++p->argv;
+       } else
+               return opterror(opt, "requires a value", flags);
+       return 0;
+}
+
+#define numvalue(c)                                    \
+       ((c) >= 'a' ? (c) - 'a' + 10 : (c) >= 'A' ? (c) - 'A' + 10 :   \
+        (c) <= '9' ? (c) - '0' : 16)
+
+static u64 readhex(const char *str, bool *error)
+{
+       char *pos = strchr(str, 'x') + 1;       /* digits start after the first 'x' */
+       u64 res = 0;
+
+       while (*pos) {
+               unsigned int v = numvalue(*pos);
+               /* valid digits are 0..15; characters below '0' wrap to a huge unsigned value */
+               if (v > 15) {
+                       *error = true;
+                       return 0;
+               }
+               res = (res * 16) + v;
+               pos++;
+       }
+
+       *error = false;
+       return res;
+}
+
+static int readnum(const struct option *opt, int flags,
+                  const char *str, char **end)
+{
+       if (strchr(str, 'x')) { /* an 'x' anywhere in the string selects the hex parser */
+               bool error;
+               u64 value;
+
+               value = readhex(str, &error);
+               if (error)
+                       goto enotnum;
+               /* store with the width opt->type declares; *end is not written on this path */
+               switch (opt->type) {
+               case OPTION_INTEGER:
+                       *(int *)opt->value = value;
+                       break;
+               case OPTION_UINTEGER:
+                       *(unsigned int *)opt->value = value;
+                       break;
+               case OPTION_LONG:
+                       *(long *)opt->value = value;
+                       break;
+               case OPTION_U64:
+                       *(u64 *)opt->value = value;
+                       break;
+               default:
+                       goto invcall;
+               }
+       } else {        /* decimal; NOTE(review): *end is never inspected, so trailing junk is not rejected here */
+               switch (opt->type) {
+               case OPTION_INTEGER:
+                       *(int *)opt->value = strtol(str, end, 10);
+                       break;
+               case OPTION_UINTEGER:
+                       *(unsigned int *)opt->value = strtol(str, end, 10);
+                       break;
+               case OPTION_LONG:
+                       *(long *)opt->value = strtol(str, end, 10);
+                       break;
+               case OPTION_U64:
+                       *(u64 *)opt->value = strtoull(str, end, 10);
+                       break;
+               default:
+                       goto invcall;
+               }
+       }
+
+       return 0;
+
+enotnum:
+       return opterror(opt, "expects a numerical value", flags);
+invcall:
+       return opterror(opt, "invalid numeric conversion", flags);
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+               const struct option *opt, int flags)
+{
+       const char *s, *arg = NULL;
+       const int unset = flags & OPT_UNSET;
+       /* A negated option takes no value and may be disallowed by PARSE_OPT_NONEG. */
+       if (unset && p->opt)
+               return opterror(opt, "takes no value", flags);
+       if (unset && (opt->flags & PARSE_OPT_NONEG))
+               return opterror(opt, "isn't available", flags);
+       /* Long form with an attached value: reject it for the value-less option types. */
+       if (!(flags & OPT_SHORT) && p->opt) {
+               switch (opt->type) {
+               case OPTION_CALLBACK:
+                       if (!(opt->flags & PARSE_OPT_NOARG))
+                               break;
+               /* FALLTHROUGH */
+               case OPTION_BOOLEAN:
+               case OPTION_INCR:
+               case OPTION_BIT:
+               case OPTION_SET_UINT:
+               case OPTION_SET_PTR:
+                       return opterror(opt, "takes no value", flags);
+               case OPTION_END:
+               case OPTION_ARGUMENT:
+               case OPTION_GROUP:
+               case OPTION_STRING:
+               case OPTION_INTEGER:
+               case OPTION_UINTEGER:
+               case OPTION_LONG:
+               case OPTION_U64:
+               default:
+                       break;
+               }
+       }
+       /* Apply the option to *opt->value (or call its callback) according to opt->type. */
+       switch (opt->type) {
+       case OPTION_BIT:
+               if (unset)
+                       *(int *)opt->value &= ~opt->defval;
+               else
+                       *(int *)opt->value |= opt->defval;
+               return 0;
+
+       case OPTION_BOOLEAN:
+               *(bool *)opt->value = unset ? false : true;
+               return 0;
+
+       case OPTION_INCR:
+               *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+               return 0;
+
+       case OPTION_SET_UINT:
+               *(unsigned int *)opt->value = unset ? 0 : opt->defval;
+               return 0;
+
+       case OPTION_SET_PTR:
+               *(void **)opt->value = unset ? NULL : (void *)opt->defval;
+               return 0;
+
+       case OPTION_STRING:
+               if (unset)
+                       *(const char **)opt->value = NULL;
+               else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+                       *(const char **)opt->value = (const char *)opt->defval;
+               else
+                       return get_arg(p, opt, flags,
+                                       (const char **)opt->value);
+               return 0;
+
+       case OPTION_CALLBACK:
+               if (unset)
+                       return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+               if (opt->flags & PARSE_OPT_NOARG)
+                       return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+                       return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+       case OPTION_INTEGER:
+               if (unset) {
+                       *(int *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(int *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);   /* s is only readnum()'s end-pointer slot; never read here */
+
+       case OPTION_UINTEGER:
+               if (unset) {
+                       *(unsigned int *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(unsigned int *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_LONG:
+               if (unset) {
+                       *(long *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(long *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_U64:
+               if (unset) {
+                       *(u64 *)opt->value = 0;
+                       return 0;
+               }
+               if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+                       *(u64 *)opt->value = opt->defval;
+                       return 0;
+               }
+               if (get_arg(p, opt, flags, &arg))
+                       return -1;
+               return readnum(opt, flags, arg, (char **)&s);
+
+       case OPTION_END:
+       case OPTION_ARGUMENT:
+       case OPTION_GROUP:
+       default:
+               die("should not happen, someone must be hit on the forehead");
+       }
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+static int usage_with_options_internal(const char * const *usagestr,
+               const struct option *opts, int full)
+{
+       if (!usagestr)
+               return PARSE_OPT_HELP;
+       /* First string is the "usage:" line; following non-empty strings are "or:" alternatives. */
+       fprintf(stderr, "\n usage: %s\n", *usagestr++);
+       while (*usagestr && **usagestr)
+               fprintf(stderr, "    or: %s\n", *usagestr++);
+       while (*usagestr) {
+               fprintf(stderr, "%s%s\n",
+                               **usagestr ? "    " : "",
+                               *usagestr);
+               usagestr++;
+       }
+
+       if (opts->type != OPTION_GROUP)
+               fputc('\n', stderr);
+
+       for (; opts->type != OPTION_END; opts++) {
+               size_t pos;
+               int pad;
+
+               if (opts->type == OPTION_GROUP) {
+                       fputc('\n', stderr);
+                       if (*opts->help)
+                               fprintf(stderr, "%s\n", opts->help);
+                       continue;
+               }
+               if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+                       continue;
+               /* pos accumulates the characters printed so far, to align the help text below */
+               pos = fprintf(stderr, "    ");
+               if (opts->short_name)
+                       pos += fprintf(stderr, "-%c", opts->short_name);
+               else
+                       pos += fprintf(stderr, "    ");
+
+               if (opts->long_name && opts->short_name)
+                       pos += fprintf(stderr, ", ");
+               if (opts->long_name)
+                       pos += fprintf(stderr, "--%s", opts->long_name);
+               /* append the argument placeholder, bracketed when the argument is optional */
+               switch (opts->type) {
+               case OPTION_ARGUMENT:
+                       break;
+               case OPTION_LONG:
+               case OPTION_U64:
+               case OPTION_INTEGER:
+               case OPTION_UINTEGER:
+                       if (opts->flags & PARSE_OPT_OPTARG)
+                               if (opts->long_name)
+                                       pos += fprintf(stderr, "[=<n>]");
+                               else
+                                       pos += fprintf(stderr, "[<n>]");
+                       else
+                               pos += fprintf(stderr, " <n>");
+                       break;
+               case OPTION_CALLBACK:
+                       if (opts->flags & PARSE_OPT_NOARG)
+                               break;
+               /* FALLTHROUGH */
+               case OPTION_STRING:
+                       if (opts->argh) {
+                               if (opts->flags & PARSE_OPT_OPTARG)
+                                       if (opts->long_name)
+                                               pos += fprintf(stderr, "[=<%s>]", opts->argh);
+                                       else
+                                               pos += fprintf(stderr, "[<%s>]", opts->argh);
+                               else
+                                       pos += fprintf(stderr, " <%s>", opts->argh);
+                       } else {
+                               if (opts->flags & PARSE_OPT_OPTARG)
+                                       if (opts->long_name)
+                                               pos += fprintf(stderr, "[=...]");
+                                       else
+                                               pos += fprintf(stderr, "[...]");
+                               else
+                                       pos += fprintf(stderr, " ...");
+                       }
+                               break;
+               default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
+               case OPTION_END:
+               case OPTION_GROUP:
+               case OPTION_BIT:
+               case OPTION_BOOLEAN:
+               case OPTION_INCR:
+               case OPTION_SET_UINT:
+               case OPTION_SET_PTR:
+                       break;
+               }
+               if (pos <= USAGE_OPTS_WIDTH)
+                       pad = USAGE_OPTS_WIDTH - pos;
+               else {  /* option text too wide: put the help on the next line */
+                       fputc('\n', stderr);
+                       pad = USAGE_OPTS_WIDTH;
+               }
+               fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+       }
+       fputc('\n', stderr);
+
+       return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+               const struct option *opts)
+{
+       usage_with_options_internal(usagestr, opts, 0); /* full == 0: hidden options are omitted */
+       exit(129);                                      /* does not return to the caller */
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+       if (strlen(arg) < 3)    /* too short to be a mistyped long option */
+               return;
+       /* presumably prefixcmp(s, prefix) returns 0 when s starts with prefix - confirm against its definition */
+       if (!prefixcmp(arg, "no-")) {
+               pr_error ("did you mean `--%s` (with two dashes ?)", arg);
+               exit(129);
+       }
+
+       for (; options->type != OPTION_END; options++) {
+               if (!options->long_name)
+                       continue;
+               if (!prefixcmp(options->long_name, arg)) {
+                       pr_error ("did you mean `--%s` (with two dashes ?)", arg);
+                       exit(129);
+               }
+       }
+}
+
+static int parse_options_usage(const char * const *usagestr,
+               const struct option *opts)
+{
+       return usage_with_options_internal(usagestr, opts, 0);  /* prints usage, returns PARSE_OPT_HELP; no exit */
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p,
+        const struct option *options)
+{
+       for (; options->type != OPTION_END; options++) {
+               if (options->short_name == *p->opt) {
+                       p->opt = p->opt[1] ? p->opt + 1 : NULL; /* step past this letter; NULL when none remain */
+                       return get_value(p, options, OPT_SHORT);
+               }
+       }
+       return -2;      /* no option has this short name */
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+               const struct option *options)
+{
+       const char *arg_end = strchr(arg, '=');
+       const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+       int abbrev_flags = 0, ambiguous_flags = 0;
+       /* arg_end marks the end of the option name: the '=' if present, else the terminating NUL */
+       if (!arg_end)
+               arg_end = arg + strlen(arg);
+
+       for (; options->type != OPTION_END; options++) {
+               const char *rest;
+               int flags = 0;
+
+               if (!options->long_name)
+                       continue;
+               /* presumably rest points just past long_name when arg starts with it, else NULL - confirm skip_prefix() */
+               rest = skip_prefix(arg, options->long_name);
+               if (options->type == OPTION_ARGUMENT) {
+                       if (!rest)
+                               continue;
+                       if (*rest == '=')
+                               return opterror(options, "takes no value",
+                                               flags);
+                       if (*rest)
+                               continue;
+                       p->out[p->cpidx++] = arg - 2;
+                       return 0;
+               }
+               if (!rest) {
+                       /* abbreviated? */
+                       if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+                               if (abbrev_option) {
+                                       /*
+                                        * If this is abbreviated, it is
+                                        * ambiguous. So when there is no
+                                        * exact match later, we need to
+                                        * error out.
+                                        */
+                                       ambiguous_option = abbrev_option;
+                                       ambiguous_flags = abbrev_flags;
+                               }
+                               if (!(flags & OPT_UNSET) && *arg_end)
+                                       p->opt = arg_end + 1;
+                               abbrev_option = options;
+                               abbrev_flags = flags;
+                               continue;
+                       }
+                       /* negated and abbreviated very much? */
+                       if (!prefixcmp("no-", arg)) {
+                               flags |= OPT_UNSET;
+                               goto is_abbreviated;
+                       }
+                       /* negated? */
+                       if (strncmp(arg, "no-", 3))
+                               continue;
+                       flags |= OPT_UNSET;
+                       rest = skip_prefix(arg + 3, options->long_name);
+                       /* abbreviated and negated? */
+                       if (!rest && !prefixcmp(options->long_name, arg + 3))
+                               goto is_abbreviated;
+                       if (!rest)
+                               continue;
+               }
+               if (*rest) {
+                       if (*rest != '=')
+                               continue;
+                       p->opt = rest + 1;      /* the value follows the '=' */
+               }
+               return get_value(p, options, flags);
+       }
+
+       if (ambiguous_option)
+               return pr_error("Ambiguous option: %s "
+                               "(could be --%s%s or --%s%s)",
+                               arg,
+                               (ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+                               ambiguous_option->long_name,
+                               (abbrev_flags & OPT_UNSET) ?  "no-" : "",
+                               abbrev_option->long_name);
+       if (abbrev_option)
+               return get_value(p, abbrev_option, abbrev_flags);
+       return -2;      /* neither an exact nor an abbreviated match */
+}
+
+
+static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc,
+               const char **argv, int flags)
+{
+       memset(ctx, 0, sizeof(*ctx));
+       ctx->argc = argc;
+       ctx->argv = argv;
+       ctx->out  = argv;       /* kept arguments are written back into the same argv array */
+       ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);     /* output index starts at 1 with KEEP_ARGV0, else 0 */
+       ctx->flags = flags;
+       if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+                       (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+               die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+       memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));       /* append the unparsed remainder */
+       ctx->out[ctx->cpidx + ctx->argc] = NULL;        /* keep the array NULL-terminated */
+       return ctx->cpidx + ctx->argc;                  /* number of arguments left in ctx->out */
+}
+
+
+/*
+ * Consume arguments from ctx->argv until they run out, a "--" is seen, or
+ * (with PARSE_OPT_STOP_AT_NON_OPTION) the first non-option argument appears.
+ * Non-option arguments, and unknown options when PARSE_OPT_KEEP_UNKNOWN is
+ * set, are copied to ctx->out.
+ *
+ * Returns PARSE_OPT_DONE on normal completion, PARSE_OPT_UNKNOWN for an
+ * unrecognised option that is not kept, or the value of the usage helpers
+ * when help is requested or an option parser reports -1.
+ */
+static int parse_options_step(struct parse_opt_ctx_t *ctx,
+               const struct option *options, const char * const usagestr[])
+{
+       int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+       /* we must reset ->opt, unknown short option leave it dangling */
+       ctx->opt = NULL;
+
+       for (; ctx->argc; ctx->argc--, ctx->argv++) {
+               const char *arg = ctx->argv[0];
+
+               /* Anything not starting with '-', and a lone "-", is a non-option. */
+               if (*arg != '-' || !arg[1]) {
+                       if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+                               break;
+                       ctx->out[ctx->cpidx++] = ctx->argv[0];
+                       continue;
+               }
+
+               /* Single dash: one or more aggregated short options. */
+               if (arg[1] != '-') {
+                       ctx->opt = arg + 1;
+                       if (internal_help && *ctx->opt == 'h')
+                               return parse_options_usage(usagestr, options);
+                       switch (parse_short_opt(ctx, options)) {
+                       case -1:
+                               return parse_options_usage(usagestr, options);
+                       case -2:
+                               goto unknown;
+                       default:
+                               break;
+                       }
+                       /* More characters follow the first short option. */
+                       if (ctx->opt)
+                               check_typos(arg + 1, options);
+                       while (ctx->opt) {
+                               if (internal_help && *ctx->opt == 'h')
+                                       return parse_options_usage(usagestr,
+                                                       options);
+                               switch (parse_short_opt(ctx, options)) {
+                               case -1:
+                                       return parse_options_usage(usagestr,
+                                                       options);
+                               case -2:
+                                       /* fake a short option thing to hide
+                                        * the fact that we may have
+                                        * started to parse aggregated stuff
+                                        *
+                                        * This is leaky, too bad.
+                                        */
+                                       ctx->argv[0] = strdup(ctx->opt - 1);
+                                       *(char *)ctx->argv[0] = '-';
+                                       goto unknown;
+                               default:
+                                       break;
+                               }
+                       }
+                       continue;
+               }
+
+               if (!arg[2]) { /* "--" */
+                       /* Drop the "--" itself unless the caller asked to keep it. */
+                       if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+                               ctx->argc--;
+                               ctx->argv++;
+                       }
+                       break;
+               }
+
+               /* Double dash: a long option; arg + 2 skips the "--". */
+               if (internal_help && !strcmp(arg + 2, "help-all"))
+                       return usage_with_options_internal(usagestr, options,
+                                       1);
+               if (internal_help && !strcmp(arg + 2, "help"))
+                       return parse_options_usage(usagestr, options);
+               switch (parse_long_opt(ctx, arg + 2, options)) {
+               case -1:
+                       return parse_options_usage(usagestr, options);
+               case -2:
+                       goto unknown;
+               default:
+                       break;
+               }
+               continue;
+unknown:
+               /* Unknown option: either report it or pass it through untouched. */
+               if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+                       return PARSE_OPT_UNKNOWN;
+               ctx->out[ctx->cpidx++] = ctx->argv[0];
+               ctx->opt = NULL;
+       }
+       return PARSE_OPT_DONE;
+}
+
+/*
+ * Parse argv against 'options' in one go.
+ *
+ * Exits with status 129 when help was shown.  For an unknown option an
+ * error is printed and usage_with_options() is called.  Otherwise returns
+ * the number of arguments left in argv after parsing.
+ */
+int parse_options(int argc, const char **argv, const struct option *options,
+               const char * const usagestr[], int flags)
+{
+       struct parse_opt_ctx_t ctx;
+       int status;
+
+       parse_options_start(&ctx, argc, argv, flags);
+       status = parse_options_step(&ctx, options, usagestr);
+
+       if (status == PARSE_OPT_HELP)
+               exit(129);
+
+       if (status != PARSE_OPT_DONE) {
+               /* PARSE_OPT_UNKNOWN: name the offending long or short option. */
+               if (ctx.argv[0][1] == '-')
+                       pr_error("unknown option `%s'", ctx.argv[0] + 2);
+               else
+                       pr_error("unknown switch `%c'", *ctx.opt);
+               usage_with_options(usagestr, options);
+       }
+
+       return parse_options_end(&ctx);
+}
diff --git a/tools/kvm/util/rbtree-interval.c b/tools/kvm/util/rbtree-interval.c
new file mode 100644 (file)
index 0000000..d02fbf0
--- /dev/null
@@ -0,0 +1,90 @@
+#include <kvm/rbtree-interval.h>
+#include <stddef.h>
+
+/* Shorthand: get the struct rb_int_node for a struct rb_node, via rb_entry() on its 'node' member. */
+#define rb_int(n) rb_entry(n, struct rb_int_node, node)
+
+/*
+ * Look up an interval containing 'point', where an interval covers
+ * low <= point < high.  Returns the matching node, or NULL if the
+ * descent ends without a match.
+ */
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point)
+{
+       struct rb_node *pos = root->rb_node;
+
+       while (pos) {
+               struct rb_int_node *entry = rb_int(pos);
+               struct rb_node *left = pos->rb_left;
+
+               /* The left subtree's max_high is above the point: descend left first. */
+               if (left && rb_int(left)->max_high > point) {
+                       pos = left;
+                       continue;
+               }
+
+               if (entry->low <= point && point < entry->high)
+                       return entry;
+
+               /* Only continue to the right when the point lies above this node's low. */
+               if (point <= entry->low)
+                       return NULL;
+
+               pos = pos->rb_right;
+       }
+
+       return NULL;
+}
+
+/*
+ * Find the interval that holds 'low' and accept it only if 'high' does not
+ * exceed that interval's end.  Returns NULL when 'low' is in no interval or
+ * the requested range extends past the interval found.
+ */
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high)
+{
+       struct rb_int_node *hit = rb_int_search_single(root, low);
+
+       if (hit != NULL && hit->high >= high)
+               return hit;
+
+       return NULL;
+}
+
+/*
+ * Augment callback: recompute a node's max_high as the largest of its own
+ * 'high' and the max_high of whichever children exist.  'arg' is unused.
+ */
+static void update_node_max_high(struct rb_node *node, void *arg)
+{
+       struct rb_int_node *self = rb_int(node);
+       struct rb_node *left = node->rb_left;
+       struct rb_node *right = node->rb_right;
+
+       self->max_high = self->high;
+
+       if (left)
+               self->max_high = max(self->max_high, rb_int(left)->max_high);
+       if (right)
+               self->max_high = max(self->max_high, rb_int(right)->max_high);
+}
+
+/*
+ * Insert 'i_node' into the interval tree, keyed by its 'low' value.
+ *
+ * Returns 1 on success, or 0 (tree unchanged) when a node with the same
+ * 'low' is already present.
+ */
+int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node)
+{
+       struct rb_node **node   = &(root->rb_node), *parent = NULL;
+
+       while (*node) {
+               struct rb_int_node *cur = rb_int(*node);
+
+               parent = *node;
+               /*
+                * Compare the keys directly.  Storing their difference in an
+                * 'int' truncates it, so keys that differ only in the upper
+                * bits would look equal, and large gaps could flip the sign.
+                */
+               if (i_node->low < cur->low)
+                       node    = &((*node)->rb_left);
+               else if (i_node->low > cur->low)
+                       node    = &((*node)->rb_right);
+               else
+                       return 0;
+       }
+
+       rb_link_node(&i_node->node, parent, node);
+       rb_insert_color(&i_node->node, root);
+
+       /* Refresh max_high for the nodes affected by the insertion. */
+       rb_augment_insert(&i_node->node, update_node_max_high, NULL);
+       return 1;
+}
+
+/*
+ * Remove 'node' from the interval tree and refresh the max_high values
+ * through the rb_augment_erase_begin()/rb_augment_erase_end() pair that
+ * brackets the actual rb_erase().
+ */
+void rb_int_erase(struct rb_root *root, struct rb_int_node *node)
+{
+       struct rb_node *deepest;
+
+       /* Must be obtained before rb_erase() changes the tree. */
+       deepest = rb_augment_erase_begin(&node->node);
+       rb_erase(&node->node, root);
+       rb_augment_erase_end(deepest, update_node_max_high, NULL);
+
+}
diff --git a/tools/kvm/util/set_private_br.sh b/tools/kvm/util/set_private_br.sh
new file mode 100755 (executable)
index 0000000..49867dd
--- /dev/null
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Author: Amos Kong <kongjianjun@gmail.com>
+# Date: Apr 14, 2011
+# Description: this script is used to create/delete a private bridge,
+# launch a dhcp server on the bridge by dnsmasq.
+#
+# @ ./set_private_br.sh $bridge_name $subnet_prefix
+# @ ./set_private_br.sh vbr0 192.168.33
+
+brname='vbr0'
+subnet='192.168.33'
+
+# Create bridge $brname, give it the address $subnet.1, enable NAT for the
+# subnet and start dnsmasq as DHCP server on the bridge address.
+add_br()
+{
+    echo "add new private bridge: $brname"
+    /usr/sbin/brctl addbr $brname
+    # Disable IPv6 on the bridge and turn on IPv4 forwarding system-wide.
+    echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6
+    echo 1 > /proc/sys/net/ipv4/ip_forward
+    /usr/sbin/brctl stp $brname on
+    /usr/sbin/brctl setfd $brname 0
+    ifconfig $brname $subnet.1
+    ifconfig $brname up
+    # Add forward rule, then guest can access public network
+    iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+    # Stop any running dnsmasq/tftpd-hpa services before starting our own dnsmasq.
+    /etc/init.d/dnsmasq stop
+    /etc/init.d/tftpd-hpa stop 2>/dev/null
+    # NOTE(review): $tftp_cmd is not assigned anywhere in this script, so it
+    # expands to nothing unless inherited from the environment.
+    dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd
+}
+
+# Undo add_br: kill a dnsmasq process, take the bridge down, delete it and
+# remove the NAT rule.  Uses the current values of $brname and $subnet.
+del_br()
+{
+    echo "cleanup bridge setup"
+    # Kills only the last PID that pgrep reports for dnsmasq.
+    kill -9 `pgrep dnsmasq|tail -1`
+    ifconfig $brname down
+    /usr/sbin/brctl delbr $brname
+    iptables -t nat -D POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+}
+
+
+if [ $# = 0 ]; then
+    del_br 2>/dev/null
+    exit
+fi
+if [ $# > 1 ]; then
+    brname="$1"
+fi
+if [ $# = 2 ]; then
+    subnet="$2"
+fi
+add_br
diff --git a/tools/kvm/util/strbuf.c b/tools/kvm/util/strbuf.c
new file mode 100644 (file)
index 0000000..ec77ab1
--- /dev/null
@@ -0,0 +1,13 @@
+
+/* user defined headers */
+#include <kvm/strbuf.h>
+
+/*
+ * Test whether 'str' begins with 'prefix'.
+ *
+ * Returns 0 when it does (an empty prefix always matches).  Otherwise
+ * returns the difference, as unsigned chars, between the first mismatching
+ * byte of 'prefix' and the corresponding byte of 'str'.
+ */
+int prefixcmp(const char *str, const char *prefix)
+{
+       while (*prefix) {
+               if (*str != *prefix)
+                       return (unsigned char)*prefix - (unsigned char)*str;
+               str++;
+               prefix++;
+       }
+
+       return 0;
+}
diff --git a/tools/kvm/virtio/9p-pdu.c b/tools/kvm/virtio/9p-pdu.c
new file mode 100644 (file)
index 0000000..b9ce8ce
--- /dev/null
@@ -0,0 +1,287 @@
+#include "kvm/util.h"
+#include "kvm/virtio-9p.h"
+
+#include <endian.h>
+#include <stdint.h>
+
+#include <linux/compiler.h>
+#include <net/9p/9p.h>
+
+/*
+ * Copy up to 'size' bytes from the pdu's out_iov array into 'data',
+ * starting at pdu->read_offset, and advance read_offset by the number of
+ * bytes actually copied.  Fewer bytes are copied if the vectors run out.
+ */
+static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size)
+{
+       size_t chunk;
+       int idx, consumed = 0;
+       u16 nr_iov = pdu->out_iov_cnt;
+       size_t skip = pdu->read_offset;
+       struct iovec *vec = pdu->out_iov;
+       char *dst = data;
+
+       for (idx = 0; idx < nr_iov && size; idx++) {
+               /* This vector lies entirely before the read position. */
+               if (skip >= vec[idx].iov_len) {
+                       skip -= vec[idx].iov_len;
+                       continue;
+               }
+
+               chunk = MIN(vec[idx].iov_len - skip, size);
+               memcpy(dst, (char *)vec[idx].iov_base + skip, chunk);
+               dst += chunk;
+               size -= chunk;
+               consumed += chunk;
+               /* Following vectors are read from their start. */
+               skip = 0;
+       }
+       pdu->read_offset += consumed;
+}
+
+/*
+ * Copy up to 'size' bytes from 'data' into the pdu's in_iov array,
+ * starting at pdu->write_offset, and advance write_offset by the number of
+ * bytes actually copied.  Fewer bytes are copied if the vectors run out.
+ */
+static void virtio_p9_pdu_write(struct p9_pdu *pdu,
+                               const void *data, size_t size)
+{
+       size_t chunk;
+       int idx, stored = 0;
+       u16 nr_iov = pdu->in_iov_cnt;
+       size_t skip = pdu->write_offset;
+       struct iovec *vec = pdu->in_iov;
+       const char *src = data;
+
+       for (idx = 0; idx < nr_iov && size; idx++) {
+               /* This vector lies entirely before the write position. */
+               if (skip >= vec[idx].iov_len) {
+                       skip -= vec[idx].iov_len;
+                       continue;
+               }
+
+               chunk = MIN(vec[idx].iov_len - skip, size);
+               memcpy((char *)vec[idx].iov_base + skip, src, chunk);
+               src += chunk;
+               size -= chunk;
+               stored += chunk;
+               /* Following vectors are filled from their start. */
+               skip = 0;
+       }
+       pdu->write_offset += stored;
+}
+
+/*
+ * Free the four string members of a p9_wstat (name, uid, gid, muid).
+ * The structure itself is not freed and the pointers are left as they are.
+ */
+static void virtio_p9_wstat_free(struct p9_wstat *stbuf)
+{
+       free(stbuf->name);
+       free(stbuf->uid);
+       free(stbuf->gid);
+       free(stbuf->muid);
+}
+
+/*
+ * Decode fields from the pdu according to 'fmt', storing each through the
+ * matching pointer taken from 'ap'.
+ *
+ * Format characters:
+ *   b  8-bit integer            w  16-bit little-endian integer
+ *   d  32-bit LE integer        q  64-bit LE integer
+ *   s  16-bit length followed by that many bytes; stored as a newly
+ *      malloc()ed, NUL-terminated string that the caller must free
+ *   Q  struct p9_qid            S  struct p9_wstat
+ *   I  struct p9_iattr_dotl
+ *
+ * Returns 0, ENOMEM if a string could not be allocated, or EINVAL for an
+ * unknown format character.
+ * NOTE(review): processing continues after an error, so a later Q/S/I
+ * field can overwrite an earlier error value.
+ */
+static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+       int retval = 0;
+       const char *ptr;
+
+       for (ptr = fmt; *ptr; ptr++) {
+               switch (*ptr) {
+               case 'b':
+               {
+                       int8_t *val = va_arg(ap, int8_t *);
+                       virtio_p9_pdu_read(pdu, val, sizeof(*val));
+               }
+               break;
+               case 'w':
+               {
+                       int16_t le_val;
+                       int16_t *val = va_arg(ap, int16_t *);
+                       virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+                       *val = le16toh(le_val);
+               }
+               break;
+               case 'd':
+               {
+                       int32_t le_val;
+                       int32_t *val = va_arg(ap, int32_t *);
+                       virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+                       *val = le32toh(le_val);
+               }
+               break;
+               case 'q':
+               {
+                       int64_t le_val;
+                       int64_t *val = va_arg(ap, int64_t *);
+                       virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+                       *val = le64toh(le_val);
+               }
+               break;
+               case 's':
+               {
+                       /*
+                        * The length must be unsigned: as int16_t, a value
+                        * of 0x8000 or more turns negative, which corrupts
+                        * the allocation size, the copy size and the index
+                        * of the terminating NUL.
+                        */
+                       uint16_t le_len = 0;
+                       uint16_t len;
+                       char **str = va_arg(ap, char **);
+
+                       virtio_p9_pdu_read(pdu, &le_len, sizeof(le_len));
+                       len = le16toh(le_len);
+                       *str = malloc((size_t)len + 1);
+                       if (*str == NULL) {
+                               retval = ENOMEM;
+                               break;
+                       }
+                       virtio_p9_pdu_read(pdu, *str, len);
+                       (*str)[len] = 0;
+               }
+               break;
+               case 'Q':
+               {
+                       struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+                       retval = virtio_p9_pdu_readf(pdu, "bdq",
+                                                    &qid->type, &qid->version,
+                                                    &qid->path);
+               }
+               break;
+               case 'S':
+               {
+                       struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+                       memset(stbuf, 0, sizeof(struct p9_wstat));
+                       stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1;
+                       retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss",
+                                               &stbuf->size, &stbuf->type,
+                                               &stbuf->dev, &stbuf->qid,
+                                               &stbuf->mode, &stbuf->atime,
+                                               &stbuf->mtime, &stbuf->length,
+                                               &stbuf->name, &stbuf->uid,
+                                               &stbuf->gid, &stbuf->muid);
+                       /* Release whatever strings were allocated before the failure. */
+                       if (retval)
+                               virtio_p9_wstat_free(stbuf);
+               }
+               break;
+               case 'I':
+               {
+                       struct p9_iattr_dotl *p9attr = va_arg(ap,
+                                                      struct p9_iattr_dotl *);
+
+                       retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq",
+                                                    &p9attr->valid,
+                                                    &p9attr->mode,
+                                                    &p9attr->uid,
+                                                    &p9attr->gid,
+                                                    &p9attr->size,
+                                                    &p9attr->atime_sec,
+                                                    &p9attr->atime_nsec,
+                                                    &p9attr->mtime_sec,
+                                                    &p9attr->mtime_nsec);
+               }
+               break;
+               default:
+                       retval = EINVAL;
+                       break;
+               }
+       }
+       return retval;
+}
+
+/*
+ * Encode the arguments in 'ap' into the pdu according to 'fmt'.
+ *
+ * Format characters:
+ *   b  8-bit integer (passed as int)     w  16-bit LE integer (passed as int)
+ *   d  32-bit LE integer                 q  64-bit LE integer
+ *   s  string: 16-bit length followed by the bytes, without a NUL;
+ *      a NULL pointer is written as length 0
+ *   Q  struct p9_qid *                   S  struct p9_wstat *
+ *   A  struct p9_stat_dotl *
+ *
+ * Returns 0, or EINVAL for an unknown format character.
+ * NOTE(review): processing continues after an error, so a later Q/S/A
+ * field can overwrite an earlier error value.
+ */
+static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+       int retval = 0;
+       const char *ptr;
+
+       for (ptr = fmt; *ptr; ptr++) {
+               switch (*ptr) {
+               case 'b':
+               {
+                       int8_t val = va_arg(ap, int);
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 'w':
+               {
+                       int16_t val = htole16(va_arg(ap, int));
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 'd':
+               {
+                       int32_t val = htole32(va_arg(ap, int32_t));
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 'q':
+               {
+                       int64_t val = htole64(va_arg(ap, int64_t));
+                       virtio_p9_pdu_write(pdu, &val, sizeof(val));
+               }
+               break;
+               case 's':
+               {
+                       uint16_t len = 0;
+                       const char *s = va_arg(ap, char *);
+                       /* The length field is 16 bits, so longer strings are truncated. */
+                       if (s)
+                               len = MIN(strlen(s), USHRT_MAX);
+                       virtio_p9_pdu_writef(pdu, "w", len);
+                       virtio_p9_pdu_write(pdu, s, len);
+               }
+               break;
+               case 'Q':
+               {
+                       struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+                       retval = virtio_p9_pdu_writef(pdu, "bdq",
+                                                     qid->type, qid->version,
+                                                     qid->path);
+               }
+               break;
+               case 'S':
+               {
+                       struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+                       retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss",
+                                               stbuf->size, stbuf->type,
+                                               stbuf->dev, &stbuf->qid,
+                                               stbuf->mode, stbuf->atime,
+                                               stbuf->mtime, stbuf->length,
+                                               stbuf->name, stbuf->uid,
+                                               stbuf->gid, stbuf->muid);
+               }
+               break;
+               case 'A':
+               {
+                       struct p9_stat_dotl *stbuf = va_arg(ap,
+                                                     struct p9_stat_dotl *);
+                       retval  = virtio_p9_pdu_writef(pdu,
+                                                      "qQdddqqqqqqqqqqqqqqq",
+                                                      stbuf->st_result_mask,
+                                                      &stbuf->qid,
+                                                      stbuf->st_mode,
+                                                      stbuf->st_uid,
+                                                      stbuf->st_gid,
+                                                      stbuf->st_nlink,
+                                                      stbuf->st_rdev,
+                                                      stbuf->st_size,
+                                                      stbuf->st_blksize,
+                                                      stbuf->st_blocks,
+                                                      stbuf->st_atime_sec,
+                                                      stbuf->st_atime_nsec,
+                                                      stbuf->st_mtime_sec,
+                                                      stbuf->st_mtime_nsec,
+                                                      stbuf->st_ctime_sec,
+                                                      stbuf->st_ctime_nsec,
+                                                      stbuf->st_btime_sec,
+                                                      stbuf->st_btime_nsec,
+                                                      stbuf->st_gen,
+                                                      stbuf->st_data_version);
+               }
+               break;
+               default:
+                       retval = EINVAL;
+                       break;
+               }
+       }
+       return retval;
+}
+
+/*
+ * Variadic front end for virtio_p9_decode(): decodes fields from 'pdu' as
+ * described by 'fmt' into the pointers that follow.  Returns the decoder's
+ * result (0 on success, an errno-style value otherwise).
+ */
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...)
+{
+       int ret;
+       va_list ap;
+
+       va_start(ap, fmt);
+       ret = virtio_p9_decode(pdu, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
+
+/*
+ * Variadic front end for virtio_p9_pdu_encode(): encodes the values that
+ * follow into 'pdu' as described by 'fmt'.  Returns the encoder's result
+ * (0 on success, an errno-style value otherwise).
+ */
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...)
+{
+       int ret;
+       va_list ap;
+
+       va_start(ap, fmt);
+       ret = virtio_p9_pdu_encode(pdu, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c
new file mode 100644 (file)
index 0000000..22a2732
--- /dev/null
@@ -0,0 +1,1263 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/guest_compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/vfs.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_9p.h>
+#include <net/9p/9p.h>
+
+/* File-local list head; presumably links the registered p9 devices (TODO confirm against users). */
+static LIST_HEAD(devs);
+
+/* Warning: Immediately use value returned from this function */
+/*
+ * Build "<root_dir>/<path>" into the caller-supplied buffer 'abs_path' and
+ * return that buffer.
+ * NOTE(review): sprintf() is unbounded here; the caller must provide a
+ * buffer large enough for root_dir, the separator and 'path'.
+ */
+static const char *rel_to_abs(struct p9_dev *p9dev,
+                             const char *path, char *abs_path)
+{
+       sprintf(abs_path, "%s/%s", p9dev->root_dir, path);
+
+       return abs_path;
+}
+
+/*
+ * Fill '*qid' from a stat result: path is the inode number, version is the
+ * modification time, and type carries P9_QTDIR for directories.  All other
+ * bits of the qid are zero.
+ */
+static void stat2qid(struct stat *st, struct p9_qid *qid)
+{
+       memset(qid, 0, sizeof(*qid));
+
+       qid->version    = st->st_mtime;
+       qid->path       = st->st_ino;
+       if (S_ISDIR(st->st_mode))
+               qid->type |= P9_QTDIR;
+}
+
+/*
+ * Release the resources of fid table entry 'fid': close its file
+ * descriptor (only when greater than 0) and its directory stream, then
+ * mark the entry as P9_NOFID.
+ */
+static void close_fid(struct p9_dev *p9dev, u32 fid)
+{
+       struct p9_fid *slot = &p9dev->fids[fid];
+
+       if (slot->fd > 0) {
+               close(slot->fd);
+               slot->fd = -1;
+       }
+
+       if (slot->dir) {
+               closedir(slot->dir);
+               slot->dir = NULL;
+       }
+
+       slot->fid = P9_NOFID;
+}
+
+/*
+ * Write the reply header (size, command, tag) at the start of the pdu.
+ * The command and tag are taken from the request: they are read from just
+ * after its 32-bit size field, and the reply command is request + 1.
+ * Leaves read_offset and write_offset positioned after the fields touched.
+ */
+static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size)
+{
+       u8 cmd;
+       u16 tag;
+
+       /* Skip the request's size field to reach cmd and tag. */
+       pdu->read_offset = sizeof(u32);
+       virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag);
+       pdu->write_offset = 0;
+       /* cmd + 1 is the reply message */
+       virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag);
+}
+
+/*
+ * Limit an iovec array to 'count' bytes in total.
+ *
+ * Walks the vectors until 'count' bytes are covered; the vector that
+ * crosses the limit has its iov_len shortened so the total is exactly
+ * 'count'.  Returns the number of vectors needed.
+ */
+static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt)
+{
+       int used = 0;
+       u32 covered = 0;
+
+       while (used < iov_cnt && covered < count) {
+               struct iovec *vec = &iov[used++];
+
+               if (covered + vec->iov_len > count) {
+                       /* we don't need this iov fully */
+                       vec->iov_len -= ((covered + vec->iov_len) - count);
+                       break;
+               }
+               covered += vec->iov_len;
+       }
+       return used;
+}
+
+/*
+ * Build an error reply in the pdu: the body is the 32-bit 'err' value and
+ * the header carries P9_RLERROR with the tag copied from the request.
+ * '*outlen' is set to the total reply length.
+ */
+static void virtio_p9_error_reply(struct p9_dev *p9dev,
+                                 struct p9_pdu *pdu, int err, u32 *outlen)
+{
+       u16 tag;
+
+       /* Body first, so that its end gives the total length. */
+       pdu->write_offset = VIRTIO_P9_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", err);
+       *outlen = pdu->write_offset;
+
+       /* read the tag from input */
+       pdu->read_offset = sizeof(u32) + sizeof(u8);
+       virtio_p9_pdu_readf(pdu, "w", &tag);
+
+       /* Update the header */
+       pdu->write_offset = 0;
+       virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag);
+}
+
+/*
+ * Handle a version request: echo the client's msize, together with the
+ * client's version string if it equals VIRTIO_P9_VERSION_DOTL, or
+ * "unknown" otherwise.  '*outlen' is set to the reply length.
+ */
+static void virtio_p9_version(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 msize;
+       char *version;
+       virtio_p9_pdu_readf(pdu, "ds", &msize, &version);
+       /*
+        * reply with the same msize the client sent us
+        * Error out if the request is not for 9P2000.L
+        */
+       if (!strcmp(version, VIRTIO_P9_VERSION_DOTL))
+               virtio_p9_pdu_writef(pdu, "ds", msize, version);
+       else
+               virtio_p9_pdu_writef(pdu, "ds", msize, "unknown");
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       /* 'version' was allocated by the "s" decoder. */
+       free(version);
+       return;
+}
+
+/*
+ * Handle a clunk request: release the fid named in the request and send a
+ * reply that carries no body.  '*outlen' is set to the reply length.
+ */
+static void virtio_p9_clunk(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid;
+
+       virtio_p9_pdu_readf(pdu, "d", &fid);
+       close_fid(p9dev, fid);
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+}
+
+/*
+ * FIXME!! Need to map to protocol independent value. Upstream
+ * 9p also have the same BUG
+ */
+static int virtio_p9_openflags(int flags)
+{
+       /* Flags the guest is not allowed to pass through to open(). */
+       const int stripped = O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT;
+
+       /* Always add O_NOFOLLOW to whatever remains. */
+       return (flags & ~stripped) | O_NOFOLLOW;
+}
+
+/*
+ * Handle an open request for an existing fid: stat the fid's abs_path,
+ * then open it with opendir() (directories) or open() (everything else)
+ * and reply with its qid and an iounit of 0.  On failure an error reply
+ * carrying errno is sent instead.
+ */
+static void virtio_p9_open(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid, flags;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *new_fid;
+
+
+       virtio_p9_pdu_readf(pdu, "dd", &fid, &flags);
+       new_fid = &p9dev->fids[fid];
+
+       if (lstat(new_fid->abs_path, &st) < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+
+       if (new_fid->is_dir) {
+               new_fid->dir = opendir(new_fid->abs_path);
+               if (!new_fid->dir)
+                       goto err_out;
+       } else {
+               /* Guest-supplied flags are filtered before reaching open(). */
+               new_fid->fd  = open(new_fid->abs_path,
+                                   virtio_p9_openflags(flags));
+               if (new_fid->fd < 0)
+                       goto err_out;
+       }
+       /* FIXME!! need to send proper iounit  */
+       virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a create request: create and open "<dfid abs_path>/<name>", apply
+ * the requested mode and ownership, and turn the directory fid into a fid
+ * for the new file (its fd and path are updated).  Replies with the new
+ * file's qid and an iounit of 0, or with an error reply carrying errno.
+ */
+static void virtio_p9_create(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int fd, ret;
+       char *name;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *dfid;
+       char full_path[PATH_MAX];
+       u32 dfid_val, flags, mode, gid;
+
+       virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val,
+                           &name, &flags, &mode, &gid);
+       dfid = &p9dev->fids[dfid_val];
+
+       flags = virtio_p9_openflags(flags);
+
+       sprintf(full_path, "%s/%s", dfid->abs_path, name);
+       fd = open(full_path, flags | O_CREAT, mode);
+       if (fd < 0)
+               goto err_out;
+       /* Drop whatever the fid had open before; it now refers to the new file. */
+       close_fid(p9dev, dfid_val);
+       dfid->fd = fd;
+
+       if (lstat(full_path, &st) < 0)
+               goto err_out;
+
+       ret = chmod(full_path, mode & 0777);
+       if (ret < 0)
+               goto err_out;
+
+       ret = lchown(full_path, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       /*
+        * Append "/<name>" at the end of the existing path.  Passing
+        * dfid->path to sprintf() as both destination and "%s" source is
+        * undefined behaviour because the objects overlap.
+        */
+       sprintf(dfid->path + strlen(dfid->path), "/%s", name);
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(name);
+       return;
+err_out:
+       free(name);
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a mkdir request: create "<dfid abs_path>/<name>", apply the
+ * requested permission bits and ownership, and reply with the new
+ * directory's qid followed by a zero word.  Any failing step produces an
+ * error reply carrying errno.
+ */
+static void virtio_p9_mkdir(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       char *name;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *dfid;
+       u32 dfid_val, mode, gid;
+       char full_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val,
+                           &name, &mode, &gid);
+       dfid = &p9dev->fids[dfid_val];
+       sprintf(full_path, "%s/%s", dfid->abs_path, name);
+
+       /* The steps run in order; the first failure stops the chain. */
+       if (mkdir(full_path, mode) < 0 ||
+           lstat(full_path, &st) < 0 ||
+           chmod(full_path, mode & 0777) < 0 ||
+           lchown(full_path, dfid->uid, gid) < 0) {
+               free(name);
+               virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+               return;
+       }
+
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(name);
+       return;
+}
+
+/*
+ * Handle a walk request: starting from fid's path, descend through the
+ * 'nwname' path elements in the request, recording the result in newfid.
+ * With nwname == 0, newfid becomes a copy of fid.
+ *
+ * The reply body is a 16-bit qid count followed by one qid per element
+ * walked.  An element that cannot be lstat()ed yields an error reply
+ * carrying errno.
+ */
+static void virtio_p9_walk(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       /* Same width as nwname, so the loop cannot wrap before reaching it. */
+       u16 i;
+       u16 nwqid;
+       char *str;
+       u16 nwname;
+       struct p9_qid wqid;
+       struct p9_fid *new_fid;
+       u32 fid_val, newfid_val;
+
+
+       virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname);
+       new_fid = &p9dev->fids[newfid_val];
+
+       nwqid = 0;
+       if (nwname) {
+               struct p9_fid *fid = &p9dev->fids[fid_val];
+
+               strcpy(new_fid->path, fid->path);
+               /* skip the space for count */
+               pdu->write_offset += sizeof(u16);
+               for (i = 0; i < nwname; i++) {
+                       struct stat st;
+                       char tmp[PATH_MAX] = {0};
+                       char full_path[PATH_MAX];
+
+                       virtio_p9_pdu_readf(pdu, "s", &str);
+
+                       /* Format the new path we're 'walk'ing into */
+                       sprintf(tmp, "%s/%s", new_fid->path, str);
+                       /*
+                        * The element name has been copied into tmp; free
+                        * the decoder's allocation now so it is not leaked
+                        * on either the success or the error path.
+                        */
+                       free(str);
+                       if (lstat(rel_to_abs(p9dev, tmp, full_path), &st) < 0)
+                               goto err_out;
+
+                       stat2qid(&st, &wqid);
+                       new_fid->is_dir = S_ISDIR(st.st_mode);
+                       strcpy(new_fid->path, tmp);
+                       new_fid->fid = newfid_val;
+                       new_fid->uid = fid->uid;
+                       nwqid++;
+                       virtio_p9_pdu_writef(pdu, "Q", &wqid);
+               }
+       } else {
+               /*
+                * update write_offset so our outlen get correct value
+                */
+               pdu->write_offset += sizeof(u16);
+               new_fid->is_dir = p9dev->fids[fid_val].is_dir;
+               strcpy(new_fid->path, p9dev->fids[fid_val].path);
+               new_fid->fid    = newfid_val;
+               new_fid->uid    = p9dev->fids[fid_val].uid;
+       }
+       *outlen = pdu->write_offset;
+       pdu->write_offset = VIRTIO_P9_HDR_LEN;
+       /*
+        * The slot reserved for the count is sizeof(u16) wide, so it must
+        * be written as "w".  A "d" emits four bytes and overwrites the
+        * first two bytes of the first qid.
+        */
+       virtio_p9_pdu_writef(pdu, "w", nwqid);
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle an attach request: mark every fid table entry as unused, then
+ * bind the requested fid to the export root ("/") with the uid from the
+ * request, and reply with the root directory's qid.  If the root cannot
+ * be lstat()ed, an error reply carrying errno is sent.
+ */
+static void virtio_p9_attach(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int i;
+       char *uname = NULL;
+       char *aname = NULL;
+       struct stat st;
+       struct p9_qid qid;
+       struct p9_fid *fid;
+       u32 fid_val, afid, uid;
+
+       virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid,
+                           &uname, &aname, &uid);
+       /*
+        * Neither name is used, but the "s" decoder allocated both; free
+        * them here so they are not leaked on any path.
+        */
+       free(uname);
+       free(aname);
+
+       /* Reset everything */
+       for (i = 0; i < VIRTIO_P9_MAX_FID; i++)
+               p9dev->fids[i].fid = P9_NOFID;
+
+       if (lstat(p9dev->root_dir, &st) < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+
+       fid = &p9dev->fids[fid_val];
+       fid->fid = fid_val;
+       fid->uid = uid;
+       fid->is_dir = 1;
+       strcpy(fid->path, "/");
+
+       virtio_p9_pdu_writef(pdu, "Q", &qid);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Translate a host 'struct stat' into a p9_stat_dotl.  Fields that are not
+ * copied here stay zero, and st_result_mask advertises P9_STATS_BASIC only.
+ */
+static void virtio_p9_fill_stat(struct p9_dev *p9dev,
+                               struct stat *st, struct p9_stat_dotl *statl)
+{
+       memset(statl, 0, sizeof(*statl));
+
+       /* Currently we only support BASIC fields in stat */
+       statl->st_result_mask   = P9_STATS_BASIC;
+       stat2qid(st, &statl->qid);
+
+       /* Identity and type */
+       statl->st_mode          = st->st_mode;
+       statl->st_uid           = st->st_uid;
+       statl->st_gid           = st->st_gid;
+       statl->st_nlink         = st->st_nlink;
+       statl->st_rdev          = st->st_rdev;
+
+       /* Size information */
+       statl->st_size          = st->st_size;
+       statl->st_blksize       = st->st_blksize;
+       statl->st_blocks        = st->st_blocks;
+
+       /* Timestamps: seconds plus nanoseconds */
+       statl->st_atime_sec     = st->st_atime;
+       statl->st_atime_nsec    = st->st_atim.tv_nsec;
+       statl->st_mtime_sec     = st->st_mtime;
+       statl->st_mtime_nsec    = st->st_mtim.tv_nsec;
+       statl->st_ctime_sec     = st->st_ctime;
+       statl->st_ctime_nsec    = st->st_ctim.tv_nsec;
+}
+
+/*
+ * Handle a Tread request: read up to 'count' bytes at 'offset' from the file
+ * open on the fid, straight into the reply buffers.
+ *
+ * The first reply iovec is temporarily advanced past the 9p header and the
+ * u32 byte count so that preadv() places the data where the payload belongs;
+ * it is restored before the count is written. A failed preadv() or an
+ * out-of-range fid number produces an error reply instead of a data reply.
+ */
+static void virtio_p9_read(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u64 offset;
+       u32 fid_val;
+       u16 iov_cnt;
+       void *iov_base;
+       size_t iov_len;
+       ssize_t res;
+       u32 count, rcount;
+       struct p9_fid *fid;
+
+       virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       iov_base = pdu->in_iov[0].iov_base;
+       iov_len  = pdu->in_iov[0].iov_len;
+       iov_cnt  = pdu->in_iov_cnt;
+       pdu->in_iov[0].iov_base += VIRTIO_P9_HDR_LEN + sizeof(u32);
+       pdu->in_iov[0].iov_len -= VIRTIO_P9_HDR_LEN + sizeof(u32);
+       pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov,
+                                                  count,
+                                                  pdu->in_iov_cnt);
+       /*
+        * Keep the result signed: storing -1 in a u32 would look like a
+        * huge read and be reported to the guest as 'count' valid bytes.
+        */
+       res = preadv(fid->fd, pdu->in_iov,
+                    pdu->in_iov_cnt, offset);
+       /*
+        * Update the iov_base back, so that rest of
+        * pdu_writef works correctly.
+        */
+       pdu->in_iov[0].iov_base = iov_base;
+       pdu->in_iov[0].iov_len  = iov_len;
+       pdu->in_iov_cnt         = iov_cnt;
+
+       if (res < 0)
+               goto err_out;
+       rcount = (size_t)res > count ? count : (u32)res;
+
+       pdu->write_offset = VIRTIO_P9_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", rcount);
+       *outlen = pdu->write_offset + rcount;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Bytes one directory entry occupies in a readdir reply: a fixed part of
+ * qid(13) + offset(8) + type(1) + name_len(2), followed by the name itself.
+ */
+static int virtio_p9_dentry_size(struct dirent *dent)
+{
+       const int fixed_part = 13 + 8 + 1 + 2;
+
+       return fixed_part + strlen(dent->d_name);
+}
+
+/*
+ * Handle a Treaddir request: starting at directory offset 'offset', pack as
+ * many entries as fit in 'count' bytes into the reply.
+ *
+ * Each entry is written as qid, next offset, type and name ("Qqbs"). The
+ * u32 byte count that precedes the entries is written last, once the total
+ * is known. Replies with EINVAL when the fid is out of range or does not
+ * refer to a directory.
+ */
+static void virtio_p9_readdir(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid_val;
+       u32 count, rcount;
+       struct stat st;
+       struct p9_fid *fid;
+       struct dirent *dent;
+       char full_path[PATH_MAX];
+       u64 offset, old_offset;
+
+       rcount = 0;
+       virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       if (!fid->is_dir) {
+               /*
+                * errno is handed to the error reply unchanged and every
+                * other caller passes a positive value, so use EINVAL and
+                * not -EINVAL here.
+                */
+               errno = EINVAL;
+               goto err_out;
+       }
+
+       /* Move the offset specified */
+       seekdir(fid->dir, offset);
+
+       old_offset = offset;
+       /* If reading a dir, fill the buffer with p9_stat entries */
+       dent = readdir(fid->dir);
+
+       /* Skip the space for writing count */
+       pdu->write_offset += sizeof(u32);
+       while (dent) {
+               u32 read;
+               struct p9_qid qid;
+
+               if ((rcount + virtio_p9_dentry_size(dent)) > count) {
+                       /* seek to the previous offset and return */
+                       seekdir(fid->dir, old_offset);
+                       break;
+               }
+               old_offset = dent->d_off;
+               /*
+                * Do not build a qid from uninitialised stack data when the
+                * entry cannot be stat'ed; fall back to an all-zero stat.
+                * NOTE(review): rel_to_abs() is given only the entry name,
+                * not the directory of this fid - confirm it resolves
+                * entries of sub-directories correctly.
+                */
+               if (lstat(rel_to_abs(p9dev, dent->d_name, full_path), &st) < 0)
+                       memset(&st, 0, sizeof(st));
+               stat2qid(&st, &qid);
+               read = pdu->write_offset;
+               virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off,
+                                    dent->d_type, dent->d_name);
+               rcount += pdu->write_offset - read;
+               dent = readdir(fid->dir);
+       }
+
+       /* Go back and fill in the byte count skipped above */
+       pdu->write_offset = VIRTIO_P9_HDR_LEN;
+       virtio_p9_pdu_writef(pdu, "d", rcount);
+       *outlen = pdu->write_offset + rcount;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+
+/*
+ * Handle a Tgetattr request: lstat() the path behind the fid and reply with
+ * the result in p9_stat_dotl form.
+ *
+ * The request mask is decoded but not consulted; the reply always carries
+ * what virtio_p9_fill_stat() provides. Replies with an error when the fid
+ * number is out of range or lstat() fails.
+ */
+static void virtio_p9_getattr(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       u32 fid_val;
+       struct stat st;
+       u64 request_mask;
+       struct p9_fid *fid;
+       struct p9_stat_dotl statl;
+
+       virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+       if (lstat(fid->abs_path, &st) < 0)
+               goto err_out;
+
+       virtio_p9_fill_stat(p9dev, &st, &statl);
+       virtio_p9_pdu_writef(pdu, "A", &statl);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/* FIXME!! from linux/fs.h */
+/*
+ * Attribute flags.  These should be or-ed together to figure out what
+ * has been changed!
+ *
+ * virtio_p9_setattr() tests these bits against the 'valid' field of the
+ * decoded setattr request.
+ */
+#define ATTR_MODE      (1 << 0)
+#define ATTR_UID       (1 << 1)
+#define ATTR_GID       (1 << 2)
+#define ATTR_SIZE      (1 << 3)
+#define ATTR_ATIME     (1 << 4)
+#define ATTR_MTIME     (1 << 5)
+#define ATTR_CTIME     (1 << 6)
+#define ATTR_ATIME_SET (1 << 7)
+#define ATTR_MTIME_SET (1 << 8)
+#define ATTR_FORCE     (1 << 9) /* Not a change, but a change it */
+#define ATTR_ATTR_FLAG (1 << 10)
+#define ATTR_KILL_SUID (1 << 11)
+#define ATTR_KILL_SGID (1 << 12)
+#define ATTR_FILE      (1 << 13)
+#define ATTR_KILL_PRIV (1 << 14)
+#define ATTR_OPEN      (1 << 15) /* Truncating from open(O_TRUNC) */
+#define ATTR_TIMES_SET (1 << 16)
+
+/* 127 == 0x7f: bits 0..6, i.e. ATTR_MODE through ATTR_CTIME */
+#define ATTR_MASK    127
+
+/*
+ * Handle a Tsetattr request: apply the attributes flagged in p9attr.valid
+ * to the path behind the fid.
+ *
+ * The changes are applied in this order and the first failure aborts the
+ * request with an error reply: mode (chmod), access/modification times
+ * (utimensat), owner/group (lchown), size (truncate). On success the reply
+ * consists of the header only.
+ */
+static void virtio_p9_setattr(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret = 0;
+       u32 fid_val;
+       struct p9_fid *fid;
+       struct p9_iattr_dotl p9attr;
+
+       virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       if (p9attr.valid & ATTR_MODE) {
+               ret = chmod(fid->abs_path, p9attr.mode);
+               if (ret < 0)
+                       goto err_out;
+       }
+       if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) {
+               /*
+                * times[0] is the access time, times[1] the modification
+                * time. tv_sec is left unset next to UTIME_NOW/UTIME_OMIT
+                * because utimensat() ignores it for those two values.
+                */
+               struct timespec times[2];
+               if (p9attr.valid & ATTR_ATIME) {
+                       if (p9attr.valid & ATTR_ATIME_SET) {
+                               times[0].tv_sec = p9attr.atime_sec;
+                               times[0].tv_nsec = p9attr.atime_nsec;
+                       } else {
+                               times[0].tv_nsec = UTIME_NOW;
+                       }
+               } else {
+                       times[0].tv_nsec = UTIME_OMIT;
+               }
+               if (p9attr.valid & ATTR_MTIME) {
+                       if (p9attr.valid & ATTR_MTIME_SET) {
+                               times[1].tv_sec = p9attr.mtime_sec;
+                               times[1].tv_nsec = p9attr.mtime_nsec;
+                       } else {
+                               times[1].tv_nsec = UTIME_NOW;
+                       }
+               } else
+                       times[1].tv_nsec = UTIME_OMIT;
+
+               /*
+                * AT_FDCWD rather than -1: the directory fd is only ignored
+                * for absolute paths, so -1 would fail with EBADF whenever
+                * the export root was given as a relative path.
+                */
+               ret = utimensat(AT_FDCWD, fid->abs_path, times,
+                               AT_SYMLINK_NOFOLLOW);
+               if (ret < 0)
+                       goto err_out;
+       }
+       /*
+        * If the only valid entry in iattr is ctime we can call
+        * chown(-1,-1) to update the ctime of the file
+        */
+       if ((p9attr.valid & (ATTR_UID | ATTR_GID)) ||
+           ((p9attr.valid & ATTR_CTIME)
+            && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) {
+               if (!(p9attr.valid & ATTR_UID))
+                       p9attr.uid = -1;
+
+               if (!(p9attr.valid & ATTR_GID))
+                       p9attr.gid = -1;
+
+               ret = lchown(fid->abs_path, p9attr.uid, p9attr.gid);
+               if (ret < 0)
+                       goto err_out;
+       }
+       if (p9attr.valid & (ATTR_SIZE)) {
+               ret = truncate(fid->abs_path, p9attr.size);
+               if (ret < 0)
+                       goto err_out;
+       }
+       *outlen = VIRTIO_P9_HDR_LEN;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a Twrite request: write 'count' bytes from the request buffers to
+ * the file open on the fid, at 'offset'.
+ *
+ * The first request iovec is temporarily advanced past the message header
+ * and the fid/offset/count fields so that pwritev() sees only the payload;
+ * it is restored afterwards. Replies with the number of bytes written, or
+ * with an error when the fid is out of range or pwritev() fails.
+ */
+static void virtio_p9_write(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+
+       u64 offset;
+       u32 fid_val;
+       u32 count;
+       ssize_t res;
+       u16 iov_cnt;
+       void *iov_base;
+       size_t iov_len;
+       struct p9_fid *fid;
+       /* u32 fid + u64 offset + u32 count */
+       int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32);
+
+       virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       iov_base = pdu->out_iov[0].iov_base;
+       iov_len  = pdu->out_iov[0].iov_len;
+       iov_cnt  = pdu->out_iov_cnt;
+
+       /* Adjust the iovec to skip the header and meta data */
+       pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size);
+       pdu->out_iov[0].iov_len -=  (sizeof(struct p9_msg) + twrite_size);
+       pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count,
+                                                   pdu->out_iov_cnt);
+       res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset);
+       /*
+        * Update the iov_base back, so that rest of
+        * pdu_readf works correctly.
+        */
+       pdu->out_iov[0].iov_base = iov_base;
+       pdu->out_iov[0].iov_len  = iov_len;
+       pdu->out_iov_cnt         = iov_cnt;
+
+       if (res < 0)
+               goto err_out;
+       /*
+        * "d" is used with u32 values everywhere else in this file; pass
+        * the count with that width instead of as a ssize_t vararg.
+        */
+       virtio_p9_pdu_writef(pdu, "d", (u32)res);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a Treadlink request: reply with the target of the symbolic link
+ * behind the fid.
+ *
+ * Replies with an error when the fid number is out of range or readlink()
+ * fails.
+ */
+static void virtio_p9_readlink(struct p9_dev *p9dev,
+                              struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       u32 fid_val;
+       struct p9_fid *fid;
+       char target_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "d", &fid_val);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       /*
+        * readlink() does not NUL-terminate: zero the buffer and leave the
+        * last byte untouched so the result is always a valid string.
+        */
+       memset(target_path, 0, PATH_MAX);
+       ret = readlink(fid->abs_path, target_path, PATH_MAX - 1);
+       if (ret < 0)
+               goto err_out;
+
+       virtio_p9_pdu_writef(pdu, "s", target_path);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a Tstatfs request: reply with the statfs() data of the file system
+ * holding the path behind the fid.
+ *
+ * Replies with an error when the fid number is out of range or statfs()
+ * fails.
+ */
+static void virtio_p9_statfs(struct p9_dev *p9dev,
+                            struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       u64 fsid;
+       u32 fid_val;
+       struct p9_fid *fid;
+       struct statfs stat_buf;
+
+       virtio_p9_pdu_readf(pdu, "d", &fid_val);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       ret = statfs(fid->abs_path, &stat_buf);
+       if (ret < 0)
+               goto err_out;
+       /* FIXME!! f_blocks needs update based on client msize */
+       /* Fold the two words of the file system id into one 64-bit value */
+       fsid = (unsigned int) stat_buf.f_fsid.__val[0] |
+               (unsigned long long)stat_buf.f_fsid.__val[1] << 32;
+       /*
+        * The statfs fields have host-dependent widths. Cast each one to
+        * the width its format letter is used with elsewhere in this file
+        * ("d": u32, "q": u64) so the variadic call does not depend on the
+        * host ABI.
+        */
+       virtio_p9_pdu_writef(pdu, "ddqqqqqqd", (u32)stat_buf.f_type,
+                            (u32)stat_buf.f_bsize, (u64)stat_buf.f_blocks,
+                            (u64)stat_buf.f_bfree, (u64)stat_buf.f_bavail,
+                            (u64)stat_buf.f_files, (u64)stat_buf.f_ffree,
+                            fsid, (u32)stat_buf.f_namelen);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a Tmknod request: create the node 'name' inside the directory
+ * behind the fid, then set its permission bits and owner.
+ *
+ * The owner is the uid recorded on the directory fid together with the gid
+ * from the request. Replies with the qid of the new node, or with an error
+ * when the fid is out of range, the resulting path does not fit in PATH_MAX,
+ * or one of the system calls fails.
+ */
+static void virtio_p9_mknod(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name = NULL;
+       struct stat st;
+       struct p9_fid *dfid;
+       struct p9_qid qid;
+       char full_path[PATH_MAX];
+       u32 fid_val, mode, major, minor, gid;
+
+       virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode,
+                           &major, &minor, &gid);
+
+       /* Guest supplied values: refuse a missing name or a bad fid index */
+       if (!name || fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       dfid = &p9dev->fids[fid_val];
+       /* The name length is chosen by the guest: bound the copy */
+       ret = snprintf(full_path, sizeof(full_path), "%s/%s",
+                      dfid->abs_path, name);
+       if (ret < 0 || (size_t)ret >= sizeof(full_path)) {
+               errno = ENAMETOOLONG;
+               goto err_out;
+       }
+       ret = mknod(full_path, mode, makedev(major, minor));
+       if (ret < 0)
+               goto err_out;
+
+       if (lstat(full_path, &st) < 0)
+               goto err_out;
+
+       ret = chmod(full_path, mode & 0777);
+       if (ret < 0)
+               goto err_out;
+
+       ret = lchown(full_path, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Q", &qid);
+       free(name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       /* Build the reply first so errno is read before free() can touch it */
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       free(name);
+       return;
+}
+
+/*
+ * Handle a Tfsync request: flush the file open on the fid, with
+ * fdatasync() when 'datasync' is non-zero and fsync() otherwise.
+ *
+ * Replies with an error when the fid number is out of range or the flush
+ * fails.
+ */
+static void virtio_p9_fsync(struct p9_dev *p9dev,
+                           struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       struct p9_fid *fid;
+       u32 fid_val, datasync;
+
+       virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync);
+       /* The fid number comes from the guest: never index past the table */
+       if (fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       if (datasync)
+               ret = fdatasync(fid->fd);
+       else
+               ret = fsync(fid->fd);
+       if (ret < 0)
+               goto err_out;
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       return;
+}
+
+/*
+ * Handle a Tsymlink request: create the symbolic link 'name' inside the
+ * directory behind the fid, pointing at 'old_path', then set its owner.
+ *
+ * The owner is the uid recorded on the directory fid together with the gid
+ * from the request. Replies with the qid of the new link, or with an error
+ * when the fid is out of range, the resulting path does not fit in PATH_MAX,
+ * or one of the system calls fails.
+ */
+static void virtio_p9_symlink(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       struct stat st;
+       u32 fid_val, gid;
+       struct p9_qid qid;
+       struct p9_fid *dfid;
+       char new_name[PATH_MAX];
+       char *old_path = NULL, *name = NULL;
+
+       virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid);
+
+       /* Guest supplied values: refuse missing strings or a bad fid index */
+       if (!name || !old_path || fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       dfid = &p9dev->fids[fid_val];
+       /* The name length is chosen by the guest: bound the copy */
+       ret = snprintf(new_name, sizeof(new_name), "%s/%s",
+                      dfid->abs_path, name);
+       if (ret < 0 || (size_t)ret >= sizeof(new_name)) {
+               errno = ENAMETOOLONG;
+               goto err_out;
+       }
+       ret = symlink(old_path, new_name);
+       if (ret < 0)
+               goto err_out;
+
+       if (lstat(new_name, &st) < 0)
+               goto err_out;
+
+       ret = lchown(new_name, dfid->uid, gid);
+       if (ret < 0)
+               goto err_out;
+
+       stat2qid(&st, &qid);
+       virtio_p9_pdu_writef(pdu, "Q", &qid);
+       free(name);
+       free(old_path);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       /* Build the reply first so errno is read before free() can touch it */
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       free(name);
+       free(old_path);
+       return;
+}
+
+/*
+ * Handle a Tlink request: create the hard link 'name' inside the directory
+ * behind dfid, referring to the file behind fid.
+ *
+ * Replies with an error when either fid is out of range, the resulting path
+ * does not fit in PATH_MAX, or link() fails.
+ */
+static void virtio_p9_link(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name = NULL;
+       u32 fid_val, dfid_val;
+       struct p9_fid *dfid, *fid;
+       char full_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name);
+
+       /* Guest supplied values: refuse a missing name or a bad fid index */
+       if (!name || dfid_val >= VIRTIO_P9_MAX_FID ||
+           fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       dfid = &p9dev->fids[dfid_val];
+       fid =  &p9dev->fids[fid_val];
+       /* The name length is chosen by the guest: bound the copy */
+       ret = snprintf(full_path, sizeof(full_path), "%s/%s",
+                      dfid->abs_path, name);
+       if (ret < 0 || (size_t)ret >= sizeof(full_path)) {
+               errno = ENAMETOOLONG;
+               goto err_out;
+       }
+       ret = link(fid->abs_path, full_path);
+       if (ret < 0)
+               goto err_out;
+       free(name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       /* Build the reply first so errno is read before free() can touch it */
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       free(name);
+       return;
+
+}
+
+/*
+ * Handle a Tlock request. No locking is performed: the request is decoded
+ * and every lock is reported as granted.
+ */
+static void virtio_p9_lock(struct p9_dev *p9dev,
+                          struct p9_pdu *pdu, u32 *outlen)
+{
+       u8 ret;
+       u32 fid_val;
+       struct p9_flock flock;
+
+       /* "s" allocates; make the free() below safe whatever gets decoded */
+       flock.client_id = NULL;
+       virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type,
+                           &flock.flags, &flock.start, &flock.length,
+                           &flock.proc_id, &flock.client_id);
+
+       /* Just return success */
+       ret = P9_LOCK_SUCCESS;
+       /*
+        * The lock status is a single byte on the wire (status[1] in
+        * 9P2000.L), which is also why 'ret' is a u8: encode it with "b",
+        * not as a 4-byte "d".
+        */
+       virtio_p9_pdu_writef(pdu, "b", ret);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       free(flock.client_id);
+       return;
+}
+
+/*
+ * Handle a Tgetlock request. No lock state is kept: the description sent by
+ * the guest is echoed back with its type replaced by F_UNLCK.
+ */
+static void virtio_p9_getlock(struct p9_dev *p9dev,
+                             struct p9_pdu *pdu, u32 *outlen)
+{
+       struct p9_getlock lk;
+       u32 fid_val;
+
+       virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &lk.type, &lk.start,
+                           &lk.length, &lk.proc_id, &lk.client_id);
+
+       /* Always answer that the queried range is not locked */
+       lk.type = F_UNLCK;
+       virtio_p9_pdu_writef(pdu, "bqqds", lk.type, lk.start, lk.length,
+                            lk.proc_id, lk.client_id);
+
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+
+       /* client_id was allocated while decoding the "s" field */
+       free(lk.client_id);
+}
+
+/*
+ * Return 1 when 'ancestor' is 'path' itself or a leading run of whole path
+ * components of it, and 0 otherwise. A plain string prefix that stops in the
+ * middle of a component name does not count.
+ */
+static int virtio_p9_ancestor(char *path, char *ancestor)
+{
+       int prefix_len = strlen(ancestor);
+       char next;
+
+       if (strncmp(path, ancestor, prefix_len) != 0)
+               return 0;
+
+       /* the prefix must end exactly on a component boundary */
+       next = path[prefix_len];
+       return next == '\0' || next == '/';
+}
+
+/*
+ * Rewrite 'fid_path' in place, replacing its first strlen(old_name) bytes
+ * with 'new_name' and keeping whatever follows.
+ *
+ * The caller must have checked that old_name is a prefix of fid_path and
+ * that the buffer behind fid_path can hold the result.
+ */
+static void virtio_p9_fix_path(char *fid_path, char *old_name, char *new_name)
+{
+       size_t old_len = strlen(old_name);
+       char tail[PATH_MAX];
+
+       /* the whole path is the old name: nothing to carry over */
+       if (strlen(fid_path) == old_len) {
+               strcpy(fid_path, new_name);
+               return;
+       }
+
+       /* copy the remainder aside first, it is about to be overwritten */
+       strcpy(tail, fid_path + old_len);
+       strcpy(fid_path, new_name);
+       strcat(fid_path, tail);
+}
+
+/*
+ * Handle a Trenameat request: rename 'old_name' in the directory behind the
+ * old dfid to 'new_name' in the directory behind the new dfid, then update
+ * the stored path of every active fid that the rename affects.
+ *
+ * Replies with an error when a fid is out of range, a resulting path does
+ * not fit in PATH_MAX, or rename() fails.
+ */
+static void virtio_p9_renameat(struct p9_dev *p9dev,
+                              struct p9_pdu *pdu, u32 *outlen)
+{
+       int i, ret;
+       char *old_name = NULL, *new_name = NULL;
+       u32 old_dfid_val, new_dfid_val;
+       struct p9_fid *old_dfid, *new_dfid;
+       char old_full_path[PATH_MAX], new_full_path[PATH_MAX];
+
+
+       virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name,
+                           &new_dfid_val, &new_name);
+
+       /* Guest supplied values: refuse missing names or a bad fid index */
+       if (!old_name || !new_name ||
+           old_dfid_val >= VIRTIO_P9_MAX_FID ||
+           new_dfid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       old_dfid = &p9dev->fids[old_dfid_val];
+       new_dfid = &p9dev->fids[new_dfid_val];
+
+       /* The name lengths are chosen by the guest: bound both copies */
+       ret = snprintf(old_full_path, sizeof(old_full_path), "%s/%s",
+                      old_dfid->abs_path, old_name);
+       if (ret < 0 || (size_t)ret >= sizeof(old_full_path)) {
+               errno = ENAMETOOLONG;
+               goto err_out;
+       }
+       ret = snprintf(new_full_path, sizeof(new_full_path), "%s/%s",
+                      new_dfid->abs_path, new_name);
+       if (ret < 0 || (size_t)ret >= sizeof(new_full_path)) {
+               errno = ENAMETOOLONG;
+               goto err_out;
+       }
+       ret = rename(old_full_path, new_full_path);
+       if (ret < 0)
+               goto err_out;
+       /*
+        * Now fix path in other fids, if the renamed path is part of
+        * that.
+        *
+        * NOTE(review): the comparison uses the bare request names, not
+        * names prefixed with the dfid paths - confirm this matches how
+        * fid paths are stored.
+        */
+       for (i = 0; i < VIRTIO_P9_MAX_FID; i++) {
+               if (p9dev->fids[i].fid != P9_NOFID &&
+                   virtio_p9_ancestor(p9dev->fids[i].path, old_name)) {
+                       virtio_p9_fix_path(p9dev->fids[i].path, old_name,
+                                          new_name);
+               }
+       }
+       free(old_name);
+       free(new_name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       /* Build the reply first so errno is read before free() can touch it */
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       free(old_name);
+       free(new_name);
+       return;
+}
+
+/*
+ * Handle a Tunlinkat request: remove the entry 'name' from the directory
+ * behind the fid. The flags field of the request is decoded but not used.
+ *
+ * Replies with an error when the fid is out of range, the resulting path
+ * does not fit in PATH_MAX, or remove() fails.
+ */
+static void virtio_p9_unlinkat(struct p9_dev *p9dev,
+                              struct p9_pdu *pdu, u32 *outlen)
+{
+       int ret;
+       char *name = NULL;
+       u32 fid_val, flags;
+       struct p9_fid *fid;
+       char full_path[PATH_MAX];
+
+       virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags);
+       /* Guest supplied values: refuse a missing name or a bad fid index */
+       if (!name || fid_val >= VIRTIO_P9_MAX_FID) {
+               errno = EINVAL;
+               goto err_out;
+       }
+       fid = &p9dev->fids[fid_val];
+
+       /* The name length is chosen by the guest: bound the copy */
+       ret = snprintf(full_path, sizeof(full_path), "%s/%s",
+                      fid->abs_path, name);
+       if (ret < 0 || (size_t)ret >= sizeof(full_path)) {
+               errno = ENAMETOOLONG;
+               goto err_out;
+       }
+       ret = remove(full_path);
+       if (ret < 0)
+               goto err_out;
+       free(name);
+       *outlen = pdu->write_offset;
+       virtio_p9_set_reply_header(pdu, *outlen);
+       return;
+err_out:
+       /* Build the reply first so errno is read before free() can touch it */
+       virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+       free(name);
+       return;
+}
+
+/*
+ * Handler for requests that are not implemented: replies with EOPNOTSUPP.
+ */
+static void virtio_p9_eopnotsupp(struct p9_dev *p9dev,
+                                struct p9_pdu *pdu, u32 *outlen)
+{
+       /* A void function must not 'return' an expression in ISO C */
+       virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen);
+}
+
+/*
+ * Signature shared by all request handlers: decode the request from 'pdu',
+ * write the reply into it and store the reply length in '*outlen'.
+ */
+typedef void p9_handler(struct p9_dev *p9dev,
+                       struct p9_pdu *pdu, u32 *outlen);
+
+/* FIXME should be removed when merging with latest linus tree */
+#define P9_TRENAMEAT 74
+#define P9_TUNLINKAT 76
+
+/*
+ * Dispatch table indexed by the 9p command byte. Slots that are not listed
+ * stay NULL; virtio_p9_do_io_request() answers those, and any command
+ * beyond the end of the table, with virtio_p9_eopnotsupp.
+ */
+static p9_handler *virtio_9p_dotl_handler [] = {
+       [P9_TREADDIR]     = virtio_p9_readdir,
+       [P9_TSTATFS]      = virtio_p9_statfs,
+       [P9_TGETATTR]     = virtio_p9_getattr,
+       [P9_TSETATTR]     = virtio_p9_setattr,
+       [P9_TXATTRWALK]   = virtio_p9_eopnotsupp,
+       [P9_TXATTRCREATE] = virtio_p9_eopnotsupp,
+       [P9_TMKNOD]       = virtio_p9_mknod,
+       [P9_TLOCK]        = virtio_p9_lock,
+       [P9_TGETLOCK]     = virtio_p9_getlock,
+       [P9_TRENAMEAT]    = virtio_p9_renameat,
+       [P9_TREADLINK]    = virtio_p9_readlink,
+       [P9_TUNLINKAT]    = virtio_p9_unlinkat,
+       [P9_TMKDIR]       = virtio_p9_mkdir,
+       [P9_TVERSION]     = virtio_p9_version,
+       [P9_TLOPEN]       = virtio_p9_open,
+       [P9_TATTACH]      = virtio_p9_attach,
+       [P9_TWALK]        = virtio_p9_walk,
+       [P9_TCLUNK]       = virtio_p9_clunk,
+       [P9_TFSYNC]       = virtio_p9_fsync,
+       [P9_TREAD]        = virtio_p9_read,
+       [P9_TFLUSH]       = virtio_p9_eopnotsupp,
+       [P9_TLINK]        = virtio_p9_link,
+       [P9_TSYMLINK]     = virtio_p9_symlink,
+       [P9_TLCREATE]     = virtio_p9_create,
+       [P9_TWRITE]       = virtio_p9_write,
+};
+
+/*
+ * Allocate a zeroed pdu and fill it from the next buffer chain of 'vq'.
+ *
+ * Returns the new pdu, which the caller must free(), or NULL when the
+ * allocation fails.
+ */
+static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq)
+{
+       struct p9_pdu *new_pdu;
+
+       new_pdu = calloc(1, sizeof(*new_pdu));
+       if (new_pdu == NULL)
+               return NULL;
+
+       /* both cursors start right behind the p9_msg header */
+       new_pdu->write_offset = VIRTIO_P9_HDR_LEN;
+       new_pdu->read_offset  = VIRTIO_P9_HDR_LEN;
+
+       new_pdu->queue_head = virt_queue__get_inout_iov(kvm, vq,
+                                                       new_pdu->in_iov,
+                                                       new_pdu->out_iov,
+                                                       &new_pdu->in_iov_cnt,
+                                                       &new_pdu->out_iov_cnt);
+       return new_pdu;
+}
+
+/*
+ * Return the command byte of the request held in 'pdu'.
+ */
+static u8 virtio_p9_get_cmd(struct p9_pdu *pdu)
+{
+       /*
+        * The command is a single byte, so it can be read straight from
+        * the request buffer without any endianness conversion.
+        */
+       const struct p9_msg *hdr = pdu->out_iov[0].iov_base;
+
+       return hdr->cmd;
+}
+
+/*
+ * Process one request from the queue of 'job': build a pdu from the next
+ * buffer chain, dispatch it on its command byte and hand the buffers back
+ * to the guest with the reply length.
+ *
+ * Commands without a handler are answered with virtio_p9_eopnotsupp.
+ * Returns true when a request was processed and false when no pdu could be
+ * allocated, in which case nothing is handed back to the guest.
+ */
+static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job)
+{
+       u8 cmd;
+       u32 len = 0;
+       p9_handler *handler;
+       struct p9_dev *p9dev;
+       struct virt_queue *vq;
+       struct p9_pdu *p9pdu;
+
+       vq = job->vq;
+       p9dev = job->p9dev;
+
+       p9pdu = virtio_p9_pdu_init(kvm, vq);
+       /* virtio_p9_pdu_init() returns NULL when out of memory */
+       if (!p9pdu)
+               return false;
+       cmd = virtio_p9_get_cmd(p9pdu);
+
+       if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) ||
+           !virtio_9p_dotl_handler[cmd])
+               handler = virtio_p9_eopnotsupp;
+       else
+               handler = virtio_9p_dotl_handler[cmd];
+
+       handler(p9dev, p9pdu, &len);
+       virt_queue__set_used_elem(vq, p9pdu->queue_head, len);
+       free(p9pdu);
+       return true;
+}
+
+/*
+ * Thread pool callback: drain the queue of the job passed in 'param',
+ * signalling the guest after every processed request.
+ */
+static void virtio_p9_do_io(struct kvm *kvm, void *param)
+{
+       struct p9_dev_job *job = (struct p9_dev_job *)param;
+       struct p9_dev *p9dev   = job->p9dev;
+       struct virt_queue *vq  = job->vq;
+
+       while (virt_queue__available(vq)) {
+               /*
+                * Stop instead of spinning when a request could not be
+                * processed, and do not signal for it either.
+                */
+               if (!virtio_p9_do_io_request(kvm, job))
+                       break;
+               virtio_pci__signal_vq(kvm, &p9dev->vpci, vq - p9dev->vqs);
+       }
+}
+
+/*
+ * virtio-pci callback: store one byte at 'offset' in the device config.
+ * NOTE(review): 'offset' is not range checked here - confirm the caller
+ * bounds it.
+ */
+static void set_config(struct kvm *kvm, void *dev, u8 data, u32 offset)
+{
+       struct p9_dev *p9dev = dev;
+       u8 *cfg_bytes = (u8 *)(p9dev->config);
+
+       cfg_bytes[offset] = data;
+}
+
+/*
+ * virtio-pci callback: fetch the byte at 'offset' in the device config.
+ * NOTE(review): 'offset' is not range checked here - confirm the caller
+ * bounds it.
+ */
+static u8 get_config(struct kvm *kvm, void *dev, u32 offset)
+{
+       struct p9_dev *p9dev = dev;
+       u8 *cfg_bytes = (u8 *)(p9dev->config);
+
+       return cfg_bytes[offset];
+}
+
+/*
+ * virtio-pci callback: report the feature bits offered by the device.
+ * Only the VIRTIO_9P_MOUNT_TAG bit is set.
+ */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       return 1 << VIRTIO_9P_MOUNT_TAG;
+}
+
+/*
+ * virtio-pci callback: remember the feature bits passed in by the guest.
+ */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       ((struct p9_dev *)dev)->features = features;
+}
+
+/*
+ * virtio-pci callback: set up queue number 'vq' on the ring located at
+ * guest page frame 'pfn' and prepare the thread pool job that services it.
+ * Always returns 0.
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct p9_dev *p9dev = dev;
+       struct p9_dev_job *job;
+       struct virt_queue *queue;
+       void *p;
+
+       /* drop the message registered under compat_id for this device */
+       compat__remove_message(p9dev->compat_id);
+
+       queue                   = &p9dev->vqs[vq];
+       queue->pfn              = pfn;
+       p                       = guest_pfn_to_host(kvm, queue->pfn);
+       job                     = &p9dev->jobs[vq];
+
+       vring_init(&queue->vring, VIRTQUEUE_NUM, p, VIRTIO_PCI_VRING_ALIGN);
+
+       /* tie the job to its queue and device before registering it */
+       *job                    = (struct p9_dev_job) {
+               .vq                     = queue,
+               .p9dev                  = p9dev,
+       };
+       thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job);
+
+       return 0;
+}
+
+/*
+ * virtio-pci callback: schedule the thread pool job of queue 'vq'.
+ * Always returns 0.
+ */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct p9_dev *p9dev = dev;
+       struct p9_dev_job *job = &p9dev->jobs[vq];
+
+       thread_pool__do_job(&job->job_id);
+       return 0;
+}
+
+/*
+ * virtio-pci callback: return the page frame number stored for queue 'vq'.
+ */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct p9_dev *p9dev = dev;
+       struct virt_queue *queue = &p9dev->vqs[vq];
+
+       return queue->pfn;
+}
+
+/*
+ * virtio-pci callback: every queue has the same size, VIRTQUEUE_NUM, which
+ * is also the size init_vq() passes to vring_init().
+ */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       return VIRTQUEUE_NUM;
+}
+
+/*
+ * Initialise the virtio-pci side of every device on the 'devs' list (the
+ * list virtio_9p__register() adds to) and install the callbacks defined in
+ * this file. Always returns 0.
+ */
+int virtio_9p__init(struct kvm *kvm)
+{
+       struct p9_dev *p9dev;
+
+       list_for_each_entry(p9dev, &devs, list) {
+               virtio_pci__init(kvm, &p9dev->vpci, p9dev, PCI_DEVICE_ID_VIRTIO_P9, VIRTIO_ID_9P);
+               /* the callbacks are filled in after virtio_pci__init() */
+               p9dev->vpci.ops = (struct virtio_pci_ops) {
+                       .set_config             = set_config,
+                       .get_config             = get_config,
+                       .get_host_features      = get_host_features,
+                       .set_guest_features     = set_guest_features,
+                       .init_vq                = init_vq,
+                       .notify_vq              = notify_vq,
+                       .get_pfn_vq             = get_pfn_vq,
+                       .get_size_vq            = get_size_vq,
+               };
+       }
+
+       return 0;
+}
+
+/*
+ * Create a 9p device exporting the host directory 'root' under the mount
+ * tag 'tag_name' and add it to the 'devs' list.
+ *
+ * @root:     host directory to export
+ * @tag_name: mount tag; NULL selects VIRTIO_P9_DEFAULT_TAG
+ *
+ * Returns 0 on success, -EINVAL when the tag is longer than MAX_TAG_LEN,
+ * -ENAMETOOLONG when 'root' does not fit the path buffers of the device,
+ * and -ENOMEM when an allocation fails.
+ */
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name)
+{
+       struct p9_dev *p9dev;
+       u32 i;
+       size_t root_len, tag_len;
+       int err = 0;
+
+       if (!tag_name)
+               tag_name = VIRTIO_P9_DEFAULT_TAG;
+
+       /* Validate the arguments before allocating or copying anything */
+       tag_len = strlen(tag_name);
+       if (tag_len > MAX_TAG_LEN)
+               return -EINVAL;
+
+       p9dev = calloc(1, sizeof(*p9dev));
+       if (!p9dev)
+               return -ENOMEM;
+
+       /*
+        * 'root' is copied with strcpy() into root_dir and into the
+        * abs_path of every fid; make sure it fits, terminator included.
+        */
+       root_len = strlen(root);
+       if (root_len >= sizeof(p9dev->root_dir) ||
+           root_len >= sizeof(p9dev->fids[0].abs_path)) {
+               err = -ENAMETOOLONG;
+               goto free_p9dev;
+       }
+
+       p9dev->config = calloc(1, sizeof(*p9dev->config) + tag_len + 1);
+       if (p9dev->config == NULL) {
+               err = -ENOMEM;
+               goto free_p9dev;
+       }
+
+       strcpy(p9dev->root_dir, root);
+       /*
+        * We prefix the full path in all fids, This allows us to get the
+        * absolute path of an fid without playing with strings.
+        */
+       for (i = 0; i < VIRTIO_P9_MAX_FID; i++) {
+               strcpy(p9dev->fids[i].abs_path, root);
+               p9dev->fids[i].path = p9dev->fids[i].abs_path + root_len;
+       }
+       p9dev->config->tag_len = tag_len;
+
+       memcpy(&p9dev->config->tag, tag_name, tag_len);
+
+       list_add(&p9dev->list, &devs);
+
+       p9dev->compat_id = compat__add_message("virtio-9p device was not detected",
+                                               "While you have requested a virtio-9p device, "
+                                               "the guest kernel didn't seem to detect it.\n"
+                                               "Please make sure that the kernel was compiled "
+                                               "with CONFIG_NET_9P_VIRTIO.");
+
+       return err;
+
+free_p9dev:
+       free(p9dev);
+       return err;
+}
diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c
new file mode 100644 (file)
index 0000000..0b79132
--- /dev/null
@@ -0,0 +1,288 @@
+#include "kvm/virtio-balloon.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_balloon.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+/* Number of virtqueues and the ring size used for each of them. */
+#define NUM_VIRT_QUEUES                3
+#define VIRTIO_BLN_QUEUE_SIZE  128
+/* Indices into bln_dev.vqs[] / bln_dev.jobs[]. */
+#define VIRTIO_BLN_INFLATE     0
+#define VIRTIO_BLN_DEFLATE     1
+#define VIRTIO_BLN_STATS       2
+
+/* State of the (single) virtio balloon device. */
+struct bln_dev {
+       struct list_head        list;
+       struct virtio_pci       vpci;
+
+       /* Feature bits stored by set_guest_features(). */
+       u32                     features;
+
+       /* One virtqueue and one thread pool job per queue index. */
+       struct virt_queue       vqs[NUM_VIRT_QUEUES];
+       struct thread_pool__job jobs[NUM_VIRT_QUEUES];
+
+       /* Copy of the last statistics buffer received on the stats queue. */
+       struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+       /* Guest stats buffer currently held by the device, and its head index. */
+       struct virtio_balloon_stat *cur_stat;
+       u32                     cur_stat_head;
+       /* Number of entries in stats[] that were filled by the last copy. */
+       u16                     stat_count;
+       /* eventfd written when new stats arrive, read by the stats collector. */
+       int                     stat_waitfd;
+
+       /* Id returned by compat__add_message(); removed again in init_vq(). */
+       int                     compat_id;
+       /* Device configuration space, accessed byte-wise by get/set_config(). */
+       struct virtio_balloon_config config;
+};
+
+/* The one balloon device instance; kvm is defined in another file. */
+static struct bln_dev bdev;
+extern struct kvm *kvm;
+
+/*
+ * Pop one buffer from the inflate or deflate queue and process it.
+ *
+ * The first descriptor is read as an array of 32-bit guest page frame
+ * numbers.  On the inflate queue every listed page is released with
+ * madvise(MADV_DONTNEED) and config.actual is incremented; on the deflate
+ * queue config.actual is decremented.  The buffer is then handed back to
+ * the guest with the number of entries as its used length.
+ *
+ * Always returns true.
+ */
+static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+       struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+       unsigned int len = 0;
+       u16 out, in, head;
+       u32 *ptrs, i;
+
+       head            = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+       ptrs            = iov[0].iov_base;
+       len             = iov[0].iov_len / sizeof(u32);
+
+       for (i = 0 ; i < len ; i++) {
+               void *guest_ptr;
+
+               /*
+                * Widen before shifting: the PFN is only 32 bits wide, so
+                * shifting it as a u32 would truncate guest addresses at or
+                * above 4GB.
+                */
+               guest_ptr = guest_flat_to_host(kvm, (u64)ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT);
+               if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) {
+                       madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED);
+                       bdev->config.actual++;
+               } else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) {
+                       bdev->config.actual--;
+               }
+       }
+
+       virt_queue__set_used_elem(queue, head, len);
+
+       return true;
+}
+
+/*
+ * Handle a buffer arriving on the statistics queue.
+ *
+ * The very first buffer is only remembered (cur_stat / cur_stat_head) so
+ * it can be returned to the guest later when stats are requested.  Every
+ * later buffer carries fresh statistics: they are copied into bdev->stats,
+ * the new buffer is remembered, and stat_waitfd is written to wake the
+ * reader blocked in virtio_bln__collect_stats().
+ *
+ * Returns true on success, false when writing to stat_waitfd fails.
+ */
+static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+       struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+       u16 out, in, head;
+       struct virtio_balloon_stat *stat;
+       u64 wait_val = 1;
+       size_t stat_len;
+
+       head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+       stat = iov[0].iov_base;
+
+       /* Initial empty stat buffer */
+       if (bdev->cur_stat == NULL) {
+               bdev->cur_stat = stat;
+               bdev->cur_stat_head = head;
+
+               return true;
+       }
+
+       /*
+        * The buffer length comes from the guest; never copy more than the
+        * stats array can hold.
+        */
+       stat_len = iov[0].iov_len;
+       if (stat_len > sizeof(bdev->stats))
+               stat_len = sizeof(bdev->stats);
+
+       memcpy(bdev->stats, stat, stat_len);
+
+       bdev->stat_count = stat_len / sizeof(struct virtio_balloon_stat);
+       bdev->cur_stat = stat;
+       bdev->cur_stat_head = head;
+
+       /* A negative errno would silently convert to 'true' in a bool. */
+       if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0)
+               return false;
+
+       return true;
+}
+
+/*
+ * Thread pool callback for all balloon queues; @param is the virtqueue
+ * that was notified.  The stats queue is handled once per call, the
+ * inflate/deflate queues are drained until no buffer is available.  The
+ * guest is signalled after every processed buffer.
+ */
+static void virtio_bln_do_io(struct kvm *kvm, void *param)
+{
+       struct virt_queue *vq = param;
+       bool is_stats_vq = (vq == &bdev.vqs[VIRTIO_BLN_STATS]);
+
+       if (!is_stats_vq) {
+               for (; virt_queue__available(vq); ) {
+                       virtio_bln_do_io_request(kvm, &bdev, vq);
+                       virtio_pci__signal_vq(kvm, &bdev.vpci, vq - bdev.vqs);
+               }
+               return;
+       }
+
+       virtio_bln_do_stat_request(kvm, &bdev, vq);
+       virtio_pci__signal_vq(kvm, &bdev.vpci, VIRTIO_BLN_STATS);
+}
+
+/*
+ * Ask the guest for fresh statistics: return the held stats buffer to the
+ * guest, signal the stats queue, then block on stat_waitfd until
+ * virtio_bln_do_stat_request() reports that new data arrived.
+ *
+ * Returns 0 on success, -EFAULT when reading the eventfd fails.
+ */
+static int virtio_bln__collect_stats(void)
+{
+       struct virt_queue *stats_vq = &bdev.vqs[VIRTIO_BLN_STATS];
+       u64 counter;
+       ssize_t nread;
+
+       virt_queue__set_used_elem(stats_vq, bdev.cur_stat_head,
+                                 sizeof(struct virtio_balloon_stat));
+       virtio_pci__signal_vq(kvm, &bdev.vpci, VIRTIO_BLN_STATS);
+
+       nread = read(bdev.stat_waitfd, &counter, sizeof(counter));
+
+       return (nread <= 0) ? -EFAULT : 0;
+}
+
+/*
+ * Collect fresh guest memory statistics and print them to stdout, one
+ * value per line.  Known tags are prefixed with a description; a value
+ * with an unknown tag is printed without one.
+ *
+ * Returns 0 on success, -EFAULT when the statistics could not be collected.
+ */
+static int virtio_bln__print_stats(void)
+{
+       u16 idx;
+
+       if (virtio_bln__collect_stats() < 0)
+               return -EFAULT;
+
+       printf("\n\n\t*** Guest memory statistics ***\n\n");
+       for (idx = 0; idx < bdev.stat_count; idx++) {
+               const char *label = NULL;
+
+               switch (bdev.stats[idx].tag) {
+               case VIRTIO_BALLOON_S_SWAP_IN:
+                       label = "The amount of memory that has been swapped in (in bytes):";
+                       break;
+               case VIRTIO_BALLOON_S_SWAP_OUT:
+                       label = "The amount of memory that has been swapped out to disk (in bytes):";
+                       break;
+               case VIRTIO_BALLOON_S_MAJFLT:
+                       label = "The number of major page faults that have occurred:";
+                       break;
+               case VIRTIO_BALLOON_S_MINFLT:
+                       label = "The number of minor page faults that have occurred:";
+                       break;
+               case VIRTIO_BALLOON_S_MEMFREE:
+                       label = "The amount of memory not being used for any purpose (in bytes):";
+                       break;
+               case VIRTIO_BALLOON_S_MEMTOT:
+                       label = "The total amount of memory available (in bytes):";
+                       break;
+               }
+
+               if (label)
+                       printf("%s", label);
+               printf("%llu\n", bdev.stats[idx].val);
+       }
+       printf("\n");
+
+       return 0;
+}
+
+/*
+ * Signal handler for the balloon control signals.
+ *
+ * SIGKVMADDMEM raises config.num_pages by 256, SIGKVMDELMEM lowers it by
+ * 256 (ignored when fewer than 256 pages are set) and SIGKVMMEMSTAT prints
+ * the guest memory statistics.  Except for the statistics request and the
+ * ignored decrement, the guest is notified of a configuration change.
+ *
+ * NOTE(review): the statistics path calls printf() and blocks on an
+ * eventfd from signal context - confirm this is acceptable.
+ */
+static void handle_sigmem(int sig)
+{
+       const unsigned int step = 256;
+
+       if (sig == SIGKVMADDMEM) {
+               bdev.config.num_pages += step;
+       } else if (sig == SIGKVMDELMEM) {
+               if (bdev.config.num_pages >= step)
+                       bdev.config.num_pages -= step;
+               else
+                       return;
+       } else if (sig == SIGKVMMEMSTAT) {
+               virtio_bln__print_stats();
+               return;
+       }
+
+       /* Notify that the configuration space has changed */
+       virtio_pci__signal_config(kvm, &bdev.vpci);
+}
+
+/*
+ * virtio-pci callback: store one byte @data at byte @offset of the balloon
+ * configuration space.  Offsets outside the configuration structure are
+ * ignored instead of writing past it.
+ */
+static void set_config(struct kvm *kvm, void *dev, u8 data, u32 offset)
+{
+       struct bln_dev *bdev = dev;
+
+       if (offset >= sizeof(bdev->config))
+               return;
+
+       ((u8 *)(&bdev->config))[offset] = data;
+}
+
+/*
+ * virtio-pci callback: return the byte at byte @offset of the balloon
+ * configuration space.  Offsets outside the configuration structure read
+ * as 0 instead of reading past it.
+ */
+static u8 get_config(struct kvm *kvm, void *dev, u32 offset)
+{
+       struct bln_dev *bdev = dev;
+
+       if (offset >= sizeof(bdev->config))
+               return 0;
+
+       return ((u8 *)(&bdev->config))[offset];
+}
+
+/* virtio-pci callback: the only feature offered is the statistics queue. */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       u32 features = 0;
+
+       features |= 1 << VIRTIO_BALLOON_F_STATS_VQ;
+
+       return features;
+}
+
+/* virtio-pci callback: remember the feature bits passed in @features. */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct bln_dev *balloon = dev;
+
+       balloon->features = features;
+}
+
+/*
+ * virtio-pci callback: set up virtqueue @vq whose ring lives at guest page
+ * frame @pfn.  Drops the "device not detected" message, records the pfn,
+ * prepares the thread pool job for the queue and initialises the vring on
+ * the host mapping of the ring.  Always returns 0.
+ *
+ * NOTE(review): @vq indexes vqs[] and jobs[] unchecked - confirm the caller
+ * bounds it.
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct bln_dev *balloon = dev;
+       struct virt_queue *queue = &balloon->vqs[vq];
+       void *ring_mem;
+
+       compat__remove_message(balloon->compat_id);
+
+       queue->pfn = pfn;
+       ring_mem = guest_pfn_to_host(kvm, queue->pfn);
+
+       thread_pool__init_job(&balloon->jobs[vq], kvm, virtio_bln_do_io, queue);
+       vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, ring_mem, VIRTIO_PCI_VRING_ALIGN);
+
+       return 0;
+}
+
+/*
+ * virtio-pci callback: queue the thread pool job belonging to virtqueue
+ * @vq.  Always returns 0.
+ */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct bln_dev *balloon = dev;
+       struct thread_pool__job *job = &balloon->jobs[vq];
+
+       thread_pool__do_job(job);
+
+       return 0;
+}
+
+/* virtio-pci callback: return the page frame number stored for queue @vq. */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct bln_dev *balloon = dev;
+       struct virt_queue *queue = &balloon->vqs[vq];
+
+       return queue->pfn;
+}
+
+/* virtio-pci callback: every balloon queue has the same fixed ring size. */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       int ring_entries = VIRTIO_BLN_QUEUE_SIZE;
+
+       return ring_entries;
+}
+
+/*
+ * Set up the balloon device: install the handlers for the three balloon
+ * control signals, create the eventfd used to wait for statistics, clear
+ * the configuration space, register the device with virtio-pci and add the
+ * "device was not detected" compat message.
+ *
+ * NOTE(review): the eventfd() result is stored without an error check.
+ */
+void virtio_bln__init(struct kvm *kvm)
+{
+       signal(SIGKVMADDMEM, handle_sigmem);
+       signal(SIGKVMDELMEM, handle_sigmem);
+       signal(SIGKVMMEMSTAT, handle_sigmem);
+
+       bdev.stat_waitfd = eventfd(0, 0);
+       memset(&bdev.config, 0, sizeof(bdev.config));
+
+       virtio_pci__init(kvm, &bdev.vpci, &bdev, PCI_DEVICE_ID_VIRTIO_BLN, VIRTIO_ID_BALLOON);
+       bdev.vpci.ops = (struct virtio_pci_ops) {
+               .get_config             = get_config,
+               .set_config             = set_config,
+               .get_host_features      = get_host_features,
+               .set_guest_features     = set_guest_features,
+               .init_vq                = init_vq,
+               .notify_vq              = notify_vq,
+               .get_pfn_vq             = get_pfn_vq,
+               .get_size_vq            = get_size_vq,
+       };
+
+       bdev.compat_id = compat__add_message("virtio-balloon device was not detected",
+                                               "While you have requested a virtio-balloon device, "
+                                               "the guest kernel didn't seem to detect it.\n"
+                                               "Please make sure that the kernel was compiled "
+                                               "with CONFIG_VIRTIO_BALLOON.");
+}
diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c
new file mode 100644 (file)
index 0000000..c508123
--- /dev/null
@@ -0,0 +1,247 @@
+#include "kvm/virtio-blk.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/virtio.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <pthread.h>
+
+#define VIRTIO_BLK_MAX_DEV             4
+#define NUM_VIRT_QUEUES                        1
+
+/* Ring size of the request queue; also the size of the per-device job table. */
+#define VIRTIO_BLK_QUEUE_SIZE          128
+/*
+ * The header and status consume two entries
+ */
+#define DISK_SEG_MAX                   (VIRTIO_BLK_QUEUE_SIZE - 2)
+
+/* One in-flight block request, as handed to the thread pool. */
+struct blk_dev_job {
+       /* Queue the request was popped from and the device it belongs to. */
+       struct virt_queue               *vq;
+       struct blk_dev                  *bdev;
+       /* Descriptor chain of the request as filled by virt_queue__get_iov(). */
+       struct iovec                    iov[VIRTIO_BLK_QUEUE_SIZE];
+       u16                             out, in, head;
+       struct thread_pool__job         job_id;
+};
+
+/* State of one virtio block device. */
+struct blk_dev {
+       /* Held while a completed request is added to the used ring. */
+       pthread_mutex_t                 mutex;
+       /* Link in the bdevs list. */
+       struct list_head                list;
+
+       struct virtio_pci               vpci;
+       /* Configuration space, accessed byte-wise by get/set_config(). */
+       struct virtio_blk_config        blk_config;
+       struct disk_image               *disk;
+       /* Id returned by compat__add_message(); removed again in init_vq(). */
+       int                             compat_id;
+       /* Feature bits stored by set_guest_features(). */
+       u32                             features;
+
+       struct virt_queue               vqs[NUM_VIRT_QUEUES];
+       /* Job slots, used round-robin through job_idx. */
+       struct blk_dev_job              jobs[VIRTIO_BLK_QUEUE_SIZE];
+       u16                             job_idx;
+};
+
+/* All block devices created by virtio_blk__init(). */
+static LIST_HEAD(bdevs);
+
+/*
+ * Thread pool callback that executes one block request; @param is the
+ * struct blk_dev_job prepared by virtio_blk_do_io().
+ *
+ * The first descriptor holds the request header and the last one the
+ * status byte; the descriptors in between are the data buffers.  After the
+ * disk operation the status byte is written, the request is added to the
+ * used ring under the device mutex and the guest is signalled.
+ *
+ * NOTE(review): a chain with fewer than two descriptors makes
+ * 'in + out - 2' wrap - confirm the guest is trusted to send well-formed
+ * requests.
+ */
+static void virtio_blk_do_io_request(struct kvm *kvm, void *param)
+{
+       struct virtio_blk_outhdr *req;
+       u8 *status;
+       ssize_t block_cnt;
+       struct blk_dev_job *job;
+       struct blk_dev *bdev;
+       struct virt_queue *queue;
+       struct iovec *iov;
+       u16 out, in, head;
+       u32 used_len;
+
+       block_cnt       = -1;
+       job             = param;
+       bdev            = job->bdev;
+       queue           = job->vq;
+       iov             = job->iov;
+       out             = job->out;
+       in              = job->in;
+       head            = job->head;
+       req             = iov[0].iov_base;
+
+       switch (req->type) {
+       case VIRTIO_BLK_T_IN:
+               block_cnt       = disk_image__read(bdev->disk, req->sector, iov + 1, in + out - 2);
+               break;
+       case VIRTIO_BLK_T_OUT:
+               block_cnt       = disk_image__write(bdev->disk, req->sector, iov + 1, in + out - 2);
+               break;
+       case VIRTIO_BLK_T_FLUSH:
+               block_cnt       = disk_image__flush(bdev->disk);
+               break;
+       case VIRTIO_BLK_T_GET_ID:
+               block_cnt       = VIRTIO_BLK_ID_BYTES;
+               disk_image__get_serial(bdev->disk, (iov + 1)->iov_base, &block_cnt);
+               break;
+       default:
+               pr_warning("request type %d", req->type);
+               block_cnt       = -1;
+               break;
+       }
+
+       /* status */
+       status                  = iov[out + in - 1].iov_base;
+       *status                 = (block_cnt < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+       /*
+        * The used length is unsigned: a failed request must not report -1,
+        * which would read as a length of 0xFFFFFFFF.
+        */
+       used_len                = (block_cnt < 0) ? 0 : block_cnt;
+
+       mutex_lock(&bdev->mutex);
+       virt_queue__set_used_elem(queue, head, used_len);
+       mutex_unlock(&bdev->mutex);
+
+       virtio_pci__signal_vq(kvm, &bdev->vpci, queue - bdev->vqs);
+}
+
+/*
+ * Drain @vq: every available request is popped into the next job slot of
+ * @bdev (slots are reused round-robin) and handed to the thread pool, which
+ * runs virtio_blk_do_io_request() on it.
+ */
+static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev)
+{
+       struct blk_dev_job *job;
+
+       for (; virt_queue__available(vq); ) {
+               job = &bdev->jobs[bdev->job_idx++ % VIRTIO_BLK_QUEUE_SIZE];
+
+               /* Reset the slot, keeping only the back pointers. */
+               *job = (struct blk_dev_job) {
+                       .bdev   = bdev,
+                       .vq     = vq,
+               };
+               job->head = virt_queue__get_iov(vq, job->iov, &job->out, &job->in, kvm);
+
+               thread_pool__init_job(&job->job_id, kvm, virtio_blk_do_io_request, job);
+               thread_pool__do_job(&job->job_id);
+       }
+}
+
+/*
+ * virtio-pci callback: store one byte @data at byte @offset of the block
+ * device configuration space.  Offsets outside the configuration structure
+ * are ignored instead of writing past it.
+ */
+static void set_config(struct kvm *kvm, void *dev, u8 data, u32 offset)
+{
+       struct blk_dev *bdev = dev;
+
+       if (offset >= sizeof(bdev->blk_config))
+               return;
+
+       ((u8 *)(&bdev->blk_config))[offset] = data;
+}
+
+/*
+ * virtio-pci callback: return the byte at byte @offset of the block device
+ * configuration space.  Offsets outside the configuration structure read
+ * as 0 instead of reading past it.
+ */
+static u8 get_config(struct kvm *kvm, void *dev, u32 offset)
+{
+       struct blk_dev *bdev = dev;
+
+       if (offset >= sizeof(bdev->blk_config))
+               return 0;
+
+       return ((u8 *)(&bdev->blk_config))[offset];
+}
+
+/* virtio-pci callback: offer the SEG_MAX and FLUSH features. */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       u32 features = 0;
+
+       features |= 1UL << VIRTIO_BLK_F_SEG_MAX;
+       features |= 1UL << VIRTIO_BLK_F_FLUSH;
+
+       return features;
+}
+
+/* virtio-pci callback: remember the feature bits passed in @features. */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct blk_dev *blk = dev;
+
+       blk->features = features;
+}
+
+/*
+ * virtio-pci callback: set up virtqueue @vq whose ring lives at guest page
+ * frame @pfn.  Drops the "device not detected" message, records the pfn
+ * and initialises the vring on the host mapping of the ring.
+ * Always returns 0.
+ *
+ * NOTE(review): @vq indexes vqs[] unchecked - confirm the caller bounds it.
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct blk_dev *blk = dev;
+       struct virt_queue *queue = &blk->vqs[vq];
+       void *ring_mem;
+
+       compat__remove_message(blk->compat_id);
+
+       queue->pfn = pfn;
+       ring_mem = guest_pfn_to_host(kvm, queue->pfn);
+
+       vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, ring_mem, VIRTIO_PCI_VRING_ALIGN);
+
+       return 0;
+}
+
+/*
+ * virtio-pci callback: dispatch all requests currently available on
+ * virtqueue @vq.  Always returns 0.
+ */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct blk_dev *blk = dev;
+       struct virt_queue *queue = &blk->vqs[vq];
+
+       virtio_blk_do_io(kvm, queue, blk);
+
+       return 0;
+}
+
+/* virtio-pci callback: return the page frame number stored for queue @vq. */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct blk_dev *blk = dev;
+       struct virt_queue *queue = &blk->vqs[vq];
+
+       return queue->pfn;
+}
+
+/* virtio-pci callback: the request queue has a fixed ring size. */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       int ring_entries = VIRTIO_BLK_QUEUE_SIZE;
+
+       return ring_entries;
+}
+
+/*
+ * Create a virtio block device backed by @disk, register it with
+ * virtio-pci, append it to the bdevs list and add the "device was not
+ * detected" compat message.  Does nothing when @disk is NULL; dies when
+ * the device structure cannot be allocated.
+ */
+void virtio_blk__init(struct kvm *kvm, struct disk_image *disk)
+{
+       struct blk_dev *blk;
+
+       if (disk == NULL)
+               return;
+
+       blk = calloc(1, sizeof(*blk));
+       if (!blk)
+               die("Failed allocating bdev");
+
+       /* Capacity is in sectors; seg_max leaves room for header and status. */
+       *blk = (struct blk_dev) {
+               .mutex          = PTHREAD_MUTEX_INITIALIZER,
+               .disk           = disk,
+               .blk_config     = (struct virtio_blk_config) {
+                       .capacity       = disk->size / SECTOR_SIZE,
+                       .seg_max        = DISK_SEG_MAX,
+               },
+       };
+
+       virtio_pci__init(kvm, &blk->vpci, blk, PCI_DEVICE_ID_VIRTIO_BLK, VIRTIO_ID_BLOCK);
+       blk->vpci.ops = (struct virtio_pci_ops) {
+               .get_config             = get_config,
+               .set_config             = set_config,
+               .get_host_features      = get_host_features,
+               .set_guest_features     = set_guest_features,
+               .init_vq                = init_vq,
+               .notify_vq              = notify_vq,
+               .get_pfn_vq             = get_pfn_vq,
+               .get_size_vq            = get_size_vq,
+       };
+
+       list_add_tail(&blk->list, &bdevs);
+
+       blk->compat_id = compat__add_message("virtio-blk device was not detected",
+                                               "While you have requested a virtio-blk device, "
+                                               "the guest kernel didn't seem to detect it.\n"
+                                               "Please make sure that the kernel was compiled "
+                                               "with CONFIG_VIRTIO_BLK.");
+}
+
+/* Create one virtio block device for each of the kvm->nr_disks disk images. */
+void virtio_blk__init_all(struct kvm *kvm)
+{
+       int disk_nr = 0;
+
+       while (disk_nr < kvm->nr_disks) {
+               virtio_blk__init(kvm, kvm->disks[disk_nr]);
+               disk_nr++;
+       }
+}
+
+/* Unlink and free every block device on the bdevs list. */
+void virtio_blk__delete_all(struct kvm *kvm)
+{
+       struct blk_dev *blk, *next;
+
+       list_for_each_entry_safe(blk, next, &bdevs, list) {
+               list_del(&blk->list);
+               free(blk);
+       }
+}
diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c
new file mode 100644 (file)
index 0000000..c0ccd6c
--- /dev/null
@@ -0,0 +1,196 @@
+#include "kvm/virtio-console.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/virtio.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <assert.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+/* Ring size of each console queue, queue count, and indices into vqs[]/jobs[]. */
+#define VIRTIO_CONSOLE_QUEUE_SIZE      128
+#define VIRTIO_CONSOLE_NUM_QUEUES      2
+#define VIRTIO_CONSOLE_RX_QUEUE                0
+#define VIRTIO_CONSOLE_TX_QUEUE                1
+
+/* State of the (single) virtio console device. */
+struct con_dev {
+       /* Held by the receive callback while it fills a guest buffer. */
+       pthread_mutex_t                 mutex;
+
+       struct virtio_pci               vpci;
+       struct virt_queue               vqs[VIRTIO_CONSOLE_NUM_QUEUES];
+       /* Configuration space, accessed byte-wise by get/set_config(). */
+       struct virtio_console_config    config;
+       u32                             features;
+       /* Id returned by compat__add_message(); removed again in init_vq(). */
+       int                             compat_id;
+
+       /* One thread pool job per queue, set up in init_vq(). */
+       struct thread_pool__job         jobs[VIRTIO_CONSOLE_NUM_QUEUES];
+};
+
+/* The one console device instance: an 80x24 terminal with a single port. */
+static struct con_dev cdev = {
+       .mutex                          = PTHREAD_MUTEX_INITIALIZER,
+
+       .config = {
+               .cols                   = 80,
+               .rows                   = 24,
+               .max_nr_ports           = 1,
+       },
+};
+
+/*
+ * Interrupts are injected for hvc0 only.
+ */
+/*
+ * Thread pool callback for the receive queue; @param is that virtqueue.
+ * When terminal input is pending and the guest has posted a buffer, the
+ * input is read into the buffer, the buffer is returned with the number of
+ * bytes read, and the guest is signalled.  Runs under cdev.mutex.
+ */
+static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param)
+{
+       struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+       struct virt_queue *vq = param;
+       u16 out, in;
+       u16 head;
+       int len;
+
+       mutex_lock(&cdev.mutex);
+
+       if (!term_readable(CONSOLE_VIRTIO) || !virt_queue__available(vq))
+               goto unlock;
+
+       head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+       len = term_getc_iov(CONSOLE_VIRTIO, iov, in);
+       virt_queue__set_used_elem(vq, head, len);
+       virtio_pci__signal_vq(kvm, &cdev.vpci, vq - cdev.vqs);
+
+unlock:
+       mutex_unlock(&cdev.mutex);
+}
+
+/* Queue the receive-queue job so pending terminal input reaches the guest. */
+void virtio_console__inject_interrupt(struct kvm *kvm)
+{
+       struct thread_pool__job *rx_job = &cdev.jobs[VIRTIO_CONSOLE_RX_QUEUE];
+
+       thread_pool__do_job(rx_job);
+}
+
+/*
+ * Thread pool callback for the transmit queue; @param is that virtqueue.
+ * Every available buffer is written to the terminal and returned to the
+ * guest with the number of bytes written.
+ */
+static void virtio_console_handle_callback(struct kvm *kvm, void *param)
+{
+       struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+       struct virt_queue *vq = param;
+       u16 out, in;
+       u16 head;
+       u32 len;
+
+       /*
+        * The current Linux implementation polls for the buffer
+        * to be used, rather than waiting for an interrupt.
+        * So there is no need to inject an interrupt for the tx path.
+        */
+       for (;;) {
+               if (!virt_queue__available(vq))
+                       break;
+
+               head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+               len = term_putc_iov(CONSOLE_VIRTIO, iov, out);
+               virt_queue__set_used_elem(vq, head, len);
+       }
+}
+
+/*
+ * virtio-pci callback: store one byte @data at byte @offset of the console
+ * configuration space.  Offsets outside the configuration structure are
+ * ignored instead of writing past it.
+ */
+static void set_config(struct kvm *kvm, void *dev, u8 data, u32 offset)
+{
+       struct con_dev *cdev = dev;
+
+       if (offset >= sizeof(cdev->config))
+               return;
+
+       ((u8 *)(&cdev->config))[offset] = data;
+}
+
+/*
+ * virtio-pci callback: return the byte at byte @offset of the console
+ * configuration space.  Offsets outside the configuration structure read
+ * as 0 instead of reading past it.
+ */
+static u8 get_config(struct kvm *kvm, void *dev, u32 offset)
+{
+       struct con_dev *cdev = dev;
+
+       if (offset >= sizeof(cdev->config))
+               return 0;
+
+       return ((u8 *)(&cdev->config))[offset];
+}
+
+/* virtio-pci callback: the console offers no optional features. */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       u32 features = 0;
+
+       return features;
+}
+
+/* virtio-pci callback: intentionally empty, @features is not recorded. */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       /* Nothing to do. */
+}
+
+/*
+ * virtio-pci callback: set up console virtqueue @vq whose ring lives at
+ * guest page frame @pfn.  Drops the "device not detected" message, records
+ * the pfn, initialises the vring and prepares the thread pool job that
+ * matches the queue (transmit handler or receive handler).
+ * Always returns 0.
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct virt_queue *queue;
+       void *ring_mem;
+
+       assert(vq < VIRTIO_CONSOLE_NUM_QUEUES);
+
+       compat__remove_message(cdev.compat_id);
+
+       queue = &cdev.vqs[vq];
+       queue->pfn = pfn;
+       ring_mem = guest_pfn_to_host(kvm, queue->pfn);
+
+       vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, ring_mem, VIRTIO_PCI_VRING_ALIGN);
+
+       switch (vq) {
+       case VIRTIO_CONSOLE_TX_QUEUE:
+               thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue);
+               break;
+       case VIRTIO_CONSOLE_RX_QUEUE:
+               thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * virtio-pci callback: queue the thread pool job belonging to virtqueue
+ * @vq.  Always returns 0.
+ */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct con_dev *console = dev;
+       struct thread_pool__job *job = &console->jobs[vq];
+
+       thread_pool__do_job(job);
+
+       return 0;
+}
+
+/* virtio-pci callback: return the page frame number stored for queue @vq. */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct con_dev *console = dev;
+       struct virt_queue *queue = &console->vqs[vq];
+
+       return queue->pfn;
+}
+
+/* virtio-pci callback: both console queues have the same fixed ring size. */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       int ring_entries = VIRTIO_CONSOLE_QUEUE_SIZE;
+
+       return ring_entries;
+}
+
+/*
+ * Register the console device with virtio-pci, install its callbacks and
+ * add the "device was not detected" compat message.
+ */
+void virtio_console__init(struct kvm *kvm)
+{
+       virtio_pci__init(kvm, &cdev.vpci, &cdev, PCI_DEVICE_ID_VIRTIO_CONSOLE, VIRTIO_ID_CONSOLE);
+
+       cdev.vpci.ops = (struct virtio_pci_ops) {
+               .get_config             = get_config,
+               .set_config             = set_config,
+               .get_host_features      = get_host_features,
+               .set_guest_features     = set_guest_features,
+               .init_vq                = init_vq,
+               .notify_vq              = notify_vq,
+               .get_pfn_vq             = get_pfn_vq,
+               .get_size_vq            = get_size_vq,
+       };
+
+       cdev.compat_id = compat__add_message("virtio-console device was not detected",
+                                               "While you have requested a virtio-console device, "
+                                               "the guest kernel didn't seem to detect it.\n"
+                                               "Please make sure that the kernel was compiled "
+                                               "with CONFIG_VIRTIO_CONSOLE.");
+}
diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c
new file mode 100644 (file)
index 0000000..d7c132b
--- /dev/null
@@ -0,0 +1,123 @@
+#include <linux/virtio_ring.h>
+#include <linux/types.h>
+#include <sys/uio.h>
+
+#include "kvm/barrier.h"
+
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+/*
+ * Return the descriptor chain starting at @head to the guest by appending
+ * an entry {id = @head, len = @len} to the used ring of @queue and then
+ * advancing the used index.
+ *
+ * Returns a pointer to the used ring entry that was filled in.  The caller
+ * is responsible for signalling the guest afterwards.
+ */
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len)
+{
+       struct vring_used_elem *used_elem;
+
+       /* The next free slot is the current used index, wrapped to the ring size. */
+       used_elem       = &queue->vring.used->ring[queue->vring.used->idx % queue->vring.num];
+       used_elem->id   = head;
+       used_elem->len  = len;
+
+       /*
+        * Use wmb to assure that used elem was updated with head and len.
+        * We need a wmb here since we can't advance idx unless we're ready
+        * to pass the used element to the guest.
+        */
+       wmb();
+       queue->vring.used->idx++;
+
+       /*
+        * Use wmb to assure used idx has been increased before we signal the guest.
+        * Without a wmb here the guest may ignore the queue since it won't see
+        * an updated idx.
+        */
+       wmb();
+
+       return used_elem;
+}
+
+/*
+ * Pop the next available descriptor chain from @queue and translate it
+ * into @iov: every descriptor becomes one entry whose base is the host
+ * address of the guest buffer.  On return *@out holds the number of
+ * descriptors without VRING_DESC_F_WRITE and *@in the number with it; the
+ * entries appear in @iov in chain order.
+ *
+ * The walk stops after queue->vring.num descriptors: a chain can never
+ * legitimately be longer than the ring, and without this bound a guest
+ * chain that loops back on itself would spin forever and run past the end
+ * of @iov.  @iov must therefore have room for queue->vring.num entries.
+ *
+ * Returns the index of the head descriptor, to be passed to
+ * virt_queue__set_used_elem() once the request is complete.
+ */
+u16 virt_queue__get_iov(struct virt_queue *queue, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm)
+{
+       struct vring_desc *desc;
+       u16 head, idx;
+       u32 count = 0;
+
+       idx = head = virt_queue__pop(queue);
+       *out = *in = 0;
+
+       do {
+               desc                    = virt_queue__get_desc(queue, idx);
+               iov[count].iov_base     = guest_flat_to_host(kvm, desc->addr);
+               iov[count].iov_len      = desc->len;
+               if (desc->flags & VRING_DESC_F_WRITE)
+                       (*in)++;
+               else
+                       (*out)++;
+               count++;
+
+               if (!(desc->flags & VRING_DESC_F_NEXT))
+                       break;
+               idx = desc->next;
+       } while (count < queue->vring.num);
+
+       return head;
+}
+
+/*
+ * Like virt_queue__get_iov(), but the chain is split into two arrays:
+ * descriptors with VRING_DESC_F_WRITE go to @in_iov, all others to
+ * @out_iov.  *@in and *@out receive the respective entry counts.
+ *
+ * in and out are relative to guest.
+ *
+ * The walk stops after queue->vring.num descriptors in total, so a guest
+ * chain that loops back on itself cannot spin forever or overrun the
+ * arrays; each array must have room for queue->vring.num entries.
+ *
+ * Returns the index of the head descriptor.
+ */
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+                             struct iovec in_iov[], struct iovec out_iov[],
+                             u16 *in, u16 *out)
+{
+       u16 head, idx;
+       struct vring_desc *desc;
+       u32 count = 0;
+
+       idx = head = virt_queue__pop(queue);
+       *out = *in = 0;
+       do {
+               desc = virt_queue__get_desc(queue, idx);
+               if (desc->flags & VRING_DESC_F_WRITE) {
+                       in_iov[*in].iov_base = guest_flat_to_host(kvm,
+                                                                 desc->addr);
+                       in_iov[*in].iov_len = desc->len;
+                       (*in)++;
+               } else {
+                       out_iov[*out].iov_base = guest_flat_to_host(kvm,
+                                                                   desc->addr);
+                       out_iov[*out].iov_len = desc->len;
+                       (*out)++;
+               }
+               count++;
+
+               if (!(desc->flags & VRING_DESC_F_NEXT))
+                       break;
+               idx = desc->next;
+       } while (count < queue->vring.num);
+       return head;
+}
+
+
+/*
+ * Raise interrupt line @irq for @vq unless the guest set
+ * VRING_AVAIL_F_NO_INTERRUPT in the available ring flags.  The line is only
+ * raised when *@isr is currently VIRTIO_IRQ_LOW, and *@isr is set to
+ * VIRTIO_IRQ_HIGH first.
+ */
+void virt_queue__trigger_irq(struct virt_queue *vq, int irq, u8 *isr, struct kvm *kvm)
+{
+       bool guest_masked = vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT;
+
+       if (guest_masked || *isr != VIRTIO_IRQ_LOW)
+               return;
+
+       *isr = VIRTIO_IRQ_HIGH;
+       kvm__irq_line(kvm, irq, VIRTIO_IRQ_HIGH);
+}
+
+/*
+ * Classify byte @offset within the device-specific part of a register
+ * block.  When @msix is set the first 4 bytes belong to the MSI-X fields,
+ * and when @features_hi is set the next 4 bytes belong to the high feature
+ * bits; everything after that is device configuration space.
+ *
+ * Returns VIRTIO_PCI_O_MSIX, VIRTIO_PCI_O_FEATURES or VIRTIO_PCI_O_CONFIG.
+ * Only in the VIRTIO_PCI_O_CONFIG case is *@config_off written; it then
+ * holds the offset relative to the start of the configuration space.
+ */
+int virtio__get_dev_specific_field(int offset, bool msix, bool features_hi, u32 *config_off)
+{
+       if (msix && offset < 4)
+               return VIRTIO_PCI_O_MSIX;
+       if (msix)
+               offset -= 4;
+
+       if (features_hi && offset < 4)
+               return VIRTIO_PCI_O_FEATURES;
+       if (features_hi)
+               offset -= 4;
+
+       *config_off = offset;
+
+       return VIRTIO_PCI_O_CONFIG;
+}
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
new file mode 100644 (file)
index 0000000..4700483
--- /dev/null
@@ -0,0 +1,406 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio.h"
+#include "kvm/types.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+#include "kvm/uip.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+
+#include <linux/virtio_net.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <unistd.h>
+#include <assert.h>
+#include <fcntl.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define VIRTIO_NET_QUEUE_SIZE          128
+#define VIRTIO_NET_NUM_QUEUES          2
+#define VIRTIO_NET_RX_QUEUE            0
+#define VIRTIO_NET_TX_QUEUE            1
+
+struct net_dev;
+
+extern struct kvm *kvm;
+
+/*
+ * Backend hooks used by the RX/TX threads to move packets.
+ *
+ * Both hooks receive an iovec array, the number of entries to use and the
+ * owning device. The TX thread passes its "out" descriptor count as the
+ * second argument even though the parameter is named "in" here.
+ */
+struct net_dev_operations {
+       int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+       int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+};
+
+/* State of the single virtio-net device instance. */
+struct net_dev {
+       pthread_mutex_t                 mutex;
+       struct virtio_pci               vpci;
+
+       /* One RX and one TX virtqueue, indexed by VIRTIO_NET_{RX,TX}_QUEUE. */
+       struct virt_queue               vqs[VIRTIO_NET_NUM_QUEUES];
+       /* Device configuration space exposed byte-wise via get/set_config. */
+       struct virtio_net_config        config;
+       /* Feature bits last written by the guest. */
+       u32                             features;
+       /* Handle of the "device not detected" compat message. */
+       int                             compat_id;
+
+       /* RX worker and the lock/condition used to wake it. */
+       pthread_t                       io_rx_thread;
+       pthread_mutex_t                 io_rx_lock;
+       pthread_cond_t                  io_rx_cond;
+
+       /* TX worker and the lock/condition used to wake it. */
+       pthread_t                       io_tx_thread;
+       pthread_mutex_t                 io_tx_lock;
+       pthread_cond_t                  io_tx_cond;
+
+       /* Host TAP device descriptor and interface name (TAP mode only). */
+       int                             tap_fd;
+       char                            tap_name[IFNAMSIZ];
+
+       /* Copy of params->mode; NET_MODE_TAP selects the TAP backend. */
+       int                             mode;
+
+       /* State handed to uip_init()/uip_rx()/uip_tx() in non-TAP mode. */
+       struct uip_info                 info;
+       /* Backend hooks: &tap_ops or &uip_ops. */
+       struct net_dev_operations       *ops;
+};
+
+/*
+ * The one and only virtio-net device. The link is reported as up from the
+ * start; every field not listed here is zero-initialised.
+ */
+static struct net_dev ndev = {
+       .mutex  = PTHREAD_MUTEX_INITIALIZER,
+
+       .config = {
+               .status                 = VIRTIO_NET_S_LINK_UP,
+       },
+       .info = {
+               .buf_nr                 = 20,
+       }
+};
+
+/*
+ * RX worker thread; @p is the struct kvm pointer passed to pthread_create().
+ *
+ * Sleeps on io_rx_cond until the RX queue has available buffers, then
+ * fills each buffer through ndev.ops->rx(), marks it used and signals the
+ * guest after every packet. The loop never terminates.
+ */
+static void *virtio_net_rx_thread(void *p)
+{
+       struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+       struct virt_queue *vq;
+       struct kvm *kvm;
+       u16 out, in;
+       u16 head;
+       int len;
+
+       kvm     = p;
+       vq      = &ndev.vqs[VIRTIO_NET_RX_QUEUE];
+
+       while (1) {
+
+               /*
+                * A wakeup without available buffers is harmless: the inner
+                * loop is skipped and we come back here to wait again.
+                */
+               mutex_lock(&ndev.io_rx_lock);
+               if (!virt_queue__available(vq))
+                       pthread_cond_wait(&ndev.io_rx_cond, &ndev.io_rx_lock);
+               mutex_unlock(&ndev.io_rx_lock);
+
+               while (virt_queue__available(vq)) {
+
+                       head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+
+                       /* NOTE(review): the rx() result is used as the length without an error check. */
+                       len = ndev.ops->rx(iov, in, &ndev);
+
+                       virt_queue__set_used_elem(vq, head, len);
+
+                       /* We should interrupt guest right now, otherwise latency is huge. */
+                       virtio_pci__signal_vq(kvm, &ndev.vpci, VIRTIO_NET_RX_QUEUE);
+               }
+
+       }
+
+       /* Not reached: the loop above has no exit. */
+       pthread_exit(NULL);
+       return NULL;
+
+}
+
+/*
+ * TX worker thread; @p is the struct kvm pointer passed to pthread_create().
+ *
+ * Sleeps on io_tx_cond until the TX queue has available buffers, sends
+ * every queued packet through ndev.ops->tx(), and signals the guest once
+ * per drained batch. The loop never terminates.
+ */
+static void *virtio_net_tx_thread(void *p)
+{
+       struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+       struct virt_queue *vq;
+       struct kvm *kvm;
+       u16 out, in;
+       u16 head;
+       int len;
+
+       kvm     = p;
+       vq      = &ndev.vqs[VIRTIO_NET_TX_QUEUE];
+
+       while (1) {
+               /*
+                * A wakeup without available buffers is harmless: the inner
+                * loop is skipped and we come back here to wait again.
+                */
+               mutex_lock(&ndev.io_tx_lock);
+               if (!virt_queue__available(vq))
+                       pthread_cond_wait(&ndev.io_tx_cond, &ndev.io_tx_lock);
+               mutex_unlock(&ndev.io_tx_lock);
+
+               while (virt_queue__available(vq)) {
+
+                       head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+
+                       /* NOTE(review): the tx() result is used as the length without an error check. */
+                       len = ndev.ops->tx(iov, out, &ndev);
+
+                       virt_queue__set_used_elem(vq, head, len);
+               }
+
+               virtio_pci__signal_vq(kvm, &ndev.vpci, VIRTIO_NET_TX_QUEUE);
+       }
+
+       /* Not reached: the loop above has no exit. */
+       pthread_exit(NULL);
+
+       return NULL;
+
+}
+
+/*
+ * Wake the worker thread that serves @queue_index. Unknown queue numbers
+ * are only reported with a warning.
+ */
+static void virtio_net_handle_callback(struct kvm *kvm, u16 queue_index)
+{
+       pthread_mutex_t *lock;
+       pthread_cond_t *cond;
+
+       if (queue_index == VIRTIO_NET_TX_QUEUE) {
+               lock = &ndev.io_tx_lock;
+               cond = &ndev.io_tx_cond;
+       } else if (queue_index == VIRTIO_NET_RX_QUEUE) {
+               lock = &ndev.io_rx_lock;
+               cond = &ndev.io_rx_cond;
+       } else {
+               pr_warning("Unknown queue index %u", queue_index);
+               return;
+       }
+
+       mutex_lock(lock);
+       pthread_cond_signal(cond);
+       mutex_unlock(lock);
+}
+
+/*
+ * Create and configure the host TAP device backing the virtio-net device.
+ *
+ * Opens /dev/net/tun, enables vnet headers and offloads, then either runs
+ * params->script with the TAP interface name as its argument or, when the
+ * script is "none", assigns params->host_ip to the interface directly.
+ * Finally the interface is brought up.
+ *
+ * Returns true on success. On failure every descriptor opened here is
+ * closed, ndev.tap_fd is reset to -1 and false is returned.
+ */
+static bool virtio_net__tap_init(const struct virtio_net_parameters *params)
+{
+       int sock = socket(AF_INET, SOCK_STREAM, 0);
+       int pid, status, offload, hdr_len;
+       struct sockaddr_in sin = {0};
+       struct ifreq ifr;
+
+       ndev.tap_fd = open("/dev/net/tun", O_RDWR);
+       if (ndev.tap_fd < 0) {
+               pr_warning("Unable to open /dev/net/tun");
+               goto fail;
+       }
+
+       /* The socket is only used for the SIOC* interface ioctls below. */
+       if (sock < 0) {
+               pr_warning("Unable to create socket for tap device configuration");
+               goto fail;
+       }
+
+       memset(&ifr, 0, sizeof(ifr));
+       ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+       if (ioctl(ndev.tap_fd, TUNSETIFF, &ifr) < 0) {
+               pr_warning("Config tap device error. Are you root?");
+               goto fail;
+       }
+
+       strncpy(ndev.tap_name, ifr.ifr_name, sizeof(ndev.tap_name));
+       /* strncpy() leaves the buffer unterminated when the source fills it. */
+       ndev.tap_name[sizeof(ndev.tap_name) - 1] = '\0';
+
+       if (ioctl(ndev.tap_fd, TUNSETNOCSUM, 1) < 0) {
+               pr_warning("Config tap device TUNSETNOCSUM error");
+               goto fail;
+       }
+
+       /* A failure to set the vnet header size is reported but not fatal. */
+       hdr_len = sizeof(struct virtio_net_hdr);
+       if (ioctl(ndev.tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0) {
+               pr_warning("Config tap device TUNSETVNETHDRSZ error");
+       }
+
+       offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
+       if (ioctl(ndev.tap_fd, TUNSETOFFLOAD, offload) < 0) {
+               pr_warning("Config tap device TUNSETOFFLOAD error");
+               goto fail;
+       }
+
+       if (strcmp(params->script, "none")) {
+               pid = fork();
+               if (pid < 0) {
+                       /* Without a child, waitpid(-1) would reap an unrelated process. */
+                       pr_warning("Fail to fork for %s", params->script);
+                       goto fail;
+               }
+               if (pid == 0) {
+                       execl(params->script, params->script, ndev.tap_name, NULL);
+                       _exit(1);
+               }
+
+               /* Anything but a clean exit(0) of the script is a failure. */
+               if (waitpid(pid, &status, 0) < 0 ||
+                   !WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+                       pr_warning("Fail to setup tap by %s", params->script);
+                       goto fail;
+               }
+       } else {
+               memset(&ifr, 0, sizeof(ifr));
+               strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ndev.tap_name));
+               sin.sin_addr.s_addr = inet_addr(params->host_ip);
+               memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
+               ifr.ifr_addr.sa_family = AF_INET;
+               if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
+                       pr_warning("Could not set ip address on tap device");
+                       goto fail;
+               }
+       }
+
+       memset(&ifr, 0, sizeof(ifr));
+       strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ndev.tap_name));
+       ioctl(sock, SIOCGIFFLAGS, &ifr);
+       ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+       if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
+               pr_warning("Could not bring tap device up");
+
+       close(sock);
+
+       return true;
+
+fail:
+       if (sock >= 0)
+               close(sock);
+       if (ndev.tap_fd >= 0) {
+               close(ndev.tap_fd);
+               /* Do not leave a stale descriptor number behind for the I/O threads. */
+               ndev.tap_fd = -1;
+       }
+
+       return false;
+}
+
+/*
+ * Initialise the lock and condition variable of both I/O workers, then
+ * start the RX and TX threads with @kvm as their argument.
+ *
+ * Each thread waits on its own lock/condition pair, so all four objects
+ * must be initialised before the threads are created.
+ */
+static void virtio_net__io_thread_init(struct kvm *kvm)
+{
+       pthread_mutex_init(&ndev.io_rx_lock, NULL);
+       pthread_cond_init(&ndev.io_rx_cond, NULL);
+
+       pthread_mutex_init(&ndev.io_tx_lock, NULL);
+       pthread_cond_init(&ndev.io_tx_cond, NULL);
+
+       pthread_create(&ndev.io_rx_thread, NULL, virtio_net_rx_thread, (void *)kvm);
+       pthread_create(&ndev.io_tx_thread, NULL, virtio_net_tx_thread, (void *)kvm);
+}
+
+/* TAP backend: write @out iovec entries to the TAP descriptor. */
+static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+       int fd = ndev->tap_fd;
+
+       return writev(fd, iov, out);
+}
+
+/* TAP backend: read from the TAP descriptor into @in iovec entries. */
+static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+       int fd = ndev->tap_fd;
+
+       return readv(fd, iov, in);
+}
+
+/* uip backend: hand @out iovec entries to uip_tx(). */
+static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+       struct uip_info *info = &ndev->info;
+
+       return uip_tx(iov, out, info);
+}
+
+/* uip backend: let uip_rx() fill @in iovec entries. */
+static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+       struct uip_info *info = &ndev->info;
+
+       return uip_rx(iov, in, info);
+}
+
+/* Backend used when the device runs in NET_MODE_TAP. */
+static struct net_dev_operations tap_ops = {
+       .rx     = tap_ops_rx,
+       .tx     = tap_ops_tx,
+};
+
+/* Backend used in every other mode; packets go through the uip code. */
+static struct net_dev_operations uip_ops = {
+       .rx     = uip_ops_rx,
+       .tx     = uip_ops_tx,
+};
+
+/*
+ * Store one byte written by the guest into the device configuration.
+ *
+ * @offset is derived from a guest I/O port access, so writes outside
+ * struct virtio_net_config are dropped instead of corrupting the
+ * neighbouring fields of struct net_dev.
+ */
+static void set_config(struct kvm *kvm, void *dev, u8 data, u32 offset)
+{
+       struct net_dev *ndev = dev;
+
+       if (offset >= sizeof(ndev->config))
+               return;
+
+       ((u8 *)(&ndev->config))[offset] = data;
+}
+
+/*
+ * Return one byte of the device configuration for a guest read.
+ *
+ * @offset is derived from a guest I/O port access; reads outside
+ * struct virtio_net_config return 0 instead of leaking other device state.
+ */
+static u8 get_config(struct kvm *kvm, void *dev, u32 offset)
+{
+       struct net_dev *ndev = dev;
+
+       if (offset >= sizeof(ndev->config))
+               return 0;
+
+       return ((u8 *)(&ndev->config))[offset];
+}
+
+/* Feature bits offered to the guest: MAC, checksum and the UFO/TSO offloads. */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       u32 features = 0;
+
+       features |= 1UL << VIRTIO_NET_F_MAC;
+       features |= 1UL << VIRTIO_NET_F_CSUM;
+       features |= 1UL << VIRTIO_NET_F_HOST_UFO;
+       features |= 1UL << VIRTIO_NET_F_HOST_TSO4;
+       features |= 1UL << VIRTIO_NET_F_HOST_TSO6;
+       features |= 1UL << VIRTIO_NET_F_GUEST_UFO;
+       features |= 1UL << VIRTIO_NET_F_GUEST_TSO4;
+       features |= 1UL << VIRTIO_NET_F_GUEST_TSO6;
+
+       return features;
+}
+
+/* Remember the feature bits the guest driver acknowledged. */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       struct net_dev *net = dev;
+
+       net->features = features;
+}
+
+/*
+ * Set up virtqueue @vq at guest page frame @pfn.
+ *
+ * @vq is the queue selector written by the guest, so it is validated
+ * before indexing the queue array. Returns 0 on success and -1 for an
+ * unknown queue.
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct net_dev *ndev = dev;
+       struct virt_queue *queue;
+       void *p;
+
+       if (vq >= VIRTIO_NET_NUM_QUEUES)
+               return -1;
+
+       /* The guest driver has found the device; drop the "not detected" message. */
+       compat__remove_message(ndev->compat_id);
+
+       queue                   = &ndev->vqs[vq];
+       queue->pfn              = pfn;
+       p                       = guest_pfn_to_host(kvm, queue->pfn);
+
+       vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       return 0;
+}
+
+/* Guest kicked queue @vq: wake the matching worker thread. Always returns 0. */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       (void)dev;
+
+       virtio_net_handle_callback(kvm, vq);
+       return 0;
+}
+
+/* Return the guest page frame number previously stored for queue @vq. */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct net_dev *net = dev;
+       struct virt_queue *queue = &net->vqs[vq];
+
+       return queue->pfn;
+}
+
+/* Every virtio-net queue has the same fixed number of entries. */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       int size = VIRTIO_NET_QUEUE_SIZE;
+
+       return size;
+}
+
+/*
+ * Bring up the virtio-net device described by @params.
+ *
+ * Copies the MAC addresses, initialises the selected backend (TAP or
+ * uip), registers the device on the virtio PCI transport, starts the I/O
+ * threads and registers the "device not detected" compat message.
+ *
+ * A TAP backend that cannot be set up is fatal: the I/O threads would
+ * otherwise run against a closed descriptor.
+ */
+void virtio_net__init(const struct virtio_net_parameters *params)
+{
+       int i;
+
+       for (i = 0 ; i < 6 ; i++) {
+               ndev.config.mac[i]              = params->guest_mac[i];
+               ndev.info.guest_mac.addr[i]     = params->guest_mac[i];
+               ndev.info.host_mac.addr[i]      = params->host_mac[i];
+       }
+
+       ndev.mode = params->mode;
+       if (ndev.mode == NET_MODE_TAP) {
+               if (!virtio_net__tap_init(params))
+                       die("Unable to initialize virtio-net TAP device");
+               ndev.ops = &tap_ops;
+       } else {
+               ndev.info.host_ip               = ntohl(inet_addr(params->host_ip));
+               ndev.info.guest_ip              = ntohl(inet_addr(params->guest_ip));
+               ndev.info.guest_netmask         = ntohl(inet_addr("255.255.255.0"));
+               uip_init(&ndev.info);
+               ndev.ops = &uip_ops;
+       }
+
+       virtio_pci__init(kvm, &ndev.vpci, &ndev, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET);
+       ndev.vpci.ops = (struct virtio_pci_ops) {
+               .set_config             = set_config,
+               .get_config             = get_config,
+               .get_host_features      = get_host_features,
+               .set_guest_features     = set_guest_features,
+               .init_vq                = init_vq,
+               .notify_vq              = notify_vq,
+               .get_pfn_vq             = get_pfn_vq,
+               .get_size_vq            = get_size_vq,
+       };
+
+       virtio_net__io_thread_init(params->kvm);
+
+       ndev.compat_id = compat__add_message("virtio-net device was not detected",
+                                               "While you have requested a virtio-net device, "
+                                               "the guest kernel didn't seem to detect it.\n"
+                                               "Please make sure that the kernel was compiled "
+                                               "with CONFIG_VIRTIO_NET.");
+}
diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c
new file mode 100644 (file)
index 0000000..2745b25
--- /dev/null
@@ -0,0 +1,313 @@
+#include "kvm/virtio-pci.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/virtio.h"
+#include "kvm/ioeventfd.h"
+
+#include <linux/virtio_pci.h>
+#include <string.h>
+
+/*
+ * ioeventfd handler: forward the kick to the owning device's notify_vq
+ * hook for the queue recorded in @param.
+ */
+static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param)
+{
+       struct virtio_pci_ioevent_param *ev = param;
+       struct virtio_pci *vpci = ev->vpci;
+
+       vpci->ops.notify_vq(kvm, vpci->dev, ev->vq);
+}
+
+/*
+ * Register an ioeventfd for writes of queue number @vq to the device's
+ * VIRTIO_PCI_QUEUE_NOTIFY port, so such kicks are delivered through
+ * virtio_pci__ioevent_callback(). Always returns 0.
+ *
+ * NOTE(review): the eventfd() result is not checked, and a new eventfd is
+ * created on every call; confirm whether repeated queue setup can leak
+ * descriptors.
+ */
+static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_pci *vpci, u32 vq)
+{
+       struct ioevent ioevent;
+
+       /* Per-queue callback argument; it must outlive this function. */
+       vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) {
+               .vpci           = vpci,
+               .vq             = vq,
+       };
+
+       ioevent = (struct ioevent) {
+               .io_addr        = vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY,
+               .io_len         = sizeof(u16),
+               .fn             = virtio_pci__ioevent_callback,
+               .fn_ptr         = &vpci->ioeventfds[vq],
+               .datamatch      = vq,
+               .fn_kvm         = kvm,
+               .fd             = eventfd(0, 0),
+       };
+
+       ioeventfd__add_event(&ioevent);
+
+       return 0;
+}
+
+/* True when the MSI-X enable bit is set in the device's MSI-X control word. */
+static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci)
+{
+       return (vpci->pci_hdr.msix.ctrl & PCI_MSIX_FLAGS_ENABLE) != 0;
+}
+
+/*
+ * Handle a guest read from the device-specific part of the I/O window
+ * (everything past the first 20 bytes).
+ *
+ * Reads of the MSI-X vector registers return the stored vectors, reads of
+ * the configuration area return one byte from the device's get_config
+ * hook. Returns false when the offset belongs to neither.
+ */
+static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_pci *vpci, u16 port,
+                                       void *data, int size, int offset)
+{
+       u32 config_offset;
+       bool msix = virtio_pci__msix_enabled(vpci);
+       int type = virtio__get_dev_specific_field(offset - 20, msix, 0, &config_offset);
+
+       if (type == VIRTIO_PCI_O_CONFIG) {
+               u8 cfg = vpci->ops.get_config(kvm, vpci->dev, config_offset);
+
+               ioport__write8(data, cfg);
+               return true;
+       }
+
+       if (type != VIRTIO_PCI_O_MSIX)
+               return false;
+
+       /* Other offsets inside the MSI-X block are accepted but leave data untouched. */
+       if (offset == VIRTIO_MSI_CONFIG_VECTOR)
+               ioport__write16(data, vpci->config_vector);
+       else if (offset == VIRTIO_MSI_QUEUE_VECTOR)
+               ioport__write16(data, vpci->vq_vector[vpci->queue_selector]);
+
+       return true;
+}
+
+/*
+ * Port I/O read handler for the virtio PCI window.
+ *
+ * The common registers are served here; everything else is passed on to
+ * virtio_pci__specific_io_in(). Reading the ISR register returns the
+ * current value, lowers the interrupt line and clears the stored value.
+ * Returns false when the access could not be handled.
+ */
+static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       unsigned long offset;
+       bool ret = true;
+       struct virtio_pci *vpci;
+       u32 val;
+
+       vpci = ioport->priv;
+       offset = port - vpci->base_addr;
+
+       switch (offset) {
+       case VIRTIO_PCI_HOST_FEATURES:
+               val = vpci->ops.get_host_features(kvm, vpci->dev);
+               ioport__write32(data, val);
+               break;
+       case VIRTIO_PCI_QUEUE_PFN:
+               val = vpci->ops.get_pfn_vq(kvm, vpci->dev, vpci->queue_selector);
+               ioport__write32(data, val);
+               break;
+       case VIRTIO_PCI_QUEUE_NUM:
+               /* The queue size register is 16 bits wide. */
+               val = vpci->ops.get_size_vq(kvm, vpci->dev, vpci->queue_selector);
+               ioport__write16(data, val);
+               break;
+       case VIRTIO_PCI_STATUS:
+               ioport__write8(data, vpci->status);
+               break;
+       case VIRTIO_PCI_ISR:
+               ioport__write8(data, vpci->isr);
+               kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW);
+               vpci->isr = VIRTIO_IRQ_LOW;
+               break;
+       default:
+               ret = virtio_pci__specific_io_in(kvm, vpci, port, data, size, offset);
+               break;
+       };
+
+       return ret;
+}
+
+/* Add an MSI-X route for table entry @vec and return the value irq__add_msix_route() yields. */
+static u32 virtio_pci__route_msix_vector(struct kvm *kvm, struct virtio_pci *vpci, u32 vec)
+{
+       return irq__add_msix_route(kvm,
+                                  vpci->msix_table[vec].low,
+                                  vpci->msix_table[vec].high,
+                                  vpci->msix_table[vec].data);
+}
+
+/*
+ * Handle a guest write to the device-specific part of the I/O window
+ * (everything past the first 20 bytes).
+ *
+ * Writes to the MSI-X vector registers store the vector and set up a
+ * route for it; writes to the configuration area go to the device's
+ * set_config hook one byte at a time. Returns false when the offset
+ * belongs to neither.
+ *
+ * NOTE(review): the vector written by the guest indexes msix_table
+ * without a range check; confirm how a "no vector" value is meant to be
+ * handled.
+ */
+static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_pci *vpci, u16 port,
+                                       void *data, int size, int offset)
+{
+       u32 config_offset, vec;
+       bool msix = virtio_pci__msix_enabled(vpci);
+       int type = virtio__get_dev_specific_field(offset - 20, msix, 0, &config_offset);
+
+       if (type == VIRTIO_PCI_O_CONFIG) {
+               vpci->ops.set_config(kvm, vpci->dev, *(u8 *)data, config_offset);
+               return true;
+       }
+
+       if (type != VIRTIO_PCI_O_MSIX)
+               return false;
+
+       if (offset == VIRTIO_MSI_CONFIG_VECTOR) {
+               vpci->config_vector = ioport__read16(data);
+               vec = vpci->config_vector;
+               vpci->config_gsi = virtio_pci__route_msix_vector(kvm, vpci, vec);
+       } else if (offset == VIRTIO_MSI_QUEUE_VECTOR) {
+               vpci->vq_vector[vpci->queue_selector] = ioport__read16(data);
+               vec = vpci->vq_vector[vpci->queue_selector];
+               vpci->gsis[vpci->queue_selector] = virtio_pci__route_msix_vector(kvm, vpci, vec);
+       }
+
+       return true;
+}
+
+/*
+ * Port I/O write handler for the virtio PCI window.
+ *
+ * The common registers are served here; everything else is passed on to
+ * virtio_pci__specific_io_out(). Every device hook receives vpci->dev,
+ * the device pointer registered in virtio_pci__init(). Returns false when
+ * the access could not be handled.
+ */
+static bool virtio_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+{
+       unsigned long offset;
+       bool ret = true;
+       struct virtio_pci *vpci;
+       u32 val;
+
+       vpci = ioport->priv;
+       offset = port - vpci->base_addr;
+
+       switch (offset) {
+       case VIRTIO_PCI_GUEST_FEATURES:
+               val = ioport__read32(data);
+               /* Pass the device, not the transport: the hook casts it to its own type. */
+               vpci->ops.set_guest_features(kvm, vpci->dev, val);
+               break;
+       case VIRTIO_PCI_QUEUE_PFN:
+               val = ioport__read32(data);
+               virtio_pci__init_ioeventfd(kvm, vpci, vpci->queue_selector);
+               vpci->ops.init_vq(kvm, vpci->dev, vpci->queue_selector, val);
+               break;
+       case VIRTIO_PCI_QUEUE_SEL:
+               vpci->queue_selector    = ioport__read16(data);
+               break;
+       case VIRTIO_PCI_QUEUE_NOTIFY:
+               val                     = ioport__read16(data);
+               vpci->ops.notify_vq(kvm, vpci->dev, val);
+               break;
+       case VIRTIO_PCI_STATUS:
+               vpci->status            = ioport__read8(data);
+               break;
+       default:
+               ret = virtio_pci__specific_io_out(kvm, vpci, port, data, size, offset);
+               break;
+       };
+
+       return ret;
+}
+
+/* Port I/O handlers registered for every virtio PCI device window. */
+static struct ioport_operations virtio_pci__io_ops = {
+       .io_in  = virtio_pci__io_in,
+       .io_out = virtio_pci__io_out,
+};
+
+/*
+ * MMIO handler for the MSI-X table region.
+ *
+ * Copies @len bytes between @data and the device's msix_table at the
+ * offset of @addr inside the region. The registered region may be larger
+ * than the table, so accesses that do not fit inside msix_table are
+ * refused: writes are dropped and reads return zeroes.
+ */
+static void callback_mmio_table(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+       struct virtio_pci *vpci = ptr;
+       u8 *table = (u8 *)&vpci->msix_table;
+       u64 off = addr - vpci->msix_io_block;
+
+       if (off >= sizeof(vpci->msix_table) || len > sizeof(vpci->msix_table) - off) {
+               if (!is_write)
+                       memset(data, 0, len);
+               return;
+       }
+
+       if (is_write)
+               memcpy(table + off, data, len);
+       else
+               memcpy(data, table + off, len);
+}
+
+/*
+ * MMIO handler for the MSI-X pending bit array region.
+ *
+ * Copies @len bytes between @data and the device's msix_pba at the offset
+ * of @addr inside the region. The registered region may be larger than
+ * msix_pba, so accesses that do not fit inside it are refused: writes are
+ * dropped and reads return zeroes.
+ */
+static void callback_mmio_pba(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+       struct virtio_pci *vpci = ptr;
+       u8 *pba = (u8 *)&vpci->msix_pba;
+       u64 off = addr - vpci->msix_pba_block;
+
+       if (off >= sizeof(vpci->msix_pba) || len > sizeof(vpci->msix_pba) - off) {
+               if (!is_write)
+                       memset(data, 0, len);
+               return;
+       }
+
+       if (is_write)
+               memcpy(pba + off, data, len);
+       else
+               memcpy(data, pba + off, len);
+}
+
+/*
+ * Notify the guest that queue @vq has used buffers. Always returns 0.
+ *
+ * With MSI-X enabled the queue's GSI is triggered, unless the vector is
+ * masked, in which case only its pending bit is recorded. Without MSI-X
+ * the ISR value is raised before the shared interrupt line is triggered,
+ * so the ISR register read handled in virtio_pci__io_in() reports the
+ * interrupt.
+ */
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_pci *vpci, u32 vq)
+{
+       int tbl = vpci->vq_vector[vq];
+
+       if (virtio_pci__msix_enabled(vpci)) {
+               if (vpci->pci_hdr.msix.ctrl & PCI_MSIX_FLAGS_MASKALL ||
+                       vpci->msix_table[tbl].ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) {
+
+                       vpci->msix_pba |= 1 << tbl;
+                       return 0;
+               }
+
+               kvm__irq_trigger(kvm, vpci->gsis[vq]);
+       } else {
+               vpci->isr = VIRTIO_IRQ_HIGH;
+               kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
+       }
+       return 0;
+}
+
+/*
+ * Notify the guest of a configuration change. Always returns 0.
+ *
+ * Without MSI-X the ISR value is set to VIRTIO_PCI_ISR_CONFIG and the
+ * shared interrupt line is triggered. With MSI-X the configuration GSI is
+ * triggered, unless the vector is masked, in which case only its pending
+ * bit is recorded.
+ */
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_pci *vpci)
+{
+       int tbl = vpci->config_vector;
+
+       if (!virtio_pci__msix_enabled(vpci)) {
+               vpci->isr = VIRTIO_PCI_ISR_CONFIG;
+               kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
+               return 0;
+       }
+
+       if (vpci->pci_hdr.msix.ctrl & PCI_MSIX_FLAGS_MASKALL ||
+               vpci->msix_table[tbl].ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) {
+               vpci->msix_pba |= 1 << tbl;
+               return 0;
+       }
+
+       kvm__irq_trigger(kvm, vpci->config_gsi);
+
+       return 0;
+}
+
+/*
+ * Register a virtio device on the PCI transport.
+ *
+ * @dev is the opaque device pointer later handed to every ops hook,
+ * @device_id and @subsys_id fill the PCI header. Allocates the I/O port
+ * window and the two MMIO blocks for the MSI-X table and PBA, fills in
+ * the PCI header including the MSI-X capability, and registers the
+ * device's interrupt under @subsys_id.
+ *
+ * Returns 0 on success and -1 when no interrupt could be registered.
+ */
+int virtio_pci__init(struct kvm *kvm, struct virtio_pci *vpci, void *dev,
+                       int device_id, int subsys_id)
+{
+       u8 pin, line, ndev;
+
+       vpci->dev = dev;
+       vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE);
+       vpci->msix_pba_block = pci_get_io_space_block(PCI_IO_SIZE);
+
+       vpci->base_addr = ioport__register(IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vpci);
+       kvm__register_mmio(kvm, vpci->msix_io_block, 0x100, callback_mmio_table, vpci);
+       kvm__register_mmio(kvm, vpci->msix_pba_block, 0x100, callback_mmio_pba, vpci);
+
+       vpci->pci_hdr = (struct pci_device_header) {
+               .vendor_id              = PCI_VENDOR_ID_REDHAT_QUMRANET,
+               .device_id              = device_id,
+               .header_type            = PCI_HEADER_TYPE_NORMAL,
+               .revision_id            = 0,
+               .class                  = 0x010000,
+               .subsys_vendor_id       = PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET,
+               .subsys_id              = subsys_id,
+               .bar[0]                 = vpci->base_addr | PCI_BASE_ADDRESS_SPACE_IO,
+               .bar[1]                 = vpci->msix_io_block | PCI_BASE_ADDRESS_SPACE_MEMORY
+                                       | PCI_BASE_ADDRESS_MEM_TYPE_64,
+               .bar[3]                 = vpci->msix_pba_block | PCI_BASE_ADDRESS_SPACE_MEMORY
+                                       | PCI_BASE_ADDRESS_MEM_TYPE_64,
+               .status                 = PCI_STATUS_CAP_LIST,
+               .capabilities           = (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
+       };
+
+       vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
+       vpci->pci_hdr.msix.next = 0;
+       vpci->pci_hdr.msix.ctrl = (VIRTIO_PCI_MAX_VQ + 1);
+
+       /*
+        * Both table and PBA could be mapped on the same BAR, but for now
+        * we're not in short of BARs
+        */
+       vpci->pci_hdr.msix.table_offset = 1; /* Use BAR 1 */
+       vpci->pci_hdr.msix.pba_offset = 3; /* Use BAR 3 */
+       vpci->config_vector = 0;
+
+       /* Register under the caller's device type rather than a fixed one. */
+       if (irq__register_device(subsys_id, &ndev, &pin, &line) < 0)
+               return -1;
+
+       vpci->pci_hdr.irq_pin   = pin;
+       vpci->pci_hdr.irq_line  = line;
+       pci__register(&vpci->pci_hdr, ndev);
+
+       return 0;
+}
diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c
new file mode 100644 (file)
index 0000000..02308c2
--- /dev/null
@@ -0,0 +1,183 @@
+#include "kvm/virtio-rng.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_rng.h>
+
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+
+#define NUM_VIRT_QUEUES                1
+#define VIRTIO_RNG_QUEUE_SIZE  128
+
+/* Thread-pool job context for one virtio-rng queue. */
+struct rng_dev_job {
+       /* Queue served by this job. */
+       struct virt_queue       *vq;
+       /* Device owning the queue. */
+       struct rng_dev          *rdev;
+       /* Job handle passed to thread_pool__init_job()/thread_pool__do_job(). */
+       struct thread_pool__job job_id;
+};
+
+/* State of one virtio-rng device instance. */
+struct rng_dev {
+       /* Link in the global rdevs list. */
+       struct list_head        list;
+       struct virtio_pci       vpci;
+
+       /* Descriptor of the entropy source opened in virtio_rng__init(). */
+       int                     fd;
+       /* Handle of the "device not detected" compat message. */
+       int                     compat_id;
+
+       /* virtio queue */
+       struct virt_queue       vqs[NUM_VIRT_QUEUES];
+       struct rng_dev_job      jobs[NUM_VIRT_QUEUES];
+};
+
+/* All devices created by virtio_rng__init(); emptied by virtio_rng__delete_all(). */
+static LIST_HEAD(rdevs);
+
+/* virtio-rng keeps no configuration state here: guest writes are ignored. */
+static void set_config(struct kvm *kvm, void *dev, u8 data, u32 offset)
+{
+       /* Unused */
+}
+
+/* virtio-rng keeps no configuration state here: every read returns 0. */
+static u8 get_config(struct kvm *kvm, void *dev, u32 offset)
+{
+       /* Unused */
+       return 0;
+}
+
+/* No feature bits are offered to the guest. */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+       /* Unused */
+       return 0;
+}
+
+/* Feature bits acknowledged by the guest are not recorded. */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+       /* Unused */
+}
+
+/*
+ * Serve one request from @queue: fill the guest's buffers from the
+ * device's entropy descriptor and mark the request used. Always returns
+ * true.
+ *
+ * A failed read is reported to the guest as zero bytes rather than
+ * letting the -1 result turn into a huge length.
+ */
+static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue)
+{
+       struct iovec iov[VIRTIO_RNG_QUEUE_SIZE];
+       ssize_t len;
+       u16 out, in, head;
+
+       head            = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+       len             = readv(rdev->fd, iov, in);
+       if (len < 0)
+               len = 0;
+
+       virt_queue__set_used_elem(queue, head, len);
+
+       return true;
+}
+
+/*
+ * Thread-pool job body: serve every pending request on the job's queue,
+ * then signal the guest once for the whole batch.
+ */
+static void virtio_rng_do_io(struct kvm *kvm, void *param)
+{
+       struct rng_dev_job *job = param;
+       struct rng_dev *rdev = job->rdev;
+       struct virt_queue *vq = job->vq;
+
+       for (;;) {
+               if (!virt_queue__available(vq))
+                       break;
+               virtio_rng_do_io_request(kvm, rdev, vq);
+       }
+
+       /* The queue's index in rdev->vqs is its virtqueue number. */
+       virtio_pci__signal_vq(kvm, &rdev->vpci, vq - rdev->vqs);
+}
+
+/*
+ * Set up virtqueue @vq at guest page frame @pfn and prepare its
+ * thread-pool job.
+ *
+ * @vq is the queue selector written by the guest, so it is validated
+ * before indexing the queue and job arrays. Returns 0 on success and -1
+ * for an unknown queue.
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 pfn)
+{
+       struct rng_dev *rdev = dev;
+       struct virt_queue *queue;
+       struct rng_dev_job *job;
+       void *p;
+
+       if (vq >= NUM_VIRT_QUEUES)
+               return -1;
+
+       /* The guest driver has found the device; drop the "not detected" message. */
+       compat__remove_message(rdev->compat_id);
+
+       queue                   = &rdev->vqs[vq];
+       queue->pfn              = pfn;
+       p                       = guest_pfn_to_host(kvm, queue->pfn);
+
+       job = &rdev->jobs[vq];
+
+       vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);
+
+       *job            = (struct rng_dev_job) {
+               .vq             = queue,
+               .rdev           = rdev,
+       };
+
+       thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job);
+
+       return 0;
+}
+
+/*
+ * Guest kicked queue @vq: schedule its thread-pool job.
+ *
+ * @vq is the value the guest wrote to the notify register, so it is
+ * validated before indexing the job array. Returns 0 on success and -1
+ * for an unknown queue.
+ */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct rng_dev *rdev = dev;
+
+       if (vq >= NUM_VIRT_QUEUES)
+               return -1;
+
+       thread_pool__do_job(&rdev->jobs[vq].job_id);
+
+       return 0;
+}
+
+/* Return the guest page frame number previously stored for queue @vq. */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       struct rng_dev *rng = dev;
+       struct virt_queue *queue = &rng->vqs[vq];
+
+       return queue->pfn;
+}
+
+/* Every virtio-rng queue has the same fixed number of entries. */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+       int size = VIRTIO_RNG_QUEUE_SIZE;
+
+       return size;
+}
+
+/*
+ * Create a virtio-rng device backed by /dev/urandom and register it on
+ * the virtio PCI transport.
+ *
+ * The device is allocated zeroed so that transport state not set by
+ * virtio_pci__init() and the per-queue job slots start out in a defined
+ * state. Returns silently when the allocation fails; failing to open the
+ * entropy source is fatal.
+ */
+void virtio_rng__init(struct kvm *kvm)
+{
+       struct rng_dev *rdev;
+
+       rdev = calloc(1, sizeof(*rdev));
+       if (rdev == NULL)
+               return;
+
+       rdev->fd = open("/dev/urandom", O_RDONLY);
+       if (rdev->fd < 0)
+               die("Failed initializing RNG");
+
+       virtio_pci__init(kvm, &rdev->vpci, rdev, PCI_DEVICE_ID_VIRTIO_RNG, VIRTIO_ID_RNG);
+       rdev->vpci.ops = (struct virtio_pci_ops) {
+               .set_config             = set_config,
+               .get_config             = get_config,
+               .get_host_features      = get_host_features,
+               .set_guest_features     = set_guest_features,
+               .init_vq                = init_vq,
+               .notify_vq              = notify_vq,
+               .get_pfn_vq             = get_pfn_vq,
+               .get_size_vq            = get_size_vq,
+       };
+
+       list_add_tail(&rdev->list, &rdevs);
+
+       rdev->compat_id = compat__add_message("virtio-rng device was not detected",
+                                               "While you have requested a virtio-rng device, "
+                                               "the guest kernel didn't seem to detect it.\n"
+                                               "Please make sure that the kernel was compiled "
+                                               "with CONFIG_HW_RANDOM_VIRTIO.");
+}
+
+/*
+ * Unlink and free every device created by virtio_rng__init().
+ *
+ * NOTE(review): the descriptor stored in rdev->fd is not closed here;
+ * confirm whether that is intentional.
+ */
+void virtio_rng__delete_all(struct kvm *kvm)
+{
+       struct rng_dev *rdev;
+
+       for (;;) {
+               if (list_empty(&rdevs))
+                       break;
+
+               rdev = list_first_entry(&rdevs, struct rng_dev, list);
+               list_del(&rdev->list);
+               free(rdev);
+       }
+}